# Writing Tests

## Test File Templates

### Basic Test File Structure

"""
Test module for [module_name].

This module contains unit tests for the [module_name] functionality.
"""

import pytest
from unittest.mock import Mock, patch
from eval_runner import [module_name]


class Test[ClassName]:
    """Test cases for [ClassName].

    Template only: replace every bracketed placeholder (class name, test
    name) before using this skeleton.
    """

    def setup_method(self):
        """Set up test fixtures before each test method."""
        # Placeholder body: fill in the per-test setup here.
        pass

    def teardown_method(self):
        """Clean up after each test method."""
        # Placeholder body: fill in the per-test cleanup here.
        pass

    def test_[functionality]_[condition](self):
        """Test [specific behavior] when [condition]."""
        # Arrange: prepare the inputs for the behavior under test.
        # Act: perform the single action being verified.
        # Assert: check the outcome of that action.
        pass


def test_[function_name]_[condition]():
    """Test [function_name] when [condition].

    Template only: replace the bracketed placeholders before use.
    """
    # Arrange: prepare the inputs for the function under test.
    # Act: call the function once.
    # Assert: check the result of that call.
    pass

### Test File Template for Evaluation Components

"""
Test module for evaluation engine components.

This module contains tests for the evaluation engine, including scenario loading,
metric calculation, and result generation.
"""

import pytest
import json
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
from eval_runner import engine, loader, metrics, reporter


class TestEvaluationEngine:
    """Exercise ``engine.run_evaluation`` with a minimal scenario."""

    @pytest.fixture
    def sample_scenario(self):
        """Return a one-task scenario with a single tool-call criterion."""
        criterion = {"metric": "tool_call_correctness", "threshold": 1.0}
        only_task = {
            "task_id": "task_1",
            "description": "Test task",
            "required_tools": ["search"],
            "success_criteria": [criterion],
        }
        return {
            "scenario_id": "test_001",
            "title": "Test Scenario",
            "tasks": [only_task],
        }

    def test_run_evaluation_with_valid_scenario(self, sample_scenario):
        """A valid scenario yields one result carrying the task id and metrics."""
        # Arrange
        task_count = 1

        # Act
        outcome = engine.run_evaluation(sample_scenario)

        # Assert
        assert len(outcome) == task_count
        first_result = outcome[0]
        assert first_result["task_id"] == "task_1"
        assert "metrics" in first_result


class TestScenarioLoading:
    """Test cases for scenario loading functionality."""

    def test_load_valid_scenario(self, tmp_path):
        """Test loading a valid scenario file."""
        # Arrange: write a small scenario into pytest's per-test temp directory.
        scenario_data = {"scenario_id": "test", "title": "Test"}
        scenario_file = tmp_path / "test_scenario.json"
        scenario_file.write_text(json.dumps(scenario_data))

        # Act
        result = loader.load_scenario(scenario_file)

        # Assert
        assert result["scenario_id"] == "test"
        assert result["title"] == "Test"

    def test_load_nonexistent_scenario(self, tmp_path):
        """Test loading a scenario file that doesn't exist."""
        # Arrange: anchor the missing path inside the fresh temp directory so
        # the test cannot be affected by a stray "nonexistent.json" in
        # whatever directory pytest happens to be run from.
        nonexistent_file = tmp_path / "nonexistent.json"

        # Act & Assert
        with pytest.raises(FileNotFoundError):
            loader.load_scenario(nonexistent_file)


class TestMetrics:
    """Test cases for metric calculations."""

    def test_tool_call_correctness_perfect_match(self):
        """Test tool call correctness with perfect match."""
        # Arrange
        expected = ["search", "lookup"]
        actual = ["search", "lookup"]

        # Act
        result = metrics.calculate_tool_call_correctness(expected, actual)

        # Assert
        assert result == 1.0

    def test_tool_call_correctness_no_match(self):
        """Test tool call correctness with no match."""
        # Arrange
        expected = ["search"]
        actual = ["lookup"]

        # Act
        result = metrics.calculate_tool_call_correctness(expected, actual)

        # Assert
        assert result == 0.0

    def test_state_verification_nested_path(self):
        """Test state verification with nested dot-notation paths (v1.1+)."""
        # Arrange: the expected key is a dotted path into the nested actual state.
        expected = {"user.profile.status": "active"}
        actual = {"user": {"profile": {"status": "active"}}}

        # Act
        result = metrics.calculate_state_correctness(expected, actual)

        # Assert
        assert result == 1.0

    # pytest does not await ``async def`` tests on its own; the marker hands
    # the coroutine to the pytest-asyncio plugin so the body actually runs.
    # NOTE(review): assumes pytest-asyncio is the project's async plugin;
    # the marker is redundant but harmless under ``asyncio_mode = auto``.
    @pytest.mark.asyncio
    async def test_judge_required_guard(self):
        """Test judge guarding for required metrics (v1.1+)."""
        # Arrange: a criterion that marks the judge metric as required.
        criterion = {"metric": "luna_judge_score", "required": True}

        # Act & Assert
        with pytest.raises(RuntimeError, match="Judge provider .* is required"):
            await metrics.calculate_luna_judge_score(criterion, {})

## Naming Conventions

### Test Function Names

Use descriptive names that follow the pattern: test_[functionality]_[condition]

# Good examples: each name states the functionality and the condition
# (function bodies omitted for brevity)
def test_load_scenario_with_valid_file():
def test_calculate_metrics_with_empty_input():
def test_evaluation_engine_with_invalid_scenario():
def test_report_generation_with_multiple_tasks():

# Bad examples: the name gives no functionality/condition detail
def test_loader():
def test_metrics():
def test_engine():

### Test Class Names

Use descriptive class names that indicate what is being tested:

# Good examples: the name indicates what is being tested
# (class bodies omitted for brevity)
class TestEvaluationEngine:
class TestScenarioLoading:
class TestMetricCalculation:
class TestReportGeneration:

# Bad examples: shorter names that say less about what is being tested
class TestEngine:
class TestLoader:
class TestMetrics:

### Fixture Names

Use descriptive fixture names that indicate what they provide:

# Good examples: the name indicates what the fixture provides
# (fixture bodies omitted for brevity)
@pytest.fixture
def sample_scenario():
@pytest.fixture
def mock_agent_api():
@pytest.fixture
def valid_scenario_file():
# NOTE(review): this name starts with "test_", the prefix used for test
# functions throughout this guide; consider a name without that prefix
# (e.g. metrics_test_data) so it is not mistaken for a test.
@pytest.fixture
def test_metrics_data():

# Bad examples: generic names that do not say what is provided
@pytest.fixture
def scenario():
@pytest.fixture
def api():
@pytest.fixture
def file():

## Mock Usage Patterns

### Mocking External APIs

@patch('eval_runner.cli.requests.post')
def test_agent_api_integration(mock_post):
    """Run an evaluation with the agent's HTTP POST replaced by a mock."""
    # Arrange: a canned reply whose JSON payload names the "search" tool and
    # whose status check is a no-op.
    fake_reply = Mock()
    fake_reply.raise_for_status.return_value = None
    fake_reply.json.return_value = {"tool_name": "search"}
    mock_post.return_value = fake_reply

    single_task_scenario = {"tasks": [{"description": "test"}]}

    # Act
    outcome = engine.run_evaluation(single_task_scenario)

    # Assert: exactly one POST was issued and at least one result came back.
    mock_post.assert_called_once()
    assert len(outcome) > 0

### Mocking File Operations

# mock_open is not covered by the imports in the file templates, and the
# decorator below is evaluated at import time, so it must be imported first.
from unittest.mock import mock_open, patch


@patch('builtins.open', mock_open(read_data='{"test": "data"}'))
def test_file_loading_with_mock():
    """Test file loading with mocked file operations.

    The built-in ``open`` is replaced for the duration of the test, so no
    real file named ``test.json`` has to exist.
    """
    # NOTE(review): this only takes effect if loader.load_scenario reads the
    # file through the built-in open() -- confirm against the loader.
    # Arrange
    file_path = Path("test.json")

    # Act
    result = loader.load_scenario(file_path)

    # Assert: the parsed content is the JSON supplied as read_data above.
    assert result["test"] == "data"

### Mocking Environment Variables

# os is not imported by any of the file templates, and the decorator below
# needs it at import time.
import os
from unittest.mock import patch


@patch.dict(os.environ, {'AGENT_API_URL': 'http://test.com'})
def test_environment_variable_usage():
    """Test that environment variables are used correctly.

    ``patch.dict`` sets ``AGENT_API_URL`` only while the test runs and
    restores ``os.environ`` afterwards.
    """
    # Arrange: nothing further; the decorator has already set the variable.

    # Act: in a real test, call the code that reads AGENT_API_URL here.
    configured_url = os.environ['AGENT_API_URL']

    # Assert: the overridden value is what the code under test would see.
    assert configured_url == 'http://test.com'

## Agent Testing Best Practices

### Testing Agent Integration

class TestAgentIntegration:
    """Checks on the shape and content of an agent response."""

    @pytest.fixture
    def mock_agent_response(self):
        """Return a canned agent response naming the "search" tool."""
        return dict(tool_name="search", result="test result", confidence=0.95)

    def test_agent_correct_tool_usage(self, mock_agent_response):
        """The tool named in the response matches the expected tool list."""
        # Arrange
        tools_wanted = ["search"]

        # Act
        tools_used = [mock_agent_response["tool_name"]]

        # Assert
        assert tools_used == tools_wanted

    def test_agent_response_validation(self, mock_agent_response):
        """Every required field is present in the response."""
        # Arrange
        required = ("tool_name", "result")

        # Act & Assert
        assert all(name in mock_agent_response for name in required)

### Testing Error Conditions

class TestErrorHandling:
    """Test cases for error handling."""

    def test_agent_api_timeout(self):
        """Test handling of agent API timeout."""
        # requests is not imported by any of the file templates; import it
        # here so the Timeout exception class below resolves.
        import requests

        # Arrange: make the patched POST raise a timeout instead of returning.
        with patch('eval_runner.cli.requests.post') as mock_post:
            mock_post.side_effect = requests.exceptions.Timeout()

            scenario = {"tasks": [{"description": "test"}]}

            # Act
            results = engine.run_evaluation(scenario)

            # Assert
            assert len(results) > 0
            # Verify that timeout was handled gracefully

    def test_invalid_json_response(self):
        """Test handling of invalid JSON response from agent."""
        # Arrange: the mocked response raises when its JSON body is decoded.
        with patch('eval_runner.cli.requests.post') as mock_post:
            mock_response = Mock()
            mock_response.json.side_effect = json.JSONDecodeError("", "", 0)
            mock_post.return_value = mock_response

            scenario = {"tasks": [{"description": "test"}]}

            # Act
            results = engine.run_evaluation(scenario)

            # Assert
            assert len(results) > 0
            # Verify that JSON decode error was handled gracefully

## Scenario Validation Testing

### Testing Schema Validation

class TestScenarioValidation:
    """Skeleton tests for scenario schema validation."""

    @pytest.fixture
    def valid_scenario(self):
        """Return a complete one-task scenario."""
        criterion = {"metric": "tool_call_correctness", "threshold": 1.0}
        only_task = {
            "task_id": "task_1",
            "description": "Test task",
            "required_tools": ["search"],
            "success_criteria": [criterion],
        }
        return {
            "scenario_id": "test_001",
            "title": "Test Scenario",
            "industry": "test",
            "description": "A test scenario",
            "tasks": [only_task],
        }

    def test_valid_scenario_passes_validation(self, valid_scenario):
        """Placeholder: a valid scenario should pass validation."""
        # Arrange / Act / Assert are left to the reader; this would typically
        # call the schema validation function on ``valid_scenario``.
        return None

    def test_invalid_scenario_fails_validation(self):
        """Placeholder: an invalid scenario should fail validation."""
        # Arrange
        malformed = {"invalid": "data"}

        # Act & Assert are left to the reader; this would typically call the
        # schema validation function and expect a validation error.
        return None

## Performance Testing Guidelines

### Basic Performance Tests

class TestPerformance:
    """Test cases for performance benchmarks.

    Both tests use the ``benchmark`` fixture (pytest-benchmark). Calling
    ``benchmark(fn)`` returns whatever ``fn`` returns, not a timing object;
    the collected timings are read from the fixture after the run.
    """

    def test_evaluation_engine_performance(self, benchmark):
        """Test evaluation engine performance."""
        # Arrange
        scenario = {
            "scenario_id": "perf_test",
            "tasks": [{"task_id": "task_1", "description": "test"}]
        }

        # Act
        def run_evaluation():
            return engine.run_evaluation(scenario)

        benchmark(run_evaluation)

        # Assert
        # NOTE(review): benchmark.stats is expected to be unset when
        # benchmarking is disabled (--benchmark-disable) -- confirm if this
        # suite is ever run that way.
        assert benchmark.stats.stats.mean < 1.0  # Should complete in under 1 second

    def test_scenario_loading_performance(self, benchmark, tmp_path):
        """Test scenario loading performance."""
        # Arrange: write the scenario once, outside the timed function.
        scenario_data = {"scenario_id": "test", "title": "Test"}
        scenario_file = tmp_path / "perf_test.json"
        scenario_file.write_text(json.dumps(scenario_data))

        # Act
        def load_scenario():
            return loader.load_scenario(scenario_file)

        benchmark(load_scenario)

        # Assert
        assert benchmark.stats.stats.mean < 0.1  # Should load in under 100ms

## Test Data Management

### Using Fixtures for Test Data

@pytest.fixture(scope="module")
def sample_scenarios():
    """Return two single-task scenarios, numbered 1 and 2."""
    return [
        {
            "scenario_id": f"scenario_{number}",
            "title": f"Scenario {number}",
            "tasks": [
                {"task_id": f"task_{number}", "description": f"Task {number}"}
            ],
        }
        for number in (1, 2)
    ]

@pytest.fixture
def mock_agent_responses():
    """Return canned agent responses keyed by tool name."""
    return {
        tool: {"tool_name": tool, "result": f"{tool} result"}
        for tool in ("search", "lookup")
    }

### Test Data Factories

def create_test_scenario(
    scenario_id="test_001",
    num_tasks=1,
    *,
    required_tools=None,
    metric="tool_call_correctness",
    threshold=1.0,
):
    """Create a test scenario with specified parameters.

    Args:
        scenario_id: Identifier stored in the scenario and echoed in its title.
        num_tasks: Number of tasks to generate; task ids run from ``task_0``
            to ``task_{num_tasks - 1}``. Zero or a negative value yields an
            empty task list.
        required_tools: Tool names each task requires. Defaults to
            ``["search"]``.
        metric: Metric name used in each task's single success criterion.
        threshold: Threshold used in each task's single success criterion.

    Returns:
        A scenario dict with ``scenario_id``, ``title`` and ``tasks`` keys.
    """
    tools = ["search"] if required_tools is None else list(required_tools)
    return {
        "scenario_id": scenario_id,
        "title": f"Test Scenario {scenario_id}",
        "tasks": [
            {
                "task_id": f"task_{i}",
                "description": f"Test task {i}",
                # Fresh list per task so mutating one task in a test cannot
                # leak into its siblings or into the caller's argument.
                "required_tools": list(tools),
                "success_criteria": [
                    {
                        "metric": metric,
                        "threshold": threshold,
                    }
                ],
            }
            for i in range(num_tasks)
        ],
    }

def create_test_agent_response(tool_name="search", result="test result"):
    """Build an agent response dict for the given tool name and result.

    The ``confidence`` entry is always 0.95.
    """
    response = {"tool_name": tool_name, "result": result}
    response["confidence"] = 0.95
    return response

## Best Practices Summary

### Writing Effective Tests

1. **Test One Thing**: Each test should verify one specific behavior.
2. **Use Descriptive Names**: Test names should clearly describe what is being tested.
3. **Follow the AAA Pattern**: Structure each test as Arrange, Act, Assert.
4. **Use Fixtures**: Reuse test data and setup code.
5. **Mock External Dependencies**: Avoid external service calls in unit tests.
6. **Test Edge Cases**: Include tests for error conditions and boundary values.
7. **Keep Tests Fast**: Unit tests should run quickly.
8. **Maintain Test Independence**: Tests should not depend on each other.

### Common Anti-patterns to Avoid

- **Testing Implementation Details**: Focus on behavior, not implementation.
- **Over-mocking**: Only mock what's necessary.
- **Testing Multiple Behaviors**: Each test should verify one thing.
- **Hard-coded Test Data**: Use factories and fixtures for test data.
- **Slow Tests**: Avoid unnecessary setup or external calls.
- **Fragile Tests**: Don't make tests dependent on specific implementation details.