glam/docs/plan/prompt-query_template_mapping/tdd.md

25 KiB

Test-Driven Development Strategy

Overview

This document defines the TDD approach for implementing the template-based SPARQL system. Tests are organized by component and follow the Red-Green-Refactor cycle.

Test Organization

tests/
  template_sparql/
    __init__.py
    test_intent_classifier.py     # Intent classification tests
    test_slot_extractor.py        # Slot extraction tests
    test_template_registry.py     # Template registry tests
    test_template_instantiation.py # Template filling tests
    test_sparql_validation.py     # SPARQL validation tests
    test_query_pipeline.py        # End-to-end pipeline tests
    test_integration.py           # Integration with RAG system
    conftest.py                   # Shared fixtures

Test Categories

1. Unit Tests - Intent Classification

# tests/template_sparql/test_intent_classifier.py

import pytest
from backend.rag.template_sparql.intent_classifier import IntentClassifier, QueryIntent

class TestIntentClassifier:
    """Checks that questions are mapped to the expected query template.

    Groups covered: location-based (class 1), aggregation (class 5),
    entity lookup (class 3), and questions that must fall back to the LLM.
    """

    # -------------------------------------------------------------------------
    # Case tables (kept apart from the test methods for readability)
    # -------------------------------------------------------------------------

    # Location-based queries (class 1): (question, template id, slots).
    # The expected slot values keep the wording and casing used in the
    # question itself ("Archives", "drenthe").
    _LOCATION_CASES = [
        # Dutch - archives in a province
        (
            "Welke archieven zijn er in Drenthe?",
            "region_institution_search",
            {"institution_type": "archieven", "province": "Drenthe"},
        ),
        # Dutch - museums in a province
        (
            "Welke musea zijn er in Noord-Holland?",
            "region_institution_search",
            {"institution_type": "musea", "province": "Noord-Holland"},
        ),
        # Dutch - libraries in a province
        (
            "Welke bibliotheken zijn er in Utrecht?",
            "region_institution_search",
            {"institution_type": "bibliotheken", "province": "Utrecht"},
        ),
        # English variant
        (
            "Archives in Drenthe",
            "region_institution_search",
            {"institution_type": "Archives", "province": "Drenthe"},
        ),
        # Informal, lower-case Dutch
        (
            "archieven in drenthe",
            "region_institution_search",
            {"institution_type": "archieven", "province": "drenthe"},
        ),
    ]

    # Aggregation queries (class 5): (question, template id).
    _AGGREGATION_CASES = [
        ("Hoeveel archieven zijn er in Nederland?", "count_by_type"),
        ("Hoeveel musea zijn er in Drenthe?", "count_by_type_region"),
        ("Tel alle bibliotheken", "count_by_type"),
        ("How many archives are there?", "count_by_type"),
    ]

    # Entity lookup queries (class 3): (question, template id).
    _ENTITY_LOOKUP_CASES = [
        ("Wat is het Nationaal Archief?", "entity_lookup"),
        ("Informatie over Rijksmuseum", "entity_lookup"),
        ("Details van NL-HaNA", "entity_lookup_by_ghcid"),
    ]

    # Questions that no template is expected to match.
    _FALLBACK_QUESTIONS = [
        "Wat is de beste manier om een archief te digitaliseren?",
        "Hoe kan ik toegang krijgen tot de collectie?",
        "Hello, I have a question",
    ]

    @pytest.fixture
    def classifier(self):
        """Provide an ``IntentClassifier`` built with its defaults."""
        return IntentClassifier()

    @pytest.mark.parametrize("question,expected_template,expected_slots", _LOCATION_CASES)
    def test_location_based_intent(self, classifier, question, expected_template, expected_slots):
        """A location question yields the expected template id and slots."""
        classified = classifier.classify(question)

        assert classified.template_id == expected_template
        assert classified.extracted_slots == expected_slots

    @pytest.mark.parametrize("question,expected_template", _AGGREGATION_CASES)
    def test_aggregation_intent(self, classifier, question, expected_template):
        """A counting question yields the expected aggregation template."""
        classified = classifier.classify(question)
        assert classified.template_id == expected_template

    @pytest.mark.parametrize("question,expected_template", _ENTITY_LOOKUP_CASES)
    def test_entity_lookup_intent(self, classifier, question, expected_template):
        """A question about one named entity yields a lookup template."""
        classified = classifier.classify(question)
        assert classified.template_id == expected_template

    @pytest.mark.parametrize("question", _FALLBACK_QUESTIONS)
    def test_fallback_to_llm(self, classifier, question):
        """A question matching no template is routed to ``llm_fallback``."""
        classified = classifier.classify(question)
        assert classified.template_id == "llm_fallback"


class TestIntentClassifierEdgeCases:
    """Boundary inputs for the classifier: empty, very long, punctuation."""

    @pytest.fixture
    def classifier(self):
        """Provide an ``IntentClassifier`` built with its defaults."""
        return IntentClassifier()

    def test_empty_question(self, classifier):
        """An empty string is rejected with a ``ValueError``."""
        with pytest.raises(ValueError, match="Question cannot be empty"):
            classifier.classify("")

    def test_very_long_question(self, classifier):
        """A question padded with 100 repeated filler phrases still classifies."""
        padded = "".join(["Welke archieven ", "zijn er " * 100, "in Drenthe?"])
        classified = classifier.classify(padded)
        # Only requires that some template id is produced, not a specific one.
        assert classified.template_id is not None

    def test_question_with_special_characters(self, classifier):
        """An apostrophe and hyphen in a place name do not break classification."""
        classified = classifier.classify("Welke archieven zijn er in 's-Hertogenbosch?")
        assert classified.template_id is not None

2. Unit Tests - Slot Extraction

# tests/template_sparql/test_slot_extractor.py

import pytest
from backend.rag.template_sparql.slot_extractor import SlotExtractor

class TestSlotExtractor:
    """Test suite for slot value extraction.

    Covers province-name to province-code mapping, institution-type to
    single-letter code mapping, and extraction of every slot a template
    declares from a full question.
    """

    @pytest.fixture
    def extractor(self):
        """Create a ``SlotExtractor`` with its default settings."""
        return SlotExtractor()

    # =========================================================================
    # Province Code Extraction
    # =========================================================================

    @pytest.mark.parametrize("input_province,expected_code", [
        # Standard Dutch province names
        ("Drenthe", "NL-DR"),
        ("Noord-Holland", "NL-NH"),
        ("Zuid-Holland", "NL-ZH"),
        ("Noord-Brabant", "NL-NB"),
        ("Utrecht", "NL-UT"),
        ("Gelderland", "NL-GE"),
        ("Limburg", "NL-LI"),
        ("Overijssel", "NL-OV"),
        ("Flevoland", "NL-FL"),
        ("Friesland", "NL-FR"),
        ("Groningen", "NL-GR"),
        ("Zeeland", "NL-ZE"),
        # Case variations
        ("drenthe", "NL-DR"),
        ("DRENTHE", "NL-DR"),
        ("DrEnThE", "NL-DR"),
        # Alternative spellings
        ("Fryslan", "NL-FR"),
        ("Fryslân", "NL-FR"),
    ])
    def test_province_to_code(self, extractor, input_province, expected_code):
        """Test province name to ISO 3166-2 code conversion."""
        result = extractor.extract_province_code(input_province)
        assert result == expected_code

    def test_unknown_province(self, extractor):
        """Unknown province should return None."""
        result = extractor.extract_province_code("Atlantis")
        assert result is None

    # =========================================================================
    # Institution Type Extraction
    # =========================================================================

    @pytest.mark.parametrize("input_type,expected_code", [
        # Dutch singular/plural
        ("archief", "A"),
        ("archieven", "A"),
        ("museum", "M"),
        ("musea", "M"),
        ("bibliotheek", "L"),
        ("bibliotheken", "L"),
        ("galerie", "G"),
        ("galerijen", "G"),
        # English
        # ("museum" is spelled the same in Dutch and English; it is listed
        # once, in the Dutch group, so the identical case is not run twice.)
        ("archive", "A"),
        ("archives", "A"),
        ("museums", "M"),
        ("library", "L"),
        ("libraries", "L"),
        ("gallery", "G"),
        ("galleries", "G"),
        # Descriptive terms
        ("regionaal archief", "A"),
        ("stadsarchief", "A"),
        ("rijksmuseum", "M"),
        ("openbare bibliotheek", "L"),
    ])
    def test_institution_type_to_code(self, extractor, input_type, expected_code):
        """Test institution type to single-letter code conversion."""
        result = extractor.extract_institution_type_code(input_type)
        assert result == expected_code

    # =========================================================================
    # Full Slot Extraction
    # =========================================================================

    def test_extract_all_slots_location_query(self, extractor):
        """Test full slot extraction for location query."""
        question = "Welke archieven zijn er in Drenthe?"
        template_slots = {
            "institution_type_code": {"required": True},
            "province_code": {"required": True},
        }

        result = extractor.extract(question, template_slots)

        assert result["institution_type_code"] == "A"
        assert result["province_code"] == "NL-DR"

    def test_extract_with_missing_required_slot(self, extractor):
        """Missing required slot should raise ValueError."""
        question = "Welke zijn er in Nederland?"  # No institution type
        template_slots = {
            "institution_type_code": {"required": True},
            "province_code": {"required": False},
        }

        with pytest.raises(ValueError, match="Missing required slot"):
            extractor.extract(question, template_slots)


class TestSlotExtractorFuzzyMatching:
    """Province lookup when the input is misspelled or abbreviated."""

    # (input as typed, expected province code)
    _NEAR_MISSES = [
        ("Drent", "NL-DR"),      # Partial match
        ("Dremthe", "NL-DR"),    # Typo
        ("N-Holland", "NL-NH"),  # Abbreviation
        ("Noordholland", "NL-NH"),  # Without hyphen
    ]

    @pytest.fixture
    def extractor(self):
        """Provide a ``SlotExtractor`` configured with ``fuzzy_threshold=80``."""
        return SlotExtractor(fuzzy_threshold=80)

    @pytest.mark.parametrize("input_province,expected_code", _NEAR_MISSES)
    def test_fuzzy_province_matching(self, extractor, input_province, expected_code):
        """With ``fuzzy=True`` a near-miss still resolves to the province code."""
        code = extractor.extract_province_code(input_province, fuzzy=True)
        assert code == expected_code

3. Unit Tests - Template Instantiation

# tests/template_sparql/test_template_instantiation.py

import pytest
from backend.rag.template_sparql.templates import SimpleTemplate, CompositeTemplate, SlotDefinition

class TestSimpleTemplate:
    """Test suite for simple template instantiation.

    Each test builds a ``SimpleTemplate`` inline and checks how
    ``instantiate`` fills ``{{slot}}`` placeholders, validates values, and
    applies defaults.
    """

    def test_basic_instantiation(self):
        """Test basic template slot filling."""
        template = SimpleTemplate(
            template_id="region_search",
            description="Search by region",
            slots={
                "province_code": SlotDefinition(name="province_code"),
            },
            sparql_template="""
PREFIX hc: <https://nde.nl/ontology/hc/class/>
SELECT ?s WHERE {
  ?s a hc:Custodian .
  FILTER(CONTAINS(STR(?s), "{{province_code}}"))
}"""
        )

        result = template.instantiate({"province_code": "NL-DR"})

        assert "NL-DR" in result
        assert "{{province_code}}" not in result

    def test_multiple_slots(self):
        """Test template with multiple slots."""
        template = SimpleTemplate(
            template_id="type_region_search",
            description="Search by type and region",
            slots={
                "institution_type_code": SlotDefinition(name="institution_type_code"),
                "province_code": SlotDefinition(name="province_code"),
            },
            sparql_template="""
SELECT ?s WHERE {
  ?s hcp:institutionType "{{institution_type_code}}" .
  FILTER(CONTAINS(STR(?s), "{{province_code}}"))
}"""
        )

        result = template.instantiate({
            "institution_type_code": "A",
            "province_code": "NL-DR"
        })

        assert '"A"' in result
        assert "NL-DR" in result
        # Both placeholders must be replaced, not merely have their values
        # present somewhere in the output (mirrors test_basic_instantiation).
        assert "{{institution_type_code}}" not in result
        assert "{{province_code}}" not in result

    def test_missing_required_slot(self):
        """Missing required slot should raise ValueError."""
        template = SimpleTemplate(
            template_id="test",
            description="Test",
            slots={
                "province_code": SlotDefinition(name="province_code", required=True),
            },
            sparql_template="FILTER(CONTAINS(STR(?s), '{{province_code}}'))"
        )

        with pytest.raises(ValueError, match="Missing required slot"):
            template.instantiate({})

    def test_invalid_slot_value(self):
        """Invalid slot value should raise ValueError."""
        template = SimpleTemplate(
            template_id="test",
            description="Test",
            slots={
                "institution_type_code": SlotDefinition(
                    name="institution_type_code",
                    valid_values=["A", "M", "L", "G"]
                ),
            },
            sparql_template='hcp:institutionType "{{institution_type_code}}"'
        )

        # "X" is not in valid_values, so instantiation must be refused.
        with pytest.raises(ValueError, match="Invalid value"):
            template.instantiate({"institution_type_code": "X"})

    def test_optional_slot_not_provided(self):
        """Optional slot not provided should use default."""
        template = SimpleTemplate(
            template_id="test",
            description="Test",
            slots={
                "limit": SlotDefinition(
                    name="limit",
                    required=False,
                    default_value="100"
                ),
            },
            sparql_template="LIMIT {{limit}}"
        )

        result = template.instantiate({})
        assert "LIMIT 100" in result


class TestCompositeTemplate:
    """Instantiation of templates assembled from several sub-templates."""

    def test_and_composition(self):
        """An AND composite renders the values of both of its sub-templates."""
        by_type = SimpleTemplate(
            template_id="type_filter",
            description="Filter by type",
            slots={"type": SlotDefinition(name="type")},
            sparql_template='?s hcp:institutionType "{{type}}" .',
        )
        by_region = SimpleTemplate(
            template_id="region_filter",
            description="Filter by region",
            slots={"region": SlotDefinition(name="region")},
            sparql_template='FILTER(CONTAINS(STR(?s), "{{region}}"))',
        )

        # The composite declares the union of the slots its parts consume.
        combined = CompositeTemplate(
            template_id="type_region",
            description="Type and region filter",
            slots={
                "type": SlotDefinition(name="type"),
                "region": SlotDefinition(name="region"),
            },
            sub_templates=[by_type, by_region],
            join_type="AND",
        )

        slot_values = {"type": "A", "region": "NL-DR"}
        rendered = combined.instantiate(slot_values)

        assert '"A"' in rendered
        assert "NL-DR" in rendered

4. Integration Tests - Query Pipeline

# tests/template_sparql/test_query_pipeline.py

import pytest
from backend.rag.template_sparql.pipeline import QueryPipeline, QueryContext

class TestQueryPipeline:
    """End-to-end pipeline tests.

    Each test feeds a natural-language question through
    ``QueryPipeline.process`` and inspects the resulting ``sparql_query``
    and ``errors``.
    """

    @pytest.fixture
    def pipeline(self):
        """Create fully configured pipeline."""
        return QueryPipeline.create_default()

    # =========================================================================
    # Successful Query Generation
    # =========================================================================

    @pytest.mark.parametrize("question,expected_contains", [
        (
            "Welke archieven zijn er in Drenthe?",
            ['hc:Custodian', '"A"', 'NL-DR']
        ),
        (
            "Welke musea zijn er in Noord-Holland?",
            ['hc:Custodian', '"M"', 'NL-NH']
        ),
        (
            "Hoeveel bibliotheken zijn er in Utrecht?",
            ['COUNT', '"L"', 'NL-UT']
        ),
    ])
    def test_successful_query_generation(self, pipeline, question, expected_contains):
        """Test successful end-to-end query generation."""
        context = QueryContext(original_question=question)
        result = pipeline.process(context)

        assert result.errors == []
        assert result.sparql_query is not None

        for expected in expected_contains:
            assert expected in result.sparql_query, \
                f"Expected '{expected}' in query:\n{result.sparql_query}"

    # =========================================================================
    # Error Handling
    # =========================================================================

    def test_invalid_slot_value_error(self, pipeline):
        """Test error handling for invalid slot values."""
        # Question with unrecognized province
        context = QueryContext(original_question="Welke archieven zijn er in Atlantis?")
        result = pipeline.process(context)

        # Should either fall back to LLM or report error
        assert result.sparql_query is not None or len(result.errors) > 0

    # =========================================================================
    # SPARQL Validation
    # =========================================================================

    def test_generated_sparql_is_valid(self, pipeline):
        """Test that all generated SPARQL queries are syntactically valid.

        Questions for which the pipeline produces no query are skipped, but
        at least one query must have been linted; otherwise the test would
        pass without validating anything.
        """
        # Imported once, up front, instead of on every loop iteration.
        from glam_extractor.api.sparql_linter import lint_sparql

        questions = [
            "Welke archieven zijn er in Drenthe?",
            "Welke musea zijn er in Noord-Holland?",
            "Hoeveel bibliotheken zijn er?",
        ]

        validated = 0
        for question in questions:
            context = QueryContext(original_question=question)
            result = pipeline.process(context)

            if not result.sparql_query:
                continue

            # Validate with sparql_linter
            lint_result = lint_sparql(result.sparql_query)

            assert lint_result.valid, \
                f"Invalid SPARQL for '{question}':\n{result.sparql_query}\nErrors: {lint_result.issues}"
            validated += 1

        assert validated > 0, \
            "Pipeline produced no SPARQL query for any question; nothing was validated"


class TestQueryPipelineWithSPARQLEndpoint:
    """Integration tests with actual SPARQL endpoint.

    These tests need network access and are selected or excluded through
    the ``integration`` marker.
    """

    @pytest.fixture
    def pipeline(self):
        """Create fully configured pipeline."""
        return QueryPipeline.create_default()

    @pytest.mark.integration
    @pytest.mark.parametrize("question,min_results", [
        ("Welke archieven zijn er in Drenthe?", 1),
        ("Welke musea zijn er in Noord-Holland?", 1),
    ])
    def test_query_returns_results(self, pipeline, question, min_results):
        """Test that generated queries return expected results.

        The test is synchronous on purpose. An ``async def`` test is only
        executed when an asyncio pytest plugin is installed and enabled, so
        without one the coroutine would never run. Only the HTTP request
        was awaited, so a blocking client exercises the same behaviour.
        """
        import httpx

        context = QueryContext(original_question=question)
        result = pipeline.process(context)

        assert result.sparql_query is not None

        # Execute against SPARQL endpoint. The explicit timeout keeps a slow
        # or unreachable endpoint from stalling the run indefinitely while
        # leaving more room than the client's short default.
        with httpx.Client(timeout=30.0) as client:
            response = client.post(
                "https://bronhouder.nl/sparql",
                data={"query": result.sparql_query},
                headers={"Accept": "application/sparql-results+json"}
            )

        assert response.status_code == 200
        data = response.json()

        results = data.get("results", {}).get("bindings", [])
        assert len(results) >= min_results, \
            f"Expected at least {min_results} results for '{question}'"

5. Fixtures and Shared Setup

# tests/template_sparql/conftest.py

import pytest
from pathlib import Path
import yaml

@pytest.fixture(scope="session")
def validation_rules():
    """Load SPARQL validation rules.

    Returns the parsed content of
    ``data/validation/sparql_validation_rules.json``, loaded once per test
    session.
    """
    import json

    # NOTE(review): the path is relative, so it resolves against the
    # directory pytest is started from; presumably the repository root.
    rules_path = Path("data/validation/sparql_validation_rules.json")
    # Read explicitly as UTF-8 rather than the platform's locale encoding.
    with rules_path.open(encoding="utf-8") as f:
        return json.load(f)

@pytest.fixture(scope="session")
def template_config():
    """Load template configuration.

    Returns the parsed content of ``data/templates/sparql_templates.yaml``,
    loaded once per test session.
    """
    # NOTE(review): the path is relative, so it resolves against the
    # directory pytest is started from; presumably the repository root.
    config_path = Path("data/templates/sparql_templates.yaml")
    # Read explicitly as UTF-8 rather than the platform's locale encoding.
    with config_path.open(encoding="utf-8") as f:
        return yaml.safe_load(f)

@pytest.fixture
def sample_questions():
    """Map short scenario names to representative Dutch questions."""
    named_questions = [
        ("location_archive_drenthe", "Welke archieven zijn er in Drenthe?"),
        ("location_museum_nh", "Welke musea zijn er in Noord-Holland?"),
        ("count_libraries", "Hoeveel bibliotheken zijn er in Nederland?"),
        ("entity_lookup", "Wat is het Nationaal Archief?"),
        ("complex_query", "Welke archieven in Drenthe hebben meer dan 1000 items?"),
    ]
    # A new dict is built on every call, in the order listed above.
    return dict(named_questions)

@pytest.fixture
def expected_sparql_patterns():
    """Per-scenario substrings a generated query must and must not contain."""
    archive_drenthe = {
        "must_contain": ['hc:Custodian', 'hcp:institutionType', '"A"', 'NL-DR'],
        "must_not_contain": ['crm:', 'cidoc:'],
    }
    museum_nh = {
        "must_contain": ['hc:Custodian', '"M"', 'NL-NH'],
        # Written out again so the two scenarios never share a list object.
        "must_not_contain": ['crm:', 'cidoc:'],
    }

    patterns = {}
    patterns["location_archive_drenthe"] = archive_drenthe
    patterns["location_museum_nh"] = museum_nh
    return patterns

Test Execution

Running Tests

# Run all template SPARQL tests
pytest tests/template_sparql/ -v

# Run with coverage
pytest tests/template_sparql/ --cov=backend.rag.template_sparql --cov-report=html

# Run only unit tests (fast)
pytest tests/template_sparql/ -v -m "not integration"

# Run integration tests (requires SPARQL endpoint)
pytest tests/template_sparql/ -v -m integration

# Run specific test file
pytest tests/template_sparql/test_intent_classifier.py -v

# Run specific test case
pytest tests/template_sparql/test_intent_classifier.py::TestIntentClassifier::test_location_based_intent -v

Test Markers

# conftest.py at project root
import pytest

def pytest_configure(config):
    """Register the custom ``integration`` and ``slow`` markers."""
    marker_lines = (
        "integration: marks tests as integration tests",
        "slow: marks tests as slow",
    )
    # Registered in the order listed, one ini line per marker.
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)

Coverage Targets

| Component | Target Coverage |
|-----------|-----------------|
| IntentClassifier | 95% |
| SlotExtractor | 95% |
| TemplateRegistry | 90% |
| Template instantiation | 95% |
| QueryPipeline | 85% |
| Integration | 80% |

Test Data Management

Golden Test Cases

Store expected outputs for regression testing:

# tests/template_sparql/golden/location_queries.yaml
test_cases:
  - id: "archive_drenthe_001"
    input:
      question: "Welke archieven zijn er in Drenthe?"
      language: "nl"
    expected:
      template_id: "region_institution_search"
      slots:
        institution_type_code: "A"
        province_code: "NL-DR"
      sparql_contains:
        - "hc:Custodian"
        - 'hcp:institutionType "A"'
        - 'FILTER(CONTAINS(STR(?'
        - '"NL-DR"'

Loading Golden Cases

@pytest.fixture
def golden_cases():
    """Load golden test cases.

    Reads every ``*.yaml`` file in ``tests/template_sparql/golden`` and
    returns a dict mapping each case's ``id`` to the full case mapping.
    Files are processed in sorted order, so when two files define the same
    id the one from the later file name wins, reproducibly.
    """
    from pathlib import Path

    import yaml

    # NOTE(review): the path is relative, so it resolves against the
    # directory pytest is started from; presumably the repository root.
    golden_dir = Path("tests/template_sparql/golden")
    cases = {}
    # glob() yields files in no guaranteed order; sort for determinism.
    for yaml_file in sorted(golden_dir.glob("*.yaml")):
        with yaml_file.open(encoding="utf-8") as f:
            # safe_load returns None for an empty file; treat it as "no cases".
            data = yaml.safe_load(f) or {}
        # A present-but-empty ``test_cases:`` key also loads as None.
        for case in data.get("test_cases") or []:
            cases[case["id"]] = case
    return cases

def test_golden_cases(pipeline, golden_cases):
    """Test against golden cases.

    Every case is run through the pipeline and all mismatches are collected,
    so a single run reports every regressed case instead of stopping at the
    first one.
    """
    failures = []
    for case_id, case in golden_cases.items():
        expected = case["expected"]
        context = QueryContext(original_question=case["input"]["question"])
        result = pipeline.process(context)

        if result.intent.template_id != expected["template_id"]:
            failures.append(f"Case {case_id}: template mismatch")
            continue

        # Without a query the substring checks below would raise TypeError
        # rather than produce a readable failure.
        if result.sparql_query is None:
            failures.append(f"Case {case_id}: no SPARQL query generated")
            continue

        for pattern in expected["sparql_contains"]:
            if pattern not in result.sparql_query:
                failures.append(f"Case {case_id}: missing pattern '{pattern}'")

    assert not failures, "\n".join(failures)

Continuous Integration

# .github/workflows/template-sparql-tests.yml
name: Template SPARQL Tests

on:
  push:
    paths:
      - 'backend/rag/template_sparql/**'
      - 'tests/template_sparql/**'
  pull_request:
    paths:
      - 'backend/rag/template_sparql/**'
      - 'tests/template_sparql/**'

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      
      - name: Install dependencies
        run: |
          pip install -e ".[dev]"          
      
      - name: Run unit tests
        run: |
          pytest tests/template_sparql/ -v -m "not integration" --cov=backend.rag.template_sparql          
      
      - name: Upload coverage
        uses: codecov/codecov-action@v4