# Specificity Score System - Test-Driven Development Strategy ## Overview This document outlines the TDD approach for implementing the specificity scoring system. All components will be developed test-first to ensure correctness and maintainability. > **INTEGRATION NOTE**: Tests in this document should verify integration with **existing infrastructure**: > - The `TemplateClassifier` at `backend/rag/template_sparql.py:1104` - **DO NOT mock** in integration tests > - The `TemplateClassifierSignature` at `backend/rag/template_sparql.py:634` - verify compatibility > - SPARQL templates in `data/sparql_templates.yaml` - use as test fixtures --- ## Existing Component Testing Strategy ### Wrapper Pattern Tests Since we're **wrapping** the existing `TemplateClassifier` rather than replacing it, add these tests: ```python # tests/integration/test_classifier_wrapper.py import pytest from backend.rag.template_sparql import TemplateClassifier from specificity.wrapper import SpecificityAwareClassifier from specificity.mapper import SPARQLToContextMapper class TestSpecificityAwareClassifier: """Tests for wrapper around existing TemplateClassifier.""" @pytest.fixture def wrapper(self): """Create wrapper with real TemplateClassifier.""" base_classifier = TemplateClassifier() mapper = SPARQLToContextMapper() return SpecificityAwareClassifier(base_classifier, mapper) def test_delegates_to_base_classifier(self, wrapper): """Wrapper delegates classification to existing classifier.""" result = wrapper.classify("What archives are in Drenthe?") # Verify it got a SPARQL template from base classifier assert result.sparql_template in [ "list_institutions_by_type_city", "list_institutions_by_type_region", ] def test_maps_sparql_to_context_template(self, wrapper): """Wrapper maps SPARQL template ID to context template.""" result = wrapper.classify("What archives are in Drenthe?") # SPARQL template → context template mapping assert result.context_template in ["archive_search", 
"location_browse"] def test_institution_type_refinement(self, wrapper): """Refines context template based on institution_type slot.""" result = wrapper.classify("What museums are in Amsterdam?") # institution_type = M → museum_search if result.slots.get("institution_type") == "M": assert result.context_template == "museum_search" def test_returns_specificity_scores(self, wrapper): """Wrapper returns specificity scores for filtered classes.""" result = wrapper.classify_with_scores("What archives are in Drenthe?") assert hasattr(result, "class_scores") assert "Archive" in result.class_scores assert result.class_scores["Archive"] > 0.8 # High for archive_search ``` ### SPARQL → Context Mapper Tests ```python # tests/unit/test_sparql_to_context_mapper.py import pytest from specificity.mapper import SPARQLToContextMapper class TestSPARQLToContextMapper: """Tests for SPARQL template → context template mapping.""" @pytest.fixture def mapper(self): return SPARQLToContextMapper() @pytest.mark.parametrize("sparql_template,expected_context", [ ("list_institutions_by_type_city", "location_browse"), ("list_institutions_by_type_region", "location_browse"), ("find_institution_by_identifier", "identifier_lookup"), ("find_institutions_by_founding_date", "organizational_change"), ("none", "general_heritage"), ]) def test_maps_sparql_to_context(self, mapper, sparql_template, expected_context): """Maps SPARQL template IDs to context template IDs.""" context = mapper.map(sparql_template) assert context == expected_context @pytest.mark.parametrize("institution_type,expected_context", [ ("A", "archive_search"), ("M", "museum_search"), ("L", "library_search"), ("G", "general_heritage"), # Gallery falls back to general ]) def test_refines_by_institution_type(self, mapper, institution_type, expected_context): """Refines context template based on institution_type slot.""" context = mapper.map( "list_institutions_by_type_city", slots={"institution_type": institution_type} ) assert context == 
expected_context def test_unknown_template_returns_general(self, mapper): """Unknown SPARQL templates map to general_heritage.""" context = mapper.map("unknown_template_xyz") assert context == "general_heritage" ``` --- ## Testing Pyramid ``` /\ / \ / E2E \ ← 5% - Full pipeline tests /______\ / \ / Integr. \ ← 20% - Cross-component tests /____________\ / \ / Unit \ ← 75% - Individual function tests /__________________\ ``` --- ## Unit Tests ### 1. Score Repository Tests ```python # tests/unit/test_score_repository.py import pytest from specificity.repository import LinkMLScoreRepository, InMemoryScoreRepository class TestInMemoryScoreRepository: """Tests for in-memory repository (used in other tests).""" def test_get_score_default(self): """Unknown classes return default score of 0.5.""" repo = InMemoryScoreRepository() assert repo.get_score("UnknownClass") == 0.5 def test_set_and_get_general_score(self): """Can set and retrieve general specificity score.""" repo = InMemoryScoreRepository() repo.set_score("Archive", 0.75) assert repo.get_score("Archive") == 0.75 def test_set_and_get_template_score(self): """Can set and retrieve template-specific score.""" repo = InMemoryScoreRepository() repo.set_score("Archive", 0.95, template_id="archive_search") repo.set_score("Archive", 0.20, template_id="museum_search") assert repo.get_score("Archive", "archive_search") == 0.95 assert repo.get_score("Archive", "museum_search") == 0.20 assert repo.get_score("Archive") == 0.5 # General score unchanged def test_get_all_scores(self): """Can retrieve all scores for a template.""" repo = InMemoryScoreRepository() repo.set_score("Archive", 0.95, "archive_search") repo.set_score("Museum", 0.30, "archive_search") repo.set_score("Library", 0.40, "archive_search") scores = repo.get_all_scores("archive_search") assert scores == {"Archive": 0.95, "Museum": 0.30, "Library": 0.40} def test_bulk_update(self): """Can update multiple scores at once.""" repo = InMemoryScoreRepository() 
repo.bulk_update({ "Archive": 0.90, "Museum": 0.85, "Library": 0.80 }, template_id="general_heritage") assert repo.get_score("Archive", "general_heritage") == 0.90 assert repo.get_score("Museum", "general_heritage") == 0.85 assert repo.get_score("Library", "general_heritage") == 0.80 class TestLinkMLScoreRepository: """Tests for LinkML file-based repository.""" @pytest.fixture def temp_schema_dir(self, tmp_path): """Create temporary schema directory with sample class files.""" classes_dir = tmp_path / "modules" / "classes" classes_dir.mkdir(parents=True) # Create sample class file archive_yaml = classes_dir / "Archive.yaml" archive_yaml.write_text(""" classes: Archive: is_a: HeritageCustodian description: An archive institution annotations: specificity_score: 0.75 template_specificity: archive_search: 0.95 museum_search: 0.20 """) return tmp_path def test_load_existing_score(self, temp_schema_dir): """Loads existing score from YAML file.""" repo = LinkMLScoreRepository(temp_schema_dir) assert repo.get_score("Archive") == 0.75 assert repo.get_score("Archive", "archive_search") == 0.95 def test_update_score_persists(self, temp_schema_dir): """Updated scores are persisted to file.""" repo = LinkMLScoreRepository(temp_schema_dir) repo.set_score("Archive", 0.80) # Create new repository instance to verify persistence repo2 = LinkMLScoreRepository(temp_schema_dir) assert repo2.get_score("Archive") == 0.80 def test_missing_class_raises_error(self, temp_schema_dir): """Raises ValueError for non-existent class.""" repo = LinkMLScoreRepository(temp_schema_dir) with pytest.raises(ValueError, match="Class file not found"): repo.get_score("NonExistentClass") ``` ### 2. 
Scoring Strategy Tests ```python # tests/unit/test_scoring_strategies.py import pytest from specificity.strategies import ( ArchiveSearchStrategy, MuseumSearchStrategy, LocationBrowseStrategy, ScoringStrategyFactory ) class TestArchiveSearchStrategy: """Tests for archive search scoring strategy.""" @pytest.fixture def strategy(self): return ArchiveSearchStrategy() def test_archive_class_high_score(self, strategy): """Archive class gets high score for archive_search template.""" score = strategy.calculate_score("Archive", {}) assert score >= 0.90 def test_museum_class_low_score(self, strategy): """Museum class gets low score for archive_search template.""" score = strategy.calculate_score("Museum", {}) assert score <= 0.30 def test_location_class_moderate_score(self, strategy): """Location class gets moderate score (useful across templates).""" score = strategy.calculate_score("Location", {}) assert 0.50 <= score <= 0.70 def test_archival_properties_boost(self, strategy): """Classes with archival properties get score boost.""" metadata_without = {"slots": ["name", "description"]} metadata_with = {"slots": ["name", "finding_aid", "extent"]} score_without = strategy.calculate_score("Collection", metadata_without) score_with = strategy.calculate_score("Collection", metadata_with) assert score_with > score_without def test_template_id(self, strategy): """Strategy returns correct template ID.""" assert strategy.get_template_id() == "archive_search" class TestScoringStrategyFactory: """Tests for strategy factory.""" def test_get_registered_strategy(self): """Factory returns registered strategy.""" strategy = ScoringStrategyFactory.get_strategy("archive_search") assert isinstance(strategy, ArchiveSearchStrategy) def test_get_unregistered_returns_default(self): """Factory returns default strategy for unknown template.""" strategy = ScoringStrategyFactory.get_strategy("unknown_template") assert strategy.get_template_id() == "default" def test_register_custom_strategy(self): 
"""Can register custom strategy.""" class CustomStrategy(ScoringStrategy): def calculate_score(self, class_name, metadata): return 0.99 def get_template_id(self): return "custom" ScoringStrategyFactory.register(CustomStrategy()) strategy = ScoringStrategyFactory.get_strategy("custom") assert strategy.calculate_score("Any", {}) == 0.99 ``` ### 3. Score Calculator (Decorator) Tests ```python # tests/unit/test_score_calculator.py import pytest from specificity.calculator import ( BaseScoreCalculator, CustodianTypeBoostDecorator, InheritanceDepthDecorator ) from specificity.repository import InMemoryScoreRepository class TestBaseScoreCalculator: """Tests for base score calculator.""" @pytest.fixture def calculator(self): repo = InMemoryScoreRepository() repo.set_score("Archive", 0.75) repo.set_score("Archive", 0.95, "archive_search") return BaseScoreCalculator(repo) def test_returns_general_score(self, calculator): """Returns general score when no template specified.""" score = calculator.calculate("Archive", None) assert score == 0.75 def test_returns_template_score(self, calculator): """Returns template-specific score when template specified.""" score = calculator.calculate("Archive", "archive_search") assert score == 0.95 def test_fallback_to_general_for_unknown_template(self, calculator): """Falls back to general score for unknown template.""" score = calculator.calculate("Archive", "unknown_template") assert score == 0.75 class TestCustodianTypeBoostDecorator: """Tests for custodian type boost decorator.""" @pytest.fixture def decorated_calculator(self): repo = InMemoryScoreRepository() repo.set_score("Archive", 0.70) # Mock schema loader with custodian_types annotation schema_loader = MockSchemaLoader({ "Archive": {"annotations": {"custodian_types": ["A", "O"]}}, "Museum": {"annotations": {"custodian_types": ["M"]}}, "Location": {"annotations": {"custodian_types": ["*"]}} # Universal }) base = BaseScoreCalculator(repo) return CustodianTypeBoostDecorator(base, 
schema_loader, custodian_type="A") def test_boost_matching_custodian_type(self, decorated_calculator): """Boosts score for matching custodian type.""" score = decorated_calculator.calculate("Archive", None) assert score == 0.85 # 0.70 + 0.15 boost def test_no_boost_non_matching_type(self, decorated_calculator): """No boost for non-matching custodian type.""" score = decorated_calculator.calculate("Museum", None) assert score == 0.50 # Default score, no boost def test_boost_universal_type(self, decorated_calculator): """Boosts score for universal (*) custodian type.""" score = decorated_calculator.calculate("Location", None) assert score >= 0.65 # Gets boost because "*" matches all class TestInheritanceDepthDecorator: """Tests for inheritance depth decorator.""" @pytest.fixture def decorated_calculator(self): repo = InMemoryScoreRepository() repo.set_score("HeritageCustodian", 0.10) repo.set_score("Archive", 0.70) repo.set_score("NationalArchive", 0.80) # Mock schema loader with inheritance hierarchy schema_loader = MockSchemaLoader({ "HeritageCustodian": {}, # Root class, depth 0 "Archive": {"is_a": "HeritageCustodian"}, # Depth 1 "NationalArchive": {"is_a": "Archive"} # Depth 2 }) base = BaseScoreCalculator(repo) return InheritanceDepthDecorator(base, schema_loader) def test_root_class_no_boost(self, decorated_calculator): """Root class (depth 0) gets no depth boost.""" score = decorated_calculator.calculate("HeritageCustodian", None) assert score == 0.10 def test_child_class_small_boost(self, decorated_calculator): """Direct child (depth 1) gets small boost.""" score = decorated_calculator.calculate("Archive", None) assert score == 0.73 # 0.70 + 0.03 (depth 1 * 0.03) def test_grandchild_class_larger_boost(self, decorated_calculator): """Grandchild (depth 2) gets larger boost.""" score = decorated_calculator.calculate("NationalArchive", None) assert score == 0.86 # 0.80 + 0.06 (depth 2 * 0.03) ``` ### 4. 
Command Tests ```python # tests/unit/test_commands.py import pytest from datetime import datetime from specificity.commands import ( UpdateScoreCommand, BatchScoreCommand, ScoreCommandInvoker ) from specificity.repository import InMemoryScoreRepository class TestUpdateScoreCommand: """Tests for single score update command.""" def test_execute_updates_score(self): """Execute updates the score in repository.""" repo = InMemoryScoreRepository() repo.set_score("Archive", 0.50) cmd = UpdateScoreCommand(repo, "Archive", 0.75) change = cmd.execute() assert repo.get_score("Archive") == 0.75 assert change.old_score == 0.50 assert change.new_score == 0.75 def test_undo_restores_original(self): """Undo restores the original score.""" repo = InMemoryScoreRepository() repo.set_score("Archive", 0.50) cmd = UpdateScoreCommand(repo, "Archive", 0.75) cmd.execute() cmd.undo() assert repo.get_score("Archive") == 0.50 def test_execute_records_metadata(self): """Execute records change metadata.""" repo = InMemoryScoreRepository() cmd = UpdateScoreCommand( repo, "Archive", 0.75, author="kempersc", rationale="User feedback indicated higher relevance" ) change = cmd.execute() assert change.author == "kempersc" assert "feedback" in change.rationale assert isinstance(change.timestamp, datetime) class TestBatchScoreCommand: """Tests for batch score update command.""" def test_execute_all_updates(self): """Execute updates all scores in batch.""" repo = InMemoryScoreRepository() batch = BatchScoreCommand([ UpdateScoreCommand(repo, "Archive", 0.90), UpdateScoreCommand(repo, "Museum", 0.85), UpdateScoreCommand(repo, "Library", 0.80), ]) changes = batch.execute() assert len(changes) == 3 assert repo.get_score("Archive") == 0.90 assert repo.get_score("Museum") == 0.85 assert repo.get_score("Library") == 0.80 def test_rollback_on_failure(self): """Rollback all changes if any update fails.""" repo = InMemoryScoreRepository() repo.set_score("Archive", 0.50) repo.set_score("Museum", 0.50) # Create a 
command that will fail class FailingCommand(UpdateScoreCommand): def execute(self): raise ValueError("Simulated failure") batch = BatchScoreCommand([ UpdateScoreCommand(repo, "Archive", 0.90), FailingCommand(repo, "Fail", 0.00), UpdateScoreCommand(repo, "Museum", 0.85), ]) with pytest.raises(ValueError): batch.execute() # First update should be rolled back assert repo.get_score("Archive") == 0.50 class TestScoreCommandInvoker: """Tests for command invoker with undo/redo.""" def test_undo_reverts_last_command(self): """Undo reverts the most recent command.""" repo = InMemoryScoreRepository() repo.set_score("Archive", 0.50) invoker = ScoreCommandInvoker() invoker.execute(UpdateScoreCommand(repo, "Archive", 0.75)) invoker.execute(UpdateScoreCommand(repo, "Archive", 0.90)) assert repo.get_score("Archive") == 0.90 invoker.undo() assert repo.get_score("Archive") == 0.75 invoker.undo() assert repo.get_score("Archive") == 0.50 def test_redo_reapplies_undone_command(self): """Redo reapplies an undone command.""" repo = InMemoryScoreRepository() invoker = ScoreCommandInvoker() invoker.execute(UpdateScoreCommand(repo, "Archive", 0.75)) invoker.undo() assert repo.get_score("Archive") == 0.50 invoker.redo() assert repo.get_score("Archive") == 0.75 def test_redo_cleared_after_new_execute(self): """Redo stack is cleared after new command execution.""" repo = InMemoryScoreRepository() invoker = ScoreCommandInvoker() invoker.execute(UpdateScoreCommand(repo, "Archive", 0.75)) invoker.undo() invoker.execute(UpdateScoreCommand(repo, "Archive", 0.80)) invoker.redo() # Should have no effect assert repo.get_score("Archive") == 0.80 ``` --- ## Integration Tests ### 1. 
RAG Pipeline Integration ```python # tests/integration/test_rag_integration.py import pytest from specificity.rag import SpecificityFilteredRetriever from specificity.repository import InMemoryScoreRepository class TestSpecificityFilteredRetriever: """Tests for RAG retriever with specificity filtering.""" @pytest.fixture def retriever(self): repo = InMemoryScoreRepository() # Set up scores for archive_search template repo.set_score("Archive", 0.95, "archive_search") repo.set_score("Collection", 0.85, "archive_search") repo.set_score("Location", 0.70, "archive_search") repo.set_score("Museum", 0.20, "archive_search") repo.set_score("Gallery", 0.15, "archive_search") return SpecificityFilteredRetriever(repo, threshold=0.5) def test_filters_low_score_classes(self, retriever): """Filters out classes below threshold.""" classes = retriever.get_relevant_classes("archive_search") assert "Archive" in classes assert "Collection" in classes assert "Location" in classes assert "Museum" not in classes assert "Gallery" not in classes def test_ranks_by_score(self, retriever): """Returns classes ranked by specificity score.""" classes = retriever.get_relevant_classes("archive_search") assert classes.index("Archive") < classes.index("Collection") assert classes.index("Collection") < classes.index("Location") def test_threshold_adjustment(self, retriever): """Can adjust threshold at query time.""" classes = retriever.get_relevant_classes("archive_search", threshold=0.80) assert "Archive" in classes assert "Collection" in classes assert "Location" not in classes # 0.70 < 0.80 ``` ### 2. 
Template Classifier Integration ```python # tests/integration/test_template_classifier.py import pytest from specificity.classifier import TemplateClassifier class TestTemplateClassifier: """Tests for question-to-template classification.""" @pytest.fixture def classifier(self): return TemplateClassifier() @pytest.mark.parametrize("question,expected_template", [ ("What archives are in Drenthe?", "archive_search"), ("Which museums have art collections?", "museum_search"), ("Find libraries in Amsterdam", "library_search"), ("Who is the director of the Rijksmuseum?", "person_research"), ("Where is the National Archive located?", "location_browse"), ("What is the ISIL code for Amsterdam Museum?", "identifier_lookup"), ("When did the museum merge with the archive?", "organizational_change"), ]) def test_classifies_common_questions(self, classifier, question, expected_template): """Classifies common question patterns correctly.""" result = classifier.classify(question) assert result.template_id == expected_template def test_returns_confidence_score(self, classifier): """Returns confidence score with classification.""" result = classifier.classify("What archives are in Drenthe?") assert hasattr(result, "confidence") assert 0.0 <= result.confidence <= 1.0 def test_fallback_for_ambiguous_questions(self, classifier): """Falls back to general_heritage for ambiguous questions.""" result = classifier.classify("Tell me about heritage in the Netherlands") assert result.template_id == "general_heritage" ``` --- ## End-to-End Tests ### 1. 
Full Pipeline Test ```python # tests/e2e/test_full_pipeline.py import pytest from specificity.pipeline import SpecificityPipeline class TestFullPipeline: """End-to-end tests for complete specificity scoring pipeline.""" @pytest.fixture def pipeline(self, real_schema_dir): """Create pipeline with real schema files.""" return SpecificityPipeline(real_schema_dir) def test_archive_question_filters_correctly(self, pipeline): """Archive question filters to archive-relevant classes.""" question = "What archives in Noord-Holland have photo collections?" result = pipeline.process(question) # Should include archive-relevant classes assert "Archive" in result.relevant_classes assert "Collection" in result.relevant_classes assert "Location" in result.relevant_classes # Should exclude irrelevant classes assert "LinkedInConnectionExtraction" not in result.relevant_classes assert "PersonProfileExtraction" not in result.relevant_classes def test_person_question_includes_person_classes(self, pipeline): """Person research question includes person-relevant classes.""" question = "Who are the curators at the Rijksmuseum?" result = pipeline.process(question) assert "PersonProfile" in result.relevant_classes assert "Staff" in result.relevant_classes assert "HeritageCustodian" in result.relevant_classes def test_context_reduces_class_count(self, pipeline): """Template-specific filtering reduces total class count.""" # Without filtering all_classes = pipeline.get_all_classes() # With filtering question = "What archives are in Drenthe?" 
result = pipeline.process(question) # Filtered should be significantly smaller assert len(result.relevant_classes) < len(all_classes) * 0.5 ``` --- ## Test Fixtures ### Shared Fixtures ```python # tests/conftest.py import os import pytest from pathlib import Path import yaml @pytest.fixture def temp_schema_dir(tmp_path): """Create temporary schema directory with sample classes.""" classes_dir = tmp_path / "modules" / "classes" classes_dir.mkdir(parents=True) sample_classes = { "HeritageCustodian": { "description": "Base class for heritage institutions", "annotations": {"specificity_score": 0.10} }, "Archive": { "is_a": "HeritageCustodian", "description": "An archive institution", "annotations": { "specificity_score": 0.75, "template_specificity": { "archive_search": 0.95, "museum_search": 0.20 } } }, "Museum": { "is_a": "HeritageCustodian", "description": "A museum institution", "annotations": { "specificity_score": 0.75, "template_specificity": { "archive_search": 0.20, "museum_search": 0.95 } } }, "Location": { "description": "Geographic location", "annotations": { "specificity_score": 0.30, "custodian_types": ["*"] } } } for class_name, class_def in sample_classes.items(): yaml_path = classes_dir / f"{class_name}.yaml" yaml_path.write_text(yaml.dump({"classes": {class_name: class_def}})) return tmp_path @pytest.fixture def real_schema_dir(): """Path to real schema directory for integration tests. Resolved from the GLAM_SCHEMA_DIR environment variable (falling back to a repo-relative default) so the suite is portable; skips when the directory is unavailable.""" schema_dir = Path(os.environ.get("GLAM_SCHEMA_DIR", "schemas/20251121/linkml")) if not schema_dir.exists(): pytest.skip("real schema directory not available (set GLAM_SCHEMA_DIR)") return schema_dir class MockSchemaLoader: """Mock schema loader for unit tests.""" def __init__(self, classes: dict): self._classes = classes def get_class(self, class_name: str) -> dict: return self._classes.get(class_name, {}) ``` --- ## Test Coverage Targets | Component | Target Coverage | |-----------|-----------------| | Repository | 95% | | Strategies | 90% | | Calculator/Decorators | 90% | | Commands | 95% | | Classifier | 85% | | RAG Integration | 80% | | UML Integration | 80% | | **Overall** | **85%** | --- ## CI/CD 
Integration ```yaml # .github/workflows/specificity-tests.yml name: Specificity Score Tests on: push: paths: - 'src/specificity/**' - 'tests/**/*specificity*' - 'schemas/**/classes/**' jobs: test: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.11' - name: Install dependencies run: | pip install -r requirements-dev.txt - name: Run unit tests run: | pytest tests/unit -v --cov=src/specificity --cov-report=xml - name: Run integration tests run: | pytest tests/integration -v - name: Upload coverage uses: codecov/codecov-action@v4 with: files: coverage.xml ``` --- ## References - pytest documentation: https://docs.pytest.org/ - pytest-cov: https://pytest-cov.readthedocs.io/ - TDD best practices: https://martinfowler.com/bliki/TestDrivenDevelopment.html