""" Integration Tests for Ontology-Driven RAG Components Tests the end-to-end flow: 1. OntologyLoader → SynonymResolver integration 2. OntologyLoader → SchemaAwareSlotValidator integration 3. TTL-based caching behavior 4. Mock KG response handling These tests verify that the RAG system properly relies on the ontology and Knowledge Graph rather than hardcoded heuristics. Author: OpenCode Created: 2025-01-08 """ import pytest import time from unittest.mock import patch, MagicMock from typing import Any # ============================================================================= # FIXTURES # ============================================================================= @pytest.fixture def reset_singletons(): """Reset all singleton instances before each test. Note: SynonymResolver uses module-level _synonym_resolver (not class-level _instance) while OntologyLoader and SchemaAwareSlotValidator use class-level _instance. """ import backend.rag.template_sparql as module # Reset OntologyLoader (uses class-level _instance) if module.OntologyLoader._instance is not None: module.OntologyLoader._instance = None module.OntologyLoader._predicates = set() module.OntologyLoader._external_predicates = set() module.OntologyLoader._classes = set() module.OntologyLoader._slot_values = {} module.OntologyLoader._synonyms = {} module.OntologyLoader._enums = {} module.OntologyLoader._institution_type_codes = set() module.OntologyLoader._institution_type_mappings = {} module.OntologyLoader._subregion_mappings = {} module.OntologyLoader._country_mappings = {} module.OntologyLoader._loaded = False module.OntologyLoader._kg_cache = {} module.OntologyLoader._kg_cache_timestamps = {} module._ontology_loader = None # Reset SynonymResolver (uses module-level _synonym_resolver only, no class-level _instance) # Just reset the module-level variable module._synonym_resolver = None # Reset SchemaAwareSlotValidator (uses class-level _instance) if module.SchemaAwareSlotValidator._instance is not None: module.SchemaAwareSlotValidator._instance = None module.SchemaAwareSlotValidator._valid_values = {} module.SchemaAwareSlotValidator._synonym_maps = {} module.SchemaAwareSlotValidator._loaded = False module.SchemaAwareSlotValidator._kg_validation_cache = {} module.SchemaAwareSlotValidator._kg_validation_timestamps = {} module._schema_slot_validator = None yield # Cleanup after test module._ontology_loader = None module._synonym_resolver = None module._schema_slot_validator = None @pytest.fixture def mock_kg_responses(): """Mock Knowledge Graph responses for testing.""" return { "institution_types": {"M", "L", "A", "G", "O", "R", "C", "U", "B", "E", "S", "F", "I", "X", "P", "H", "D", "N", "T"}, "subregions": {"NL-NH", "NL-ZH", "NL-GE", "NL-NB", "NL-LI"}, "countries": {"NL", "BE", "DE", "FR", "GB"}, "cities": {"Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven"}, } # ============================================================================= # ONTOLOGY LOADER TESTS # ============================================================================= class TestOntologyLoaderIntegration: """Integration tests for OntologyLoader.""" def test_loader_loads_from_validation_rules(self, reset_singletons): """OntologyLoader should load institution type codes from validation rules JSON.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() loader.load() # Should have loaded institution type codes type_codes = loader.get_institution_type_codes() assert len(type_codes) > 0 assert "M" in type_codes # Museum assert "L" in type_codes # Library assert "A" in type_codes # Archive def test_loader_loads_institution_type_mappings(self, reset_singletons): """OntologyLoader should load institution type mappings.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() loader.load() mappings = loader.get_institution_type_mappings() assert len(mappings) > 0 # Check some common mappings assert "museum" in mappings or "MUSEUM" in mappings or any("museum" in k.lower() for k in mappings) def test_loader_loads_subregion_mappings(self, reset_singletons): """OntologyLoader should load subregion mappings.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() loader.load() mappings = loader.get_subregion_mappings() # May be empty if no subregion mappings in validation rules assert isinstance(mappings, dict) def test_loader_singleton_pattern(self, reset_singletons): """OntologyLoader should be a singleton.""" from backend.rag.template_sparql import get_ontology_loader loader1 = get_ontology_loader() loader2 = get_ontology_loader() assert loader1 is loader2 def test_loader_caches_loaded_state(self, reset_singletons): """OntologyLoader should only load once.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() # First load loader.load() assert loader._loaded is True # Second load should be no-op with patch.object(loader, '_load_from_validation_rules') as mock_load: loader.load() mock_load.assert_not_called() # ============================================================================= # ONTOLOGY LOADER → SYNONYM RESOLVER INTEGRATION # ============================================================================= class TestOntologyLoaderSynonymResolverIntegration: """Integration tests for OntologyLoader → SynonymResolver flow.""" def test_synonym_resolver_uses_ontology_type_codes(self, reset_singletons): """SynonymResolver should use type codes from OntologyLoader.""" from backend.rag.template_sparql import get_synonym_resolver, get_ontology_loader # First load ontology loader = get_ontology_loader() loader.load() ontology_codes = loader.get_institution_type_codes() # Then load synonym resolver resolver = get_synonym_resolver() resolver.load() # Resolver should have the same valid type codes assert resolver._valid_type_codes == ontology_codes def test_synonym_resolver_resolves_museum_to_M(self, reset_singletons): """SynonymResolver should resolve 'museum' to 'M' using ontology mappings.""" from backend.rag.template_sparql import get_synonym_resolver resolver = get_synonym_resolver() resolver.load() # Test various forms assert resolver.resolve_institution_type("museum") == "M" assert resolver.resolve_institution_type("musea") == "M" assert resolver.resolve_institution_type("Museum") == "M" def test_synonym_resolver_resolves_library_to_L(self, reset_singletons): """SynonymResolver should resolve library terms to 'L'.""" from backend.rag.template_sparql import get_synonym_resolver resolver = get_synonym_resolver() resolver.load() assert resolver.resolve_institution_type("library") == "L" assert resolver.resolve_institution_type("bibliotheek") == "L" def test_synonym_resolver_resolves_archive_to_A(self, reset_singletons): """SynonymResolver should resolve archive terms to 'A'.""" from backend.rag.template_sparql import get_synonym_resolver resolver = get_synonym_resolver() resolver.load() assert resolver.resolve_institution_type("archive") == "A" assert resolver.resolve_institution_type("archief") == "A" def test_synonym_resolver_accepts_valid_codes_directly(self, reset_singletons): """SynonymResolver should accept valid single-letter codes directly.""" from backend.rag.template_sparql import get_synonym_resolver resolver = get_synonym_resolver() resolver.load() # Direct codes should pass through assert resolver.resolve_institution_type("M") == "M" assert resolver.resolve_institution_type("L") == "L" assert resolver.resolve_institution_type("A") == "A" def test_synonym_resolver_uses_ontology_mappings_not_hardcoded(self, reset_singletons): """SynonymResolver should get mappings from OntologyLoader, not hardcoded strings.""" from backend.rag.template_sparql import get_synonym_resolver, get_ontology_loader resolver = get_synonym_resolver() resolver.load() # The valid_type_codes should come from OntologyLoader ontology_loader = get_ontology_loader() expected_codes = ontology_loader.get_institution_type_codes() # Resolver should have exactly the same codes assert resolver._valid_type_codes == expected_codes # Should NOT have hardcoded string "MLAGORCUBESFIXPHDNT" # Instead should have a set from the ontology assert isinstance(resolver._valid_type_codes, set) # ============================================================================= # ONTOLOGY LOADER → SLOT VALIDATOR INTEGRATION # ============================================================================= class TestOntologyLoaderSlotValidatorIntegration: """Integration tests for OntologyLoader → SchemaAwareSlotValidator flow.""" def test_slot_validator_loads_from_synonym_resolver(self, reset_singletons): """SlotValidator should load mappings from SynonymResolver (which uses OntologyLoader).""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() validator._load_validation_rules() # Should have institution type mappings assert "institution_type" in validator._synonym_maps assert len(validator._synonym_maps["institution_type"]) > 0 def test_slot_validator_validates_museum(self, reset_singletons): """SlotValidator should validate 'museum' → 'M'.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() result = validator.validate_slot("institution_type", "museum") assert result.valid is True assert result.corrected_value == "M" def test_slot_validator_validates_dutch_terms(self, reset_singletons): """SlotValidator should validate Dutch institution type terms.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() # Test Dutch terms result = validator.validate_slot("institution_type", "bibliotheek") assert result.valid is True assert result.corrected_value == "L" result = validator.validate_slot("institution_type", "archief") assert result.valid is True assert result.corrected_value == "A" def test_slot_validator_corrects_typos(self, reset_singletons): """SlotValidator should attempt to correct typos using fuzzy matching.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() # Typo in "museum" result = validator.validate_slot("institution_type", "musem", auto_correct=True) # Should either correct to M or flag as invalid with suggestion if result.valid: assert result.corrected_value == "M" else: assert len(result.suggestions) > 0 def test_slot_validator_validate_slots_batch(self, reset_singletons): """SlotValidator should validate multiple slots at once.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() slots = { "institution_type": "museum", "city": "Amsterdam", } results = validator.validate_slots(slots) assert "institution_type" in results assert results["institution_type"].corrected_value == "M" def test_get_corrected_slots(self, reset_singletons): """get_corrected_slots should return corrected values.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() slots = { "institution_type": "bibliotheek", } corrected = validator.get_corrected_slots(slots) assert corrected["institution_type"] == "L" # ============================================================================= # TTL-BASED CACHING TESTS # ============================================================================= class TestOntologyLoaderCaching: """Tests for OntologyLoader TTL-based caching.""" def test_kg_cache_ttl_default(self, reset_singletons): """OntologyLoader should have default TTL of 300 seconds.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() assert loader.get_kg_cache_ttl() == 300.0 def test_kg_cache_ttl_setter(self, reset_singletons): """Should be able to set KG cache TTL.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() loader.set_kg_cache_ttl(60.0) assert loader.get_kg_cache_ttl() == 60.0 def test_clear_kg_cache(self, reset_singletons): """clear_kg_cache should clear the cache.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() loader.load() # Add some mock cache entries loader._kg_cache["test_hash"] = {"value1", "value2"} loader._kg_cache_timestamps["test_hash"] = time.time() # Clear cache loader.clear_kg_cache() assert len(loader._kg_cache) == 0 assert len(loader._kg_cache_timestamps) == 0 def test_get_kg_cache_stats(self, reset_singletons): """get_kg_cache_stats should return cache statistics.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() loader.load() stats = loader.get_kg_cache_stats() assert "cache_size" in stats assert "ttl_seconds" in stats assert "entries" in stats def test_clear_all_cache(self, reset_singletons): """clear_all_cache should reset loader to initial state.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() loader.load() # Verify data is loaded assert loader._loaded is True assert len(loader._institution_type_codes) > 0 # Clear all cache loader.clear_all_cache() # Verify reset assert loader._loaded is False assert len(loader._institution_type_codes) == 0 assert len(loader._kg_cache) == 0 def test_kg_query_caching_behavior(self, reset_singletons): """KG queries should be cached and reused within TTL.""" from backend.rag.template_sparql import get_ontology_loader import hashlib loader = get_ontology_loader() # Set short TTL for testing loader.set_kg_cache_ttl(10.0) # Mock the actual HTTP request test_query = "SELECT ?x WHERE { ?x ?y ?z }" query_hash = hashlib.md5(test_query.encode()).hexdigest() # Pre-populate cache loader._kg_cache[query_hash] = {"value1", "value2"} loader._kg_cache_timestamps[query_hash] = time.time() # Query should return cached result result = loader._query_kg_for_values(test_query, use_cache=True) assert result == {"value1", "value2"} def test_kg_query_cache_expiration(self, reset_singletons): """Expired cache entries should trigger fresh query.""" from backend.rag.template_sparql import get_ontology_loader import hashlib loader = get_ontology_loader() loader.set_kg_cache_ttl(0.1) # Very short TTL test_query = "SELECT ?expired WHERE { ?x ?y ?z }" query_hash = hashlib.md5(test_query.encode()).hexdigest() # Pre-populate cache with old timestamp loader._kg_cache[query_hash] = {"old_value"} loader._kg_cache_timestamps[query_hash] = time.time() - 1.0 # 1 second ago # Wait for expiration time.sleep(0.2) # Mock HTTP to return empty (KG unavailable) with patch('urllib.request.urlopen') as mock_urlopen: mock_urlopen.side_effect = Exception("KG unavailable") # Should return stale cache on failure result = loader._query_kg_for_values(test_query, use_cache=True) # Returns stale cache because KG query failed assert result == {"old_value"} class TestSlotValidatorCaching: """Tests for SchemaAwareSlotValidator TTL-based caching.""" def test_kg_validation_ttl_default(self, reset_singletons): """SlotValidator should have default TTL of 300 seconds.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() assert validator.get_kg_validation_ttl() == 300.0 def test_kg_validation_ttl_setter(self, reset_singletons): """Should be able to set KG validation cache TTL.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() validator.set_kg_validation_ttl(120.0) assert validator.get_kg_validation_ttl() == 120.0 def test_clear_kg_validation_cache(self, reset_singletons): """clear_kg_validation_cache should clear the validation cache.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() # Add mock cache entry validator._kg_validation_cache["institution_type:M"] = True validator._kg_validation_timestamps["institution_type:M"] = time.time() # Clear cache validator.clear_kg_validation_cache() assert len(validator._kg_validation_cache) == 0 assert len(validator._kg_validation_timestamps) == 0 def test_get_kg_validation_cache_stats(self, reset_singletons): """get_kg_validation_cache_stats should return statistics.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() # Add some mock entries validator._kg_validation_cache["slot1:value1"] = True validator._kg_validation_cache["slot2:value2"] = False validator._kg_validation_timestamps["slot1:value1"] = time.time() validator._kg_validation_timestamps["slot2:value2"] = time.time() stats = validator.get_kg_validation_cache_stats() assert stats["cache_size"] == 2 assert stats["valid_entries"] == 1 assert stats["invalid_entries"] == 1 assert "ttl_seconds" in stats def test_kg_validation_caching_behavior(self, reset_singletons): """KG validations should be cached and reused within TTL.""" from backend.rag.template_sparql import get_schema_slot_validator validator = get_schema_slot_validator() validator.set_kg_validation_ttl(10.0) # Pre-populate cache cache_key = "institution_type:TEST" validator._kg_validation_cache[cache_key] = True validator._kg_validation_timestamps[cache_key] = time.time() # Validation should return cached result without calling OntologyLoader with patch('backend.rag.template_sparql.get_ontology_loader') as mock_loader: result = validator.validate_slot_against_kg("institution_type", "TEST", use_cache=True) # Should return cached result assert result is True # OntologyLoader should not be called mock_loader.assert_not_called() # ============================================================================= # MOCK KG RESPONSE TESTS # ============================================================================= class TestMockKGResponses: """Tests with mocked KG responses.""" def test_ontology_loader_with_mock_kg(self, reset_singletons, mock_kg_responses): """OntologyLoader should handle mock KG responses correctly.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() # Mock the KG query method def mock_query(query, use_cache=True): if "institutionType" in query: return mock_kg_responses["institution_types"] elif "subregionCode" in query: return mock_kg_responses["subregions"] elif "countryCode" in query: return mock_kg_responses["countries"] elif "settlementName" in query: return mock_kg_responses["cities"] return set() with patch.object(loader, '_query_kg_for_values', side_effect=mock_query): # Trigger KG loading loader._load_institution_types_from_kg() loader._load_subregions_from_kg() loader._load_countries_from_kg() loader._load_cities_from_kg() # Verify mock data was loaded assert loader._slot_values.get("institution_type") == mock_kg_responses["institution_types"] assert loader._slot_values.get("subregion") == mock_kg_responses["subregions"] def test_slot_validator_with_mock_kg_validation(self, reset_singletons, mock_kg_responses): """SlotValidator KG validation should work with mock responses.""" from backend.rag.template_sparql import get_schema_slot_validator, get_ontology_loader # Setup mock OntologyLoader loader = get_ontology_loader() loader._slot_values["institution_type"] = mock_kg_responses["institution_types"] loader._slot_values["city"] = mock_kg_responses["cities"] loader._loaded = True validator = get_schema_slot_validator() # Validate against mock KG data assert validator.validate_slot_against_kg("institution_type", "M") is True assert validator.validate_slot_against_kg("city", "Amsterdam") is True assert validator.validate_slot_against_kg("city", "NonexistentCity") is False def test_kg_unavailable_fallback(self, reset_singletons): """System should gracefully handle KG unavailability.""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() # Mock KG query to always fail with patch.object(loader, '_query_kg_for_values', return_value=set()): loader._load_institution_types_from_kg() # Should have empty slot values (no KG data) # But should not raise an error assert loader._slot_values.get("institution_type", set()) == set() def test_is_valid_value_with_empty_kg_data(self, reset_singletons): """is_valid_value should return True when KG has no data (assume valid).""" from backend.rag.template_sparql import get_ontology_loader loader = get_ontology_loader() loader._slot_values = {} # Empty KG data loader._loaded = True # Should return True (assume valid when no KG data) assert loader.is_valid_value("institution_type", "ANYTHING") is True # ============================================================================= # END-TO-END INTEGRATION TESTS # ============================================================================= class TestEndToEndOntologyFlow: """End-to-end tests for the complete ontology-driven flow.""" def test_full_validation_flow(self, reset_singletons): """Test complete flow: OntologyLoader → SynonymResolver → SlotValidator.""" from backend.rag.template_sparql import ( get_ontology_loader, get_synonym_resolver, get_schema_slot_validator ) # Step 1: Load ontology loader = get_ontology_loader() loader.load() # Step 2: Get synonym resolver (uses ontology) resolver = get_synonym_resolver() resolver.load() # Step 3: Get slot validator (uses resolver) validator = get_schema_slot_validator() # Step 4: Validate a slot value result = validator.validate_slot("institution_type", "museum") # Verify the chain worked assert result.valid is True assert result.corrected_value == "M" # The code "M" should be in the ontology's valid codes assert "M" in loader.get_institution_type_codes() def test_no_hardcoded_mlagorcubesfixphdnt(self, reset_singletons): """Verify the system doesn't rely on hardcoded 'MLAGORCUBESFIXPHDNT' string.""" from backend.rag.template_sparql import get_synonym_resolver resolver = get_synonym_resolver() resolver.load() # The valid type codes should be a set, not derived from a hardcoded string assert isinstance(resolver._valid_type_codes, set) # All 19 GLAMORCUBESFIXPHDNT codes should be present expected_codes = {"G", "L", "A", "M", "O", "R", "C", "U", "B", "E", "S", "F", "I", "X", "P", "H", "D", "N", "T"} assert resolver._valid_type_codes == expected_codes def test_validation_rules_json_is_source_of_truth(self, reset_singletons): """Verify that validation rules JSON is used as source of truth.""" from backend.rag.template_sparql import get_ontology_loader, VALIDATION_RULES_PATH import json loader = get_ontology_loader() loader.load() # Load rules directly if VALIDATION_RULES_PATH.exists(): with open(VALIDATION_RULES_PATH) as f: rules = json.load(f) # Check that HeritageTypeEnum values match loader's codes heritage_enum = rules.get("enums", {}).get("HeritageTypeEnum", {}) expected_codes = set(heritage_enum.get("values", [])) if expected_codes: assert loader.get_institution_type_codes() == expected_codes if __name__ == "__main__": pytest.main([__file__, "-v"])