glam/backend/rag/test_ontology_integration.py
2026-01-08 15:56:28 +01:00

681 lines
27 KiB
Python

"""
Integration Tests for Ontology-Driven RAG Components
Tests the end-to-end flow:
1. OntologyLoader → SynonymResolver integration
2. OntologyLoader → SchemaAwareSlotValidator integration
3. TTL-based caching behavior
4. Mock KG response handling
These tests verify that the RAG system properly relies on the ontology
and Knowledge Graph rather than hardcoded heuristics.
Author: OpenCode
Created: 2025-01-08
"""
import pytest
import time
from unittest.mock import patch, MagicMock
from typing import Any
# =============================================================================
# FIXTURES
# =============================================================================
def _reset_ontology_singletons(module):
    """Put every ontology-related singleton exposed by *module* back to empty.

    Three singletons are handled:

    * ``OntologyLoader`` - class-level ``_instance`` and data attributes,
      plus the module-level ``_ontology_loader`` handle.
    * ``SynonymResolver`` - module-level ``_synonym_resolver`` only; it has
      no class-level ``_instance``.
    * ``SchemaAwareSlotValidator`` - class-level ``_instance`` and data
      attributes, plus the module-level ``_schema_slot_validator`` handle.

    The reset is unconditional: assigning fresh empty containers is harmless
    when nothing was loaded, and it guarantees no stale class-level state
    survives even if ``_instance`` was already cleared some other way.
    """
    loader_cls = module.OntologyLoader
    loader_cls._instance = None
    loader_cls._predicates = set()
    loader_cls._external_predicates = set()
    loader_cls._classes = set()
    loader_cls._slot_values = {}
    loader_cls._synonyms = {}
    loader_cls._enums = {}
    loader_cls._institution_type_codes = set()
    loader_cls._institution_type_mappings = {}
    loader_cls._subregion_mappings = {}
    loader_cls._country_mappings = {}
    loader_cls._loaded = False
    loader_cls._kg_cache = {}
    loader_cls._kg_cache_timestamps = {}
    module._ontology_loader = None

    # SynonymResolver keeps its singleton in a module-level variable only.
    module._synonym_resolver = None

    validator_cls = module.SchemaAwareSlotValidator
    validator_cls._instance = None
    validator_cls._valid_values = {}
    validator_cls._synonym_maps = {}
    validator_cls._loaded = False
    validator_cls._kg_validation_cache = {}
    validator_cls._kg_validation_timestamps = {}
    module._schema_slot_validator = None


@pytest.fixture
def reset_singletons():
    """Reset all singleton instances before and after each test.

    The same full reset runs on setup and on teardown, so state a test
    loaded into the class-level singletons cannot leak into later tests,
    including tests that do not request this fixture.
    """
    # Imported lazily so that collecting this file does not import the
    # module under test.
    import backend.rag.template_sparql as module

    _reset_ontology_singletons(module)
    yield
    # Cleanup after test: mirror the setup instead of only dropping the
    # module-level handles.
    _reset_ontology_singletons(module)
@pytest.fixture
def mock_kg_responses():
    """Canned Knowledge Graph value sets, keyed by the kind of lookup."""
    type_codes = {
        "M", "L", "A", "G", "O", "R", "C", "U", "B", "E",
        "S", "F", "I", "X", "P", "H", "D", "N", "T",
    }
    subregion_codes = {"NL-NH", "NL-ZH", "NL-GE", "NL-NB", "NL-LI"}
    country_codes = {"NL", "BE", "DE", "FR", "GB"}
    city_names = {"Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven"}

    responses = {}
    responses["institution_types"] = type_codes
    responses["subregions"] = subregion_codes
    responses["countries"] = country_codes
    responses["cities"] = city_names
    return responses
# =============================================================================
# ONTOLOGY LOADER TESTS
# =============================================================================
class TestOntologyLoaderIntegration:
    """Integration checks that exercise OntologyLoader on its own."""

    @staticmethod
    def _loaded_loader():
        """Fetch the loader through its accessor and call ``load()`` on it."""
        from backend.rag.template_sparql import get_ontology_loader

        instance = get_ontology_loader()
        instance.load()
        return instance

    def test_loader_loads_from_validation_rules(self, reset_singletons):
        """After loading, the institution type codes include M, L and A."""
        codes = self._loaded_loader().get_institution_type_codes()

        assert len(codes) > 0
        # M = Museum, L = Library, A = Archive
        for expected in ("M", "L", "A"):
            assert expected in codes

    def test_loader_loads_institution_type_mappings(self, reset_singletons):
        """After loading, some institution type mapping key mentions 'museum'."""
        mappings = self._loaded_loader().get_institution_type_mappings()

        assert len(mappings) > 0
        mentions_museum = (
            "museum" in mappings
            or "MUSEUM" in mappings
            or any("museum" in key.lower() for key in mappings)
        )
        assert mentions_museum

    def test_loader_loads_subregion_mappings(self, reset_singletons):
        """Subregion mappings are exposed as a dict."""
        mappings = self._loaded_loader().get_subregion_mappings()

        # The dict may legitimately be empty when the validation rules
        # define no subregion mappings, so only the type is checked.
        assert isinstance(mappings, dict)

    def test_loader_singleton_pattern(self, reset_singletons):
        """Two accessor calls hand back the very same object."""
        from backend.rag.template_sparql import get_ontology_loader

        first, second = get_ontology_loader(), get_ontology_loader()
        assert first is second

    def test_loader_caches_loaded_state(self, reset_singletons):
        """A second ``load()`` does not re-read the validation rules."""
        loader = self._loaded_loader()
        assert loader._loaded is True

        # With the reader replaced by a spy, a repeat load must not touch it.
        with patch.object(loader, "_load_from_validation_rules") as reload_spy:
            loader.load()
            reload_spy.assert_not_called()
# =============================================================================
# ONTOLOGY LOADER → SYNONYM RESOLVER INTEGRATION
# =============================================================================
class TestOntologyLoaderSynonymResolverIntegration:
    """Checks that SynonymResolver takes its data from OntologyLoader."""

    @staticmethod
    def _loaded_resolver():
        """Fetch the resolver through its accessor and call ``load()`` on it."""
        from backend.rag.template_sparql import get_synonym_resolver

        instance = get_synonym_resolver()
        instance.load()
        return instance

    def _assert_all_resolve_to(self, terms, code):
        """Assert that every term in *terms* resolves to *code*."""
        resolver = self._loaded_resolver()
        for term in terms:
            assert resolver.resolve_institution_type(term) == code

    def test_synonym_resolver_uses_ontology_type_codes(self, reset_singletons):
        """The resolver's valid codes equal the codes the loader reports."""
        from backend.rag.template_sparql import get_ontology_loader

        # The ontology is loaded first ...
        ontology = get_ontology_loader()
        ontology.load()
        codes_from_ontology = ontology.get_institution_type_codes()

        # ... and the resolver afterwards.
        resolver = self._loaded_resolver()

        assert resolver._valid_type_codes == codes_from_ontology

    def test_synonym_resolver_resolves_museum_to_M(self, reset_singletons):
        """Singular, plural and capitalised museum terms resolve to 'M'."""
        self._assert_all_resolve_to(("museum", "musea", "Museum"), "M")

    def test_synonym_resolver_resolves_library_to_L(self, reset_singletons):
        """English and Dutch library terms resolve to 'L'."""
        self._assert_all_resolve_to(("library", "bibliotheek"), "L")

    def test_synonym_resolver_resolves_archive_to_A(self, reset_singletons):
        """English and Dutch archive terms resolve to 'A'."""
        self._assert_all_resolve_to(("archive", "archief"), "A")

    def test_synonym_resolver_accepts_valid_codes_directly(self, reset_singletons):
        """A single-letter code that is already valid comes back unchanged."""
        resolver = self._loaded_resolver()

        for code in ("M", "L", "A"):
            assert resolver.resolve_institution_type(code) == code

    def test_synonym_resolver_uses_ontology_mappings_not_hardcoded(self, reset_singletons):
        """The valid codes are a set equal to the loader's, not a literal string."""
        from backend.rag.template_sparql import get_ontology_loader

        resolver = self._loaded_resolver()

        # Here the loader is fetched only after the resolver has loaded.
        codes_from_ontology = get_ontology_loader().get_institution_type_codes()
        assert resolver._valid_type_codes == codes_from_ontology

        # A set, rather than the string "MLAGORCUBESFIXPHDNT", is expected.
        assert isinstance(resolver._valid_type_codes, set)
# =============================================================================
# ONTOLOGY LOADER → SLOT VALIDATOR INTEGRATION
# =============================================================================
class TestOntologyLoaderSlotValidatorIntegration:
    """Checks for the OntologyLoader -> SchemaAwareSlotValidator path."""

    @staticmethod
    def _validator():
        """Fetch the slot validator through its accessor."""
        from backend.rag.template_sparql import get_schema_slot_validator

        return get_schema_slot_validator()

    def test_slot_validator_loads_from_synonym_resolver(self, reset_singletons):
        """Loading validation rules fills the institution_type synonym map."""
        validator = self._validator()
        validator._load_validation_rules()

        synonym_maps = validator._synonym_maps
        assert "institution_type" in synonym_maps
        assert len(synonym_maps["institution_type"]) > 0

    def test_slot_validator_validates_museum(self, reset_singletons):
        """'museum' is valid and is corrected to 'M'."""
        outcome = self._validator().validate_slot("institution_type", "museum")

        assert outcome.valid is True
        assert outcome.corrected_value == "M"

    def test_slot_validator_validates_dutch_terms(self, reset_singletons):
        """Dutch institution type terms validate to their codes."""
        validator = self._validator()

        for dutch_term, code in (("bibliotheek", "L"), ("archief", "A")):
            outcome = validator.validate_slot("institution_type", dutch_term)
            assert outcome.valid is True
            assert outcome.corrected_value == code

    def test_slot_validator_corrects_typos(self, reset_singletons):
        """A misspelt 'museum' is either corrected to 'M' or yields suggestions."""
        outcome = self._validator().validate_slot(
            "institution_type", "musem", auto_correct=True
        )

        # Both outcomes are accepted: a correction or a rejection with hints.
        if not outcome.valid:
            assert len(outcome.suggestions) > 0
        else:
            assert outcome.corrected_value == "M"

    def test_slot_validator_validate_slots_batch(self, reset_singletons):
        """``validate_slots`` reports a per-slot result for a dict of slots."""
        batch = {"institution_type": "museum", "city": "Amsterdam"}

        outcomes = self._validator().validate_slots(batch)

        assert "institution_type" in outcomes
        assert outcomes["institution_type"].corrected_value == "M"

    def test_get_corrected_slots(self, reset_singletons):
        """``get_corrected_slots`` maps 'bibliotheek' to 'L'."""
        fixed = self._validator().get_corrected_slots(
            {"institution_type": "bibliotheek"}
        )

        assert fixed["institution_type"] == "L"
# =============================================================================
# TTL-BASED CACHING TESTS
# =============================================================================
class TestOntologyLoaderCaching:
    """Tests for OntologyLoader TTL-based caching."""

    @staticmethod
    def _cache_key(query):
        """Return the MD5 hex digest of *query*, used here as the KG cache key."""
        import hashlib

        return hashlib.md5(query.encode()).hexdigest()

    def test_kg_cache_ttl_default(self, reset_singletons):
        """OntologyLoader should have default TTL of 300 seconds."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()
        assert loader.get_kg_cache_ttl() == 300.0

    def test_kg_cache_ttl_setter(self, reset_singletons):
        """Should be able to set KG cache TTL."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()
        loader.set_kg_cache_ttl(60.0)
        assert loader.get_kg_cache_ttl() == 60.0

    def test_clear_kg_cache(self, reset_singletons):
        """clear_kg_cache should clear the cache and its timestamps."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()
        loader.load()

        # Add a mock cache entry so there is something to clear.
        loader._kg_cache["test_hash"] = {"value1", "value2"}
        loader._kg_cache_timestamps["test_hash"] = time.time()

        loader.clear_kg_cache()

        assert len(loader._kg_cache) == 0
        assert len(loader._kg_cache_timestamps) == 0

    def test_get_kg_cache_stats(self, reset_singletons):
        """get_kg_cache_stats should return cache statistics."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()
        loader.load()

        stats = loader.get_kg_cache_stats()
        for key in ("cache_size", "ttl_seconds", "entries"):
            assert key in stats

    def test_clear_all_cache(self, reset_singletons):
        """clear_all_cache should reset loader to initial state."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()
        loader.load()

        # Verify data is loaded
        assert loader._loaded is True
        assert len(loader._institution_type_codes) > 0

        loader.clear_all_cache()

        # Verify reset
        assert loader._loaded is False
        assert len(loader._institution_type_codes) == 0
        assert len(loader._kg_cache) == 0

    def test_kg_query_caching_behavior(self, reset_singletons):
        """KG queries should be served from the cache within the TTL."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()
        # TTL comfortably longer than the test, so the entry stays fresh.
        loader.set_kg_cache_ttl(10.0)

        test_query = "SELECT ?x WHERE { ?x ?y ?z }"
        query_hash = self._cache_key(test_query)

        # Pre-populate cache with a fresh entry.
        loader._kg_cache[query_hash] = {"value1", "value2"}
        loader._kg_cache_timestamps[query_hash] = time.time()

        # Guard the HTTP layer: a cache hit must not go to the network, and
        # a miss must not silently depend on a real KG endpoint.
        with patch('urllib.request.urlopen') as mock_urlopen:
            result = loader._query_kg_for_values(test_query, use_cache=True)

        assert result == {"value1", "value2"}
        mock_urlopen.assert_not_called()

    def test_kg_query_cache_expiration(self, reset_singletons):
        """An expired entry is returned as stale data when the KG query fails."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()
        loader.set_kg_cache_ttl(0.1)  # Very short TTL

        test_query = "SELECT ?expired WHERE { ?x ?y ?z }"
        query_hash = self._cache_key(test_query)

        # The entry is back-dated by 1 second, ten times the TTL, so it is
        # already expired and no sleep is needed.
        loader._kg_cache[query_hash] = {"old_value"}
        loader._kg_cache_timestamps[query_hash] = time.time() - 1.0

        # Simulate an unavailable KG.
        with patch('urllib.request.urlopen') as mock_urlopen:
            mock_urlopen.side_effect = Exception("KG unavailable")
            result = loader._query_kg_for_values(test_query, use_cache=True)

        # Returns stale cache because KG query failed
        assert result == {"old_value"}
class TestSlotValidatorCaching:
    """TTL-cache checks for SchemaAwareSlotValidator's KG validation."""

    @staticmethod
    def _validator():
        """Fetch the slot validator through its accessor."""
        from backend.rag.template_sparql import get_schema_slot_validator

        return get_schema_slot_validator()

    def test_kg_validation_ttl_default(self, reset_singletons):
        """A fresh validator reports a TTL of 300 seconds."""
        assert self._validator().get_kg_validation_ttl() == 300.0

    def test_kg_validation_ttl_setter(self, reset_singletons):
        """A TTL written with the setter is read back by the getter."""
        validator = self._validator()

        validator.set_kg_validation_ttl(120.0)

        assert validator.get_kg_validation_ttl() == 120.0

    def test_clear_kg_validation_cache(self, reset_singletons):
        """Clearing empties both the result map and the timestamp map."""
        validator = self._validator()
        key = "institution_type:M"

        # Seed one entry so there is something to clear.
        validator._kg_validation_cache[key] = True
        validator._kg_validation_timestamps[key] = time.time()

        validator.clear_kg_validation_cache()

        assert len(validator._kg_validation_cache) == 0
        assert len(validator._kg_validation_timestamps) == 0

    def test_get_kg_validation_cache_stats(self, reset_singletons):
        """Stats count the total, valid and invalid cached entries."""
        validator = self._validator()

        # One valid and one invalid cached verdict.
        seeded = {"slot1:value1": True, "slot2:value2": False}
        for key, verdict in seeded.items():
            validator._kg_validation_cache[key] = verdict
        for key in seeded:
            validator._kg_validation_timestamps[key] = time.time()

        stats = validator.get_kg_validation_cache_stats()

        assert stats["cache_size"] == 2
        assert stats["valid_entries"] == 1
        assert stats["invalid_entries"] == 1
        assert "ttl_seconds" in stats

    def test_kg_validation_caching_behavior(self, reset_singletons):
        """A fresh cached verdict is returned without fetching the loader."""
        validator = self._validator()
        validator.set_kg_validation_ttl(10.0)

        # Seed a fresh verdict for the value about to be validated.
        key = "institution_type:TEST"
        validator._kg_validation_cache[key] = True
        validator._kg_validation_timestamps[key] = time.time()

        with patch('backend.rag.template_sparql.get_ontology_loader') as loader_accessor:
            verdict = validator.validate_slot_against_kg(
                "institution_type", "TEST", use_cache=True
            )
            # The cached verdict is returned ...
            assert verdict is True
            # ... and the loader accessor is never called.
            loader_accessor.assert_not_called()
# =============================================================================
# MOCK KG RESPONSE TESTS
# =============================================================================
class TestMockKGResponses:
    """Checks that run against canned Knowledge Graph answers."""

    def test_ontology_loader_with_mock_kg(self, reset_singletons, mock_kg_responses):
        """Values returned by a stubbed KG query land in the loader's slot values."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()

        # (substring looked for in the query, key of the canned answer),
        # checked in this order; the first match wins.
        routing = (
            ("institutionType", "institution_types"),
            ("subregionCode", "subregions"),
            ("countryCode", "countries"),
            ("settlementName", "cities"),
        )

        def fake_kg_query(query, use_cache=True):
            for marker, response_key in routing:
                if marker in query:
                    return mock_kg_responses[response_key]
            return set()

        with patch.object(loader, '_query_kg_for_values', side_effect=fake_kg_query):
            # Run each KG-backed load step against the stub.
            loader._load_institution_types_from_kg()
            loader._load_subregions_from_kg()
            loader._load_countries_from_kg()
            loader._load_cities_from_kg()

        slot_values = loader._slot_values
        assert slot_values.get("institution_type") == mock_kg_responses["institution_types"]
        assert slot_values.get("subregion") == mock_kg_responses["subregions"]

    def test_slot_validator_with_mock_kg_validation(self, reset_singletons, mock_kg_responses):
        """KG validation accepts seeded values and rejects an unknown city."""
        from backend.rag.template_sparql import get_schema_slot_validator, get_ontology_loader

        # Seed the loader directly and mark it as loaded.
        loader = get_ontology_loader()
        loader._slot_values["institution_type"] = mock_kg_responses["institution_types"]
        loader._slot_values["city"] = mock_kg_responses["cities"]
        loader._loaded = True

        validator = get_schema_slot_validator()

        expectations = (
            ("institution_type", "M", True),
            ("city", "Amsterdam", True),
            ("city", "NonexistentCity", False),
        )
        for slot, value, expected in expectations:
            assert validator.validate_slot_against_kg(slot, value) is expected

    def test_kg_unavailable_fallback(self, reset_singletons):
        """An empty KG answer leaves institution_type empty and raises nothing."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()

        # Every KG query returns an empty set.
        with patch.object(loader, '_query_kg_for_values', return_value=set()):
            loader._load_institution_types_from_kg()

        # No KG data is expected for the slot; a missing key counts as empty.
        assert loader._slot_values.get("institution_type", set()) == set()

    def test_is_valid_value_with_empty_kg_data(self, reset_singletons):
        """With no slot values at all, any value is reported as valid."""
        from backend.rag.template_sparql import get_ontology_loader

        loader = get_ontology_loader()
        loader._slot_values = {}  # no KG data
        loader._loaded = True

        assert loader.is_valid_value("institution_type", "ANYTHING") is True
# =============================================================================
# END-TO-END INTEGRATION TESTS
# =============================================================================
class TestEndToEndOntologyFlow:
    """End-to-end tests for the complete ontology-driven flow."""

    def test_full_validation_flow(self, reset_singletons):
        """Test complete flow: OntologyLoader -> SynonymResolver -> SlotValidator."""
        from backend.rag.template_sparql import (
            get_ontology_loader,
            get_synonym_resolver,
            get_schema_slot_validator,
        )

        # Step 1: Load ontology
        loader = get_ontology_loader()
        loader.load()

        # Step 2: Load the synonym resolver
        resolver = get_synonym_resolver()
        resolver.load()

        # Step 3: Get slot validator
        validator = get_schema_slot_validator()

        # Step 4: Validate a slot value
        result = validator.validate_slot("institution_type", "museum")

        # Verify the chain worked
        assert result.valid is True
        assert result.corrected_value == "M"

        # The code "M" should be in the ontology's valid codes
        assert "M" in loader.get_institution_type_codes()

    def test_no_hardcoded_mlagorcubesfixphdnt(self, reset_singletons):
        """The resolver's valid codes are a set holding exactly the 19 expected codes."""
        from backend.rag.template_sparql import get_synonym_resolver

        resolver = get_synonym_resolver()
        resolver.load()

        # The valid type codes should be a set, not a hardcoded string
        assert isinstance(resolver._valid_type_codes, set)

        # All 19 GLAMORCUBESFIXPHDNT codes should be present
        expected_codes = {
            "G", "L", "A", "M", "O", "R", "C", "U", "B", "E",
            "S", "F", "I", "X", "P", "H", "D", "N", "T",
        }
        assert resolver._valid_type_codes == expected_codes

    def test_validation_rules_json_is_source_of_truth(self, reset_singletons):
        """The loader's type codes equal HeritageTypeEnum's values in the rules JSON.

        The test is skipped, rather than silently passing, when the rules
        file is missing or the enum lists no values, because in those cases
        there is nothing to compare against.
        """
        from backend.rag.template_sparql import get_ontology_loader, VALIDATION_RULES_PATH
        import json

        loader = get_ontology_loader()
        loader.load()

        if not VALIDATION_RULES_PATH.exists():
            pytest.skip(f"validation rules file not found: {VALIDATION_RULES_PATH}")

        # Explicit encoding so parsing does not depend on the platform locale.
        with open(VALIDATION_RULES_PATH, encoding="utf-8") as f:
            rules = json.load(f)

        # Check that HeritageTypeEnum values match loader's codes
        heritage_enum = rules.get("enums", {}).get("HeritageTypeEnum", {})
        expected_codes = set(heritage_enum.get("values", []))
        if not expected_codes:
            pytest.skip("HeritageTypeEnum has no values in the validation rules")

        assert loader.get_institution_type_codes() == expected_codes
if __name__ == "__main__":
    # pytest.main returns the run's exit code; pass it to the interpreter so
    # that running this file directly exits non-zero when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))