""" Layer 3: Integration Tests for Heritage RAG These tests verify: - API endpoint health - Oxigraph connectivity - End-to-end query processing - Sample query responses Requires: - Live Oxigraph instance - ANTHROPIC_API_KEY for LLM queries """ import os import time from typing import Any import httpx import pytest # Configuration OXIGRAPH_URL = os.environ.get("OXIGRAPH_ENDPOINT", "http://91.98.224.44:7878") API_BASE_URL = os.environ.get("API_BASE_URL", "http://localhost:8000") # ============================================================================= # Oxigraph Connectivity Tests # ============================================================================= class TestOxigraphConnectivity: """Test Oxigraph SPARQL endpoint connectivity.""" def test_oxigraph_health(self): """Verify Oxigraph is accessible.""" query = "SELECT (COUNT(*) as ?count) WHERE { ?s ?p ?o } LIMIT 1" response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=10.0, ) assert response.status_code == 200 data = response.json() assert "results" in data assert "bindings" in data["results"] def test_oxigraph_triple_count(self): """Verify Oxigraph has data loaded.""" query = "SELECT (COUNT(*) as ?count) WHERE { ?s ?p ?o }" response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=30.0, ) assert response.status_code == 200 data = response.json() count = int(data["results"]["bindings"][0]["count"]["value"]) # Should have substantial data assert count > 100000, f"Expected > 100k triples, got {count}" def test_dutch_institutions_exist(self): """Verify Dutch institution data is present.""" query = """ PREFIX hc: SELECT (COUNT(DISTINCT ?s) as ?count) WHERE { ?s hc:countryCode "NL" . } """ response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=30.0, ) assert response.status_code == 200 data = response.json() count = int(data["results"]["bindings"][0]["count"]["value"]) # Should have Dutch institutions assert count > 2000, f"Expected > 2000 Dutch institutions, got {count}" def test_dutch_institutions_with_coordinates(self): """Verify Dutch institutions have coordinate data.""" query = """ PREFIX hc: PREFIX schema: PREFIX geo: SELECT (COUNT(DISTINCT ?s) as ?count) WHERE { ?s hc:countryCode "NL" . ?s schema:location ?loc . ?loc geo:lat ?lat . } """ response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=30.0, ) assert response.status_code == 200 data = response.json() count = int(data["results"]["bindings"][0]["count"]["value"]) # Should have geocoded institutions assert count > 2500, f"Expected > 2500 Dutch institutions with coords, got {count}" def test_amsterdam_museums_query(self): """Test specific Amsterdam museums query.""" query = """ PREFIX hc: PREFIX schema: SELECT (COUNT(DISTINCT ?s) as ?count) WHERE { ?s hc:countryCode "NL" . ?s a schema:Museum . ?s schema:location ?loc . ?loc hc:city "Amsterdam" . } """ response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=30.0, ) assert response.status_code == 200 data = response.json() count = int(data["results"]["bindings"][0]["count"]["value"]) # Amsterdam should have many museums assert count > 50, f"Expected > 50 Amsterdam museums, got {count}" print(f"Found {count} museums in Amsterdam") # ============================================================================= # API Health Tests # ============================================================================= class TestAPIHealth: """Test API endpoint health.""" @pytest.fixture def client(self): """Create HTTP client.""" return httpx.Client(base_url=API_BASE_URL, timeout=30.0) @pytest.mark.skip(reason="API server not always running in CI") def test_dspy_rag_health(self, client): """Test DSPy RAG health endpoint.""" response = client.get("/api/dspy/rag/health") assert response.status_code == 200 data = response.json() assert data.get("status") == "ok" assert "components" in data @pytest.mark.skip(reason="API server not always running in CI") def test_dspy_rag_training_data(self, client): """Test training data endpoint.""" response = client.get("/api/dspy/rag/training-data") assert response.status_code == 200 data = response.json() assert data.get("total_training", 0) > 0 assert data.get("total_validation", 0) > 0 # ============================================================================= # Sample Query Tests # ============================================================================= class TestSampleQueries: """Test sample queries against live system.""" SAMPLE_QUERIES = [ { "question": "Hoeveel musea zijn er in Amsterdam?", "language": "nl", "expected_intent": "statistical", }, { "question": "Waar is het Rijksmuseum gevestigd?", "language": "nl", "expected_intent": "entity_lookup", }, { "question": "How many libraries are in the Netherlands?", "language": "en", "expected_intent": "statistical", }, ] @pytest.fixture def async_client(self): """Create async HTTP client.""" return httpx.AsyncClient(base_url=API_BASE_URL, timeout=60.0) @pytest.mark.skip(reason="API server not always running in CI") @pytest.mark.asyncio async def test_sample_queries(self, async_client): """Test sample queries return valid responses.""" for query in self.SAMPLE_QUERIES: start = time.time() response = await async_client.post( "/api/dspy/rag/query", json={ "question": query["question"], "language": query["language"], "include_visualization": False, "use_agent": False, }, ) duration_ms = (time.time() - start) * 1000 assert response.status_code == 200, f"Failed for: {query['question']}" data = response.json() # Verify response structure assert "answer" in data assert "intent" in data assert "confidence" in data # Verify answer is not empty assert data["answer"], f"Empty answer for: {query['question']}" # Log results print(f"\nQuery: {query['question'][:50]}...") print(f" Intent: {data['intent']} (expected: {query['expected_intent']})") print(f" Duration: {duration_ms:.0f}ms") print(f" Answer: {data['answer'][:100]}...") # ============================================================================= # Direct SPARQL Tests for Heritage Queries # ============================================================================= class TestHeritageSPARQL: """Test heritage-specific SPARQL queries directly.""" def test_count_museums_amsterdam(self): """Count museums in Amsterdam via SPARQL.""" query = """ PREFIX hc: PREFIX schema: SELECT (COUNT(DISTINCT ?s) as ?count) WHERE { ?s a schema:Museum . ?s hc:countryCode "NL" . ?s schema:location ?loc . ?loc hc:city "Amsterdam" . } """ response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=30.0, ) assert response.status_code == 200 data = response.json() count = int(data["results"]["bindings"][0]["count"]["value"]) print(f"Museums in Amsterdam: {count}") assert count > 0 def test_find_rijksmuseum(self): """Find Rijksmuseum by name.""" query = """ PREFIX hc: PREFIX schema: PREFIX skos: SELECT ?s ?name ?city WHERE { ?s skos:prefLabel ?name . FILTER(CONTAINS(LCASE(?name), "rijksmuseum")) ?s schema:location ?loc . ?loc hc:city ?city . } LIMIT 5 """ response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=30.0, ) assert response.status_code == 200 data = response.json() bindings = data["results"]["bindings"] assert len(bindings) > 0, "No Rijksmuseum found" # Should find Amsterdam Rijksmuseum names = [b["name"]["value"] for b in bindings] cities = [b.get("city", {}).get("value", "") for b in bindings] print(f"Found: {names}") print(f"Cities: {cities}") def test_count_libraries_nl(self): """Count libraries in Netherlands.""" query = """ PREFIX hc: PREFIX schema: SELECT (COUNT(DISTINCT ?s) as ?count) WHERE { ?s a schema:Library . ?s hc:countryCode "NL" . } """ response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=30.0, ) assert response.status_code == 200 data = response.json() count = int(data["results"]["bindings"][0]["count"]["value"]) print(f"Libraries in Netherlands: {count}") assert count > 100, f"Expected > 100 libraries, got {count}" def test_geographic_query_amsterdam(self): """Test geographic query near Amsterdam coordinates.""" # Amsterdam coordinates: 52.37, 4.89 query = """ PREFIX hc: PREFIX schema: PREFIX geo: PREFIX xsd: SELECT ?s ?name ?lat ?lon WHERE { ?s hc:countryCode "NL" . ?s skos:prefLabel ?name . ?s schema:location ?loc . ?loc geo:lat ?lat . ?loc geo:long ?lon . FILTER( xsd:decimal(?lat) > 52.3 && xsd:decimal(?lat) < 52.4 && xsd:decimal(?lon) > 4.8 && xsd:decimal(?lon) < 5.0 ) } LIMIT 10 """ response = httpx.post( f"{OXIGRAPH_URL}/query", data={"query": query}, headers={"Accept": "application/sparql-results+json"}, timeout=30.0, ) assert response.status_code == 200 data = response.json() bindings = data["results"]["bindings"] print(f"Found {len(bindings)} institutions near Amsterdam") for b in bindings[:5]: print(f" - {b.get('name', {}).get('value', 'N/A')}") if __name__ == "__main__": pytest.main([__file__, "-v", "--tb=short"])