#!/usr/bin/env python3
"""
Test extended entity annotation with real NDE entries.

Tests the updated LLM annotator with:
- EntityClaim.class_uri auto-population from hyponym
- RelationshipClaim extraction and processing
- Full _populate_session() with new fields

No real LLM call is made: the annotator is fed the canned
``MOCK_LLM_RESPONSE`` defined below.
"""

import asyncio
import json
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.annotators.llm_annotator import LLMAnnotator, LLMAnnotatorConfig, LLMProvider
from glam_extractor.annotators.base import (
    EntityClaim,
    RelationshipClaim,
    AnnotationSession,
    get_ontology_class,
)


# Sample LLM response that matches our updated prompt format
MOCK_LLM_RESPONSE = {
    "entities": [
        {
            "text": "Luther Museum Amsterdam",
            "hypernym": "GRP",
            "hyponym": "GRP.HER",
            "confidence": 0.95,
            "xpath": "/html/head/title",
        },
        {
            "text": "Maarten Luther",
            "hypernym": "AGT",
            "hyponym": "AGT.PER",
            "confidence": 0.90,
            "xpath": "/html/body/div[1]/p[1]",
        },
        {
            "text": "Amsterdam",
            "hypernym": "TOP",
            "hyponym": "TOP.SET",
            "confidence": 0.95,
            "xpath": "/html/body/div[1]/p[1]",
        },
        {
            "text": "De Stichting",
            "hypernym": "GRP",
            "hyponym": "GRP.ASS",
            "confidence": 0.85,
            "xpath": "/html/body/nav/a[2]",
        },
        {
            "text": "Katharina von Bora",
            "hypernym": "AGT",
            "hyponym": "AGT.PER",
            "confidence": 0.88,
            "xpath": "/html/body/div[2]/p[1]",
        },
    ],
    "relationships": [
        {
            "relationship_type": "REL.SPA.LOC",
            "subject": {"text": "Luther Museum Amsterdam", "type": "GRP.HER"},
            "object": {"text": "Amsterdam", "type": "TOP.SET"},
            "predicate": {"label": "located in"},
            "confidence": 0.92,
            "xpath": "/html/head/title",
        },
        {
            "relationship_type": "REL.SUB.ABT",
            "subject": {"text": "Luther Museum Amsterdam", "type": "GRP.HER"},
            "object": {"text": "Maarten Luther", "type": "AGT.PER"},
            "predicate": {"label": "about"},
            "confidence": 0.88,
            "xpath": "/html/body/div[1]",
        },
        {
            # Only relationship carrying a "temporal" block; used to check
            # temporal scope parsing. It deliberately has no "xpath".
            "relationship_type": "REL.SOC.FAM.SPO",
            "subject": {"text": "Maarten Luther", "type": "AGT.PER"},
            "object": {"text": "Katharina von Bora", "type": "AGT.PER"},
            "predicate": {"label": "spouse of"},
            "temporal": {"start_date": "1525-06-13", "end_date": "1546-02-18"},
            "confidence": 0.85,
        },
        {
            "relationship_type": "REL.ORG.FND",
            "subject": {"text": "De Stichting", "type": "GRP.ASS"},
            "object": {"text": "Luther Museum Amsterdam", "type": "GRP.HER"},
            "predicate": {"label": "founded"},
            "confidence": 0.75,
        },
    ],
    "layout_regions": [
        {
            "region": "DOC.HDR",
            "semantic_role": "PRIM",
            "xpath": "/html/head/title",
            "text_preview": "Luther Museum Amsterdam",
        },
        {
            "region": "DOC.NAV",
            "semantic_role": "NAV",
            "xpath": "/html/body/nav",
            "text_preview": "Over het museum, De Stichting, Agenda...",
        },
    ],
    "claims": [],
}


def test_populate_session_with_mock_response() -> AnnotationSession:
    """Test _populate_session with a mock LLM response.

    Feeds ``MOCK_LLM_RESPONSE`` into ``LLMAnnotator._populate_session`` and
    checks the resulting entity claims, relationship claims, temporal scope
    and the ``to_dict()`` serialization.

    Returns:
        The populated session, so the ``__main__`` runner can dump it as JSON.

    Raises:
        AssertionError: if any expectation on the populated session fails.
    """
    print("=" * 60)
    print("Test: _populate_session with extended entity annotation")
    print("=" * 60)

    # Create annotator config with dummy API key (we're not making real API calls)
    config = LLMAnnotatorConfig(
        provider=LLMProvider.ZAI,
        model="glm-4-flash",
        api_key="dummy-key-for-testing",  # Bypass API key validation
    )

    # Create annotator (we won't call the LLM, just test parsing)
    annotator = LLMAnnotator(config)

    # Create session
    session = AnnotationSession(
        agent_name="test-agent",
        agent_version="1.0.0",
        model_id="glm-4-flash",
        source_url="https://luthermuseum.nl/",
    )

    # Populate session with mock response
    annotator._populate_session(session, MOCK_LLM_RESPONSE, "https://luthermuseum.nl/")

    # Verify entity claims
    print(f"\n--- Entity Claims ({len(session.entity_claims)}) ---")
    for ec in session.entity_claims:
        print(f" [{ec.claim_id}] {ec.text_content}")
        print(f" Hypernym: {ec.hypernym.value if ec.hypernym else 'None'}")
        print(f" Hyponym: {ec.hyponym}")
        print(f" Class URI: {ec.class_uri}")
        print()

    # Verify relationship claims
    print(f"--- Relationship Claims ({len(session.relationship_claims)}) ---")
    for rc in session.relationship_claims:
        subj = rc.subject.span_text if rc.subject else "?"
        obj = rc.object.span_text if rc.object else "?"
        pred = rc.predicate.label if rc.predicate else "?"
        print(f" [{rc.claim_id}] {subj} --[{pred}]--> {obj}")
        print(f" Type: {rc.relationship_hyponym}")
        print(f" Predicate URIs: {rc.predicate_uris}")
        if rc.temporal_scope:
            print(f" Temporal: {rc.temporal_scope.start_date} to {rc.temporal_scope.end_date}")
        print()

    # Assertions
    assert len(session.entity_claims) == 5, f"Expected 5 entities, got {len(session.entity_claims)}"
    assert len(session.relationship_claims) == 4, f"Expected 4 relationships, got {len(session.relationship_claims)}"

    # Check class_uri auto-population
    museum_entity = next((e for e in session.entity_claims if "Museum" in e.text_content), None)
    assert museum_entity is not None, "Should have museum entity"
    assert museum_entity.class_uri == "glam:HeritageCustodian", f"Expected glam:HeritageCustodian, got {museum_entity.class_uri}"

    person_entity = next((e for e in session.entity_claims if "Maarten Luther" in e.text_content), None)
    assert person_entity is not None, "Should have person entity"
    assert person_entity.class_uri == "crm:E21_Person", f"Expected crm:E21_Person, got {person_entity.class_uri}"

    # Check relationship predicate_uris auto-population
    location_rel = next((r for r in session.relationship_claims if r.relationship_hyponym == "REL.SPA.LOC"), None)
    assert location_rel is not None, "Should have location relationship"
    assert len(location_rel.predicate_uris) > 0, "Should have predicate URIs"
    # Either of the two location predicates is accepted.
    assert (
        "schema:location" in location_rel.predicate_uris
        or "crm:P53_has_former_or_current_location" in location_rel.predicate_uris
    ), f"Expected a location predicate URI, got {location_rel.predicate_uris}"

    # Check temporal scope parsing
    spouse_rel = next((r for r in session.relationship_claims if "SPO" in (r.relationship_hyponym or "")), None)
    assert spouse_rel is not None, "Should have spouse relationship"
    assert spouse_rel.temporal_scope is not None, "Spouse relationship should have temporal scope"
    assert spouse_rel.temporal_scope.start_date == "1525-06-13"

    print("=" * 60)
    print("✓ All assertions passed!")
    print("=" * 60)

    # Test serialization
    print("\n--- Serialization Test ---")
    session_dict = session.to_dict()

    # Check entity claims in serialization
    entity_dicts = session_dict["claims"]["entity"]
    assert len(entity_dicts) == 5
    assert all("class_uri" in e for e in entity_dicts)
    print(f"✓ {len(entity_dicts)} entity claims serialized with class_uri")

    # Check relationship claims in serialization
    rel_dicts = session_dict["claims"]["relationship"]
    assert len(rel_dicts) == 4
    assert all("predicate_uris" in r for r in rel_dicts)
    assert all("relationship_hyponym" in r for r in rel_dicts)
    print(f"✓ {len(rel_dicts)} relationship claims serialized with predicate_uris")

    print("\n" + "=" * 60)
    print("All tests passed! ✓")
    print("=" * 60)

    return session


def test_ontology_class_mappings() -> None:
    """Test that ontology class mappings work correctly.

    Looks up each code with ``get_ontology_class`` and compares the result to
    the expected class URI. Every case is printed first so the full table is
    visible, then the test fails if any lookup did not match.

    Raises:
        AssertionError: if one or more codes map to an unexpected class.
    """
    print("\n--- Ontology Class Mapping Test ---")

    test_cases = [
        # Entity hyponyms
        ("GRP.HER", "glam:HeritageCustodian"),
        ("AGT.PER", "crm:E21_Person"),
        ("TOP.SET", "schema:City"),
        ("WRK.COL", "crm:E78_Curated_Holding"),
        ("GRP.GOV", "schema:GovernmentOrganization"),
        ("THG.EVT", "crm:E5_Event"),  # Events are under THG, not EVT
        # Hypernym fallbacks
        ("GRP", "crm:E74_Group"),
        ("AGT", "crm:E39_Actor"),
        ("TOP", "crm:E53_Place"),
    ]

    mismatches = []
    for code, expected in test_cases:
        result = get_ontology_class(code)
        status = "✓" if result == expected else "✗"
        print(f" {status} {code} -> {result} (expected: {expected})")
        if result != expected:
            print(" WARNING: Mismatch!")
            mismatches.append((code, result, expected))

    print()

    # Previously mismatches were only printed, so this test could never fail.
    assert not mismatches, "Ontology class mismatches (code, got, expected): " + ", ".join(
        f"({code}, {result}, {expected})" for code, result, expected in mismatches
    )


if __name__ == "__main__":
    test_ontology_class_mappings()
    session = test_populate_session_with_mock_response()

    # Optional: Print full JSON output
    print("\n--- Full Session JSON ---")
    preview_limit = 3000
    session_json = json.dumps(session.to_dict(), indent=2, default=str)
    # Only mark the preview as truncated when something was actually cut off.
    ellipsis = "..." if len(session_json) > preview_limit else ""
    print(session_json[:preview_limit] + ellipsis)