glam/scripts/test_collision_resolution.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

152 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""
Minimal test script to debug collision resolution.
Test if Q-numbers are being assigned to institutions in collision groups.
"""
from datetime import datetime, timezone
from pathlib import Path
import sys
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from glam_extractor.models import (
HeritageCustodian, Location, Identifier, Provenance,
InstitutionType, DataSource, DataTier
)
from glam_extractor.identifiers.collision_detector import GHCIDCollisionDetector
def create_test_institution(
    name: str,
    city: str,
    ghcid: str,
    ghcid_numeric: int,
    wikidata_qid: str = None
) -> HeritageCustodian:
    """Build a museum-type ``HeritageCustodian`` fixture located in the Netherlands.

    Args:
        name: Institution name; also lower-cased and hyphenated to form the
            tail of the generated ``id`` URI.
        city: City used for the single ``Location`` entry (country is "NL").
        ghcid: GHCID string stored on the record.
        ghcid_numeric: Numeric GHCID stored on the record.
        wikidata_qid: Optional Wikidata Q-number. When truthy, a single
            Wikidata ``Identifier`` is attached; otherwise ``identifiers``
            is passed as ``None``.

    Returns:
        A ``HeritageCustodian`` whose provenance is stamped with the current
        UTC time and marked as a manually created test record.
    """
    # None (rather than an empty list) signals "no identifiers" to the model.
    wikidata_identifiers = None
    if wikidata_qid:
        wikidata_identifiers = [
            Identifier(
                identifier_scheme="Wikidata",
                identifier_value=wikidata_qid,
                identifier_url=f"https://www.wikidata.org/wiki/{wikidata_qid}",
            )
        ]

    slug = name.lower().replace(' ', '-')
    location = Location(city=city, country="NL")
    provenance = Provenance(
        data_source=DataSource.ISIL_REGISTRY,
        data_tier=DataTier.TIER_1_AUTHORITATIVE,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Manual test creation",
    )

    return HeritageCustodian(
        id=f"https://w3id.org/heritage/custodian/test/{slug}",
        name=name,
        institution_type=InstitutionType.MUSEUM,
        locations=[location],
        identifiers=wikidata_identifiers,
        ghcid=ghcid,
        ghcid_numeric=ghcid_numeric,
        provenance=provenance,
    )
def main() -> int:
    """Run an end-to-end GHCID collision-resolution check and print a report.

    Builds two test institutions that share one base GHCID, passes them to
    ``GHCIDCollisionDetector.detect_collisions`` and ``resolve_collisions``,
    and then verifies that every resolved institution has "-Q" in its GHCID
    and carries at least two GHCID history entries.

    Returns:
        0 when every verification check passed, 1 otherwise, so the value
        can be handed straight to ``sys.exit``.
    """
    print("=" * 80)
    print("Testing GHCID Collision Resolution")
    print("=" * 80)

    # Both institutions deliberately share this base GHCID to force a collision.
    shared_ghcid = "NL-NH-AMS-M-SM"

    # Create a single extraction timestamp for both institutions
    extraction_timestamp = datetime.now(timezone.utc)

    # Create two institutions with the same base GHCID
    print("\n1. Creating test institutions...")
    inst1 = create_test_institution(
        name="Stedelijk Museum Amsterdam",
        city="Amsterdam",
        ghcid=shared_ghcid,
        ghcid_numeric=123456789012,
        wikidata_qid="Q621531",
    )
    inst2 = create_test_institution(
        name="Science Museum Amsterdam",
        city="Amsterdam",
        ghcid=shared_ghcid,  # Same base GHCID!
        ghcid_numeric=987654321098,
        wikidata_qid="Q98765432",
    )
    institutions = [inst1, inst2]

    for number, inst in enumerate(institutions, start=1):
        # Override extraction_date so both records carry the same timestamp
        inst.provenance.extraction_date = extraction_timestamp
        wikidata = inst.identifiers[0].identifier_value if inst.identifiers else 'None'
        print(f" Institution {number}: {inst.name}")
        print(f" - GHCID: {inst.ghcid}")
        print(f" - GHCID numeric: {inst.ghcid_numeric}")
        print(f" - Wikidata: {wikidata}")

    # Detect collisions
    print("\n2. Detecting collisions...")
    detector = GHCIDCollisionDetector()  # Empty published_dataset
    collisions = detector.detect_collisions(institutions)
    print(f" Collisions detected: {len(collisions)}")
    for colliding_ghcid, collision_group in collisions.items():
        print(f" - {colliding_ghcid}: {len(collision_group.institutions)} institutions")
        print(f" Collision type: {collision_group.collision_type}")

    # Resolve collisions
    print("\n3. Resolving collisions...")
    resolved = detector.resolve_collisions(institutions)
    print(f" Resolved institutions: {len(resolved)}")
    for inst in resolved:
        print(f" - {inst.name}")
        print(f" GHCID before: {shared_ghcid}")
        print(f" GHCID after: {inst.ghcid}")
        # `or ""` keeps the membership test from raising if ghcid is None.
        print(f" Q-number added: {'-Q' in (inst.ghcid or '')}")
        if inst.ghcid_history:
            print(f" GHCID history entries: {len(inst.ghcid_history)}")
            for entry in inst.ghcid_history:
                print(f" - {entry.ghcid} (valid {entry.valid_from} to {entry.valid_to})")
        else:
            print(" GHCID history: None")

    # Check results
    print("\n4. Verification...")
    success = True
    for inst in resolved:
        # A missing GHCID counts as a failed check instead of a TypeError.
        if "-Q" not in (inst.ghcid or ""):
            print(f" ❌ FAILED: {inst.name} missing Q-number in GHCID")
            success = False
        else:
            print(f" ✅ PASSED: {inst.name} has Q-number: {inst.ghcid}")

        if not inst.ghcid_history or len(inst.ghcid_history) < 2:
            print(f" ❌ FAILED: {inst.name} missing GHCID history")
            success = False
        else:
            print(f" ✅ PASSED: {inst.name} has GHCID history with {len(inst.ghcid_history)} entries")

    print("\n" + "=" * 80)
    if success:
        print("✅ ALL TESTS PASSED")
    else:
        print("❌ SOME TESTS FAILED")
    print("=" * 80)

    # Report the outcome to the caller so failures are visible to the shell.
    return 0 if success else 1


if __name__ == "__main__":
    # Propagate the verification result as the process exit status.
    sys.exit(main())