#!/usr/bin/env python3
"""
Minimal test script to debug collision resolution.

Test if Q-numbers are being assigned to institutions in collision groups.

The script builds two institutions that share one base GHCID, runs them
through ``GHCIDCollisionDetector.detect_collisions`` and
``GHCIDCollisionDetector.resolve_collisions``, prints what came back, and
checks that each resolved institution has a ``-Q`` suffix in its GHCID and
at least two GHCID history entries.

Exit status: 0 when every check passes, 1 otherwise.
"""

from datetime import datetime, timezone
from pathlib import Path
import sys
from typing import Optional

# Add project root to path (must run before the glam_extractor imports below)
sys.path.insert(0, str(Path(__file__).parent.parent))

from glam_extractor.models import (  # noqa: E402
    HeritageCustodian,
    Location,
    Identifier,
    Provenance,
    InstitutionType,
    DataSource,
    DataTier
)
from glam_extractor.identifiers.collision_detector import GHCIDCollisionDetector  # noqa: E402

# Base GHCID deliberately shared by both test institutions to force a collision.
_BASE_GHCID = "NL-NH-AMS-M-SM"

# Marker this script looks for in a resolved GHCID.
_Q_MARKER = "-Q"

# Minimum number of history entries the verification step requires.
_MIN_HISTORY_ENTRIES = 2

_RULE = "=" * 80


def create_test_institution(
    name: str,
    city: str,
    ghcid: str,
    ghcid_numeric: int,
    wikidata_qid: Optional[str] = None,
    extraction_date: Optional[datetime] = None,
) -> HeritageCustodian:
    """Create a test institution with minimal required fields.

    Args:
        name: Institution name; also used (lower-cased, spaces replaced by
            hyphens) to build the test ``id`` URL.
        city: City for the single ``Location`` (country is fixed to "NL").
        ghcid: GHCID string to assign.
        ghcid_numeric: Numeric GHCID to assign.
        wikidata_qid: Optional Wikidata Q-number. When given (and non-empty)
            a single Wikidata ``Identifier`` is attached; otherwise
            ``identifiers`` is passed as ``None``.
        extraction_date: Optional provenance timestamp. Defaults to the
            current UTC time, which is what this helper always used before
            the parameter existed.

    Returns:
        A ``HeritageCustodian`` of type ``InstitutionType.MUSEUM``.
    """
    identifiers = []
    if wikidata_qid:
        identifiers.append(
            Identifier(
                identifier_scheme="Wikidata",
                identifier_value=wikidata_qid,
                identifier_url=f"https://www.wikidata.org/wiki/{wikidata_qid}"
            )
        )

    if extraction_date is None:
        extraction_date = datetime.now(timezone.utc)

    return HeritageCustodian(
        id=f"https://w3id.org/heritage/custodian/test/{name.lower().replace(' ', '-')}",
        name=name,
        institution_type=InstitutionType.MUSEUM,
        locations=[
            Location(city=city, country="NL")
        ],
        # An empty list is passed as None, as before.
        identifiers=identifiers or None,
        ghcid=ghcid,
        ghcid_numeric=ghcid_numeric,
        provenance=Provenance(
            data_source=DataSource.ISIL_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=extraction_date,
            extraction_method="Manual test creation"
        )
    )


def _has_q_number(ghcid: Optional[str]) -> bool:
    """Return True when *ghcid* is a non-empty string containing ``-Q``."""
    # A missing GHCID counts as "no Q-number" instead of raising TypeError,
    # so the script reports a failure rather than crashing.
    return bool(ghcid) and _Q_MARKER in ghcid


def _print_institution(index: int, inst: HeritageCustodian) -> None:
    """Print the name, GHCID fields and first identifier value of *inst*."""
    first_identifier = inst.identifiers[0].identifier_value if inst.identifiers else 'None'
    print(f" Institution {index}: {inst.name}")
    print(f" - GHCID: {inst.ghcid}")
    print(f" - GHCID numeric: {inst.ghcid_numeric}")
    print(f" - Wikidata: {first_identifier}")


def _print_resolved(inst: HeritageCustodian) -> None:
    """Print the post-resolution GHCID and GHCID history of *inst*."""
    print(f" - {inst.name}")
    print(f" GHCID before: {_BASE_GHCID}")
    print(f" GHCID after: {inst.ghcid}")
    print(f" Q-number added: {_has_q_number(inst.ghcid)}")

    if inst.ghcid_history:
        print(f" GHCID history entries: {len(inst.ghcid_history)}")
        for entry in inst.ghcid_history:
            print(f" - {entry.ghcid} (valid {entry.valid_from} to {entry.valid_to})")
    else:
        print(" GHCID history: None")


def _verify_institution(inst: HeritageCustodian) -> bool:
    """Run both checks on *inst*, print each outcome, return True if both pass."""
    passed = True

    if _has_q_number(inst.ghcid):
        print(f" ✅ PASSED: {inst.name} has Q-number: {inst.ghcid}")
    else:
        print(f" ❌ FAILED: {inst.name} missing Q-number in GHCID")
        passed = False

    history = inst.ghcid_history
    if history and len(history) >= _MIN_HISTORY_ENTRIES:
        print(f" ✅ PASSED: {inst.name} has GHCID history with {len(history)} entries")
    else:
        print(f" ❌ FAILED: {inst.name} missing GHCID history")
        passed = False

    return passed


def main() -> bool:
    """Run the collision-resolution smoke test and print a report.

    Returns:
        True when every resolved institution passed both checks, else False.
    """
    print(_RULE)
    print("Testing GHCID Collision Resolution")
    print(_RULE)

    # Create a single extraction timestamp for both institutions
    extraction_timestamp = datetime.now(timezone.utc)

    # Create two institutions with the same base GHCID
    print("\n1. Creating test institutions...")
    inst1 = create_test_institution(
        name="Stedelijk Museum Amsterdam",
        city="Amsterdam",
        ghcid=_BASE_GHCID,
        ghcid_numeric=123456789012,
        wikidata_qid="Q621531",
        extraction_date=extraction_timestamp,
    )
    inst2 = create_test_institution(
        name="Science Museum Amsterdam",
        city="Amsterdam",
        ghcid=_BASE_GHCID,  # Same base GHCID!
        ghcid_numeric=987654321098,
        wikidata_qid="Q98765432",
        extraction_date=extraction_timestamp,
    )
    institutions = [inst1, inst2]

    for index, inst in enumerate(institutions, start=1):
        _print_institution(index, inst)

    # Detect collisions
    print("\n2. Detecting collisions...")
    detector = GHCIDCollisionDetector()  # Empty published_dataset
    collisions = detector.detect_collisions(institutions)

    print(f" Collisions detected: {len(collisions)}")
    for base_ghcid, collision_group in collisions.items():
        print(f" - {base_ghcid}: {len(collision_group.institutions)} institutions")
        print(f" Collision type: {collision_group.collision_type}")

    # Resolve collisions
    print("\n3. Resolving collisions...")
    resolved = detector.resolve_collisions(institutions)

    print(f" Resolved institutions: {len(resolved)}")
    for inst in resolved:
        _print_resolved(inst)

    # Check results
    print("\n4. Verification...")
    # Build the list first so every institution is checked and reported,
    # rather than stopping at the first failure.
    outcomes = [_verify_institution(inst) for inst in resolved]
    success = all(outcomes)

    print("\n" + _RULE)
    if success:
        print("✅ ALL TESTS PASSED")
    else:
        print("❌ SOME TESTS FAILED")
    print(_RULE)

    return success


if __name__ == "__main__":
    # Propagate the outcome to the shell so automation can detect failures.
    sys.exit(0 if main() else 1)