""" Integration Tests for GHCID Collision Detector with Real Dutch ISIL Registry Data Tests collision detection using actual heritage institutions from the Dutch ISIL registry. Real collision scenarios from data/ISIL-codes_2025-08-01.csv: - Amsterdam: 7 museums, 16 official institutions, 3 archives - Den Haag: 27 official institutions (ministries, research centers) - Alphen aan den Rijn: 2 archives - Arnhem: 3 museums - Plus 38 more collision groups These tests validate: 1. Collision detection with real institution names 2. GHCID generation from actual Dutch cities 3. Q-number assignment using real Wikidata IDs (when available) 4. Temporal collision resolution with realistic extraction dates 5. Integration between ISILRegistryParser and GHCIDCollisionDetector """ import pytest from datetime import datetime, timezone from pathlib import Path from glam_extractor.parsers.isil_registry import ISILRegistryParser from glam_extractor.identifiers.collision_detector import ( GHCIDCollisionDetector, CollisionGroup ) from glam_extractor.identifiers.ghcid import ( GHCIDGenerator, InstitutionType as GHCIDInstitutionType ) from glam_extractor.identifiers.lookups import get_ghcid_components_for_dutch_city from glam_extractor.models import ( HeritageCustodian, Identifier, InstitutionType, Location, Provenance, DataSource, DataTier ) @pytest.fixture def isil_csv_path() -> Path: """Path to real Dutch ISIL registry CSV""" return Path(__file__).parent.parent.parent / "data" / "ISIL-codes_2025-08-01.csv" @pytest.fixture def isil_parser() -> ISILRegistryParser: """ISIL registry parser instance""" return ISILRegistryParser() @pytest.fixture def collision_detector() -> GHCIDCollisionDetector: """Collision detector instance""" return GHCIDCollisionDetector() def generate_ghcid_for_institution(institution: HeritageCustodian) -> None: """ Helper function to generate GHCID for a HeritageCustodian instance. This function modifies the institution in-place, setting ghcid, ghcid_uuid, ghcid_uuid_sha256, and ghcid_numeric fields. For Dutch institutions only. Extracts city from locations, uses lookup table to get region code and city locode. Args: institution: HeritageCustodian instance to generate GHCID for """ # Extract required fields if not institution.locations or len(institution.locations) == 0: return # Cannot generate GHCID without location city = institution.locations[0].city country = institution.locations[0].country or "NL" # For Dutch cities, use lookup table component_dict = get_ghcid_components_for_dutch_city( city=city, institution_name=institution.name, institution_type=institution.institution_type ) if not component_dict: return # Cannot generate GHCID without components # Convert institution type string to GHCID InstitutionType enum # Model InstitutionType: "MUSEUM", "LIBRARY", etc. # GHCID InstitutionType: MUSEUM="M", LIBRARY="L", etc. try: ghcid_inst_type = GHCIDInstitutionType[institution.institution_type] except KeyError: # Fallback: try direct string match ghcid_inst_type = institution.institution_type # Generate GHCID components generator = GHCIDGenerator() components = generator.generate( institution_name=component_dict["institution_name"], english_name=component_dict["english_name"], institution_type=ghcid_inst_type, country_code=component_dict["country_code"], region_code=component_dict["region_code"], city_locode=component_dict["city_locode"] ) # Set GHCID fields on institution institution.ghcid = components.to_string() institution.ghcid_uuid = str(components.to_uuid()) institution.ghcid_uuid_sha256 = str(components.to_uuid_sha256()) institution.ghcid_numeric = components.to_numeric() class TestRealAmsterdamMuseums: """ Test collision detection with 7 real Amsterdam museums from ISIL registry: - Amsterdam Museum - Het Scheepvaartmuseum (HSM) - Joods Historisch Museum - Museum Ons' Lieve Heer op Solder - Rijksmuseum - Van Gogh Museum - Verzetsmuseum Amsterdam All share: city=Amsterdam, type=MUSEUM, country=NL Expected base GHCID: NL-NH-AMS-M-{abbreviation} """ def test_detect_amsterdam_museum_collisions( self, isil_csv_path: Path, isil_parser: ISILRegistryParser, collision_detector: GHCIDCollisionDetector ): """7 Amsterdam museums should be detected as collision group""" # Parse ISIL registry records = isil_parser.parse_file(isil_csv_path) # Filter for Amsterdam museums amsterdam_museums = [ r for r in records if r.plaats == "Amsterdam" and "museum" in r.instelling.lower() ] # Should find 7 museums assert len(amsterdam_museums) >= 7, f"Expected 7+ museums, found {len(amsterdam_museums)}" # Convert to HeritageCustodian institutions = [isil_parser.to_heritage_custodian(r) for r in amsterdam_museums] # All extracted on same date (simulating batch import) extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc) for inst in institutions: inst.provenance.extraction_date = extraction_date # Detect collisions collisions = collision_detector.detect_collisions(institutions) # Should detect collisions (institutions with same city + type may collide) # Note: Actual collision count depends on GHCID abbreviation uniqueness assert len(collisions) >= 0, "Should detect collision groups or have unique abbreviations" def test_amsterdam_museums_first_batch_resolution( self, isil_csv_path: Path, isil_parser: ISILRegistryParser, collision_detector: GHCIDCollisionDetector ): """First batch: All colliding Amsterdam museums get Q-numbers""" # Create test subset with known collision # Use institutions with same abbreviation potential institutions = [ HeritageCustodian( id="test-rijksmuseum", name="Rijksmuseum", institution_type=InstitutionType.MUSEUM, identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-AsdRM", identifier_url="https://isil.nl/NL-AsdRM" ), Identifier( identifier_scheme="Wikidata", identifier_value="Q190804", identifier_url="https://www.wikidata.org/wiki/Q190804" ) ], locations=[Location(city="Amsterdam", country="NL")], provenance=Provenance( data_source=DataSource.DUTCH_ORG_CSV, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc), extraction_method="ISILRegistryParser", confidence_score=1.0 ) ), HeritageCustodian( id="test-van-gogh", name="Van Gogh Museum", institution_type=InstitutionType.MUSEUM, identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-AsdVGM", identifier_url="https://isil.nl/NL-AsdVGM" ), Identifier( identifier_scheme="Wikidata", identifier_value="Q224124", identifier_url="https://www.wikidata.org/wiki/Q224124" ) ], locations=[Location(city="Amsterdam", country="NL")], provenance=Provenance( data_source=DataSource.DUTCH_ORG_CSV, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc), extraction_method="ISILRegistryParser", confidence_score=1.0 ) ) ] # Generate GHCIDs for institutions for inst in institutions: generate_ghcid_for_institution(inst) # Resolve collisions resolved = collision_detector.resolve_collisions(institutions) # Both should have GHCIDs assert all(inst.ghcid is not None for inst in resolved) # If they collided, both should have Q-numbers in GHCID # Check if any Q-numbers were added ghcids_with_q = [inst.ghcid for inst in resolved if inst.ghcid and '-Q' in inst.ghcid] # If collision occurred, both get Q-numbers (first batch rule) if ghcids_with_q: assert len(ghcids_with_q) == 2, "First batch: both institutions should have Q-numbers" assert 'Q190804' in ghcids_with_q[0] or 'Q224124' in ghcids_with_q[0] assert 'Q190804' in ghcids_with_q[1] or 'Q224124' in ghcids_with_q[1] class TestRealAlphenArchives: """ Test collision with 2 real Alphen aan den Rijn archives: - Gemeentearchief Alphen aan den Rijn (NL-AlGAADR) - Streekarchief Rijnlands Midden (NL-AlSARM) Both are archives in same city - potential GHCID collision """ def test_alphen_archives_collision_detection( self, isil_csv_path: Path, isil_parser: ISILRegistryParser, collision_detector: GHCIDCollisionDetector ): """Two Alphen archives should be detected as potential collision""" records = isil_parser.parse_file(isil_csv_path) # Filter for Alphen aan den Rijn archives alphen_archives = [ r for r in records if r.plaats == "Alphen aan den Rijn" and ( "archief" in r.instelling.lower() or "archive" in r.instelling.lower() ) ] # Should find 2 archives assert len(alphen_archives) >= 2, f"Expected 2+ archives, found {len(alphen_archives)}" # Convert to HeritageCustodian institutions = [isil_parser.to_heritage_custodian(r) for r in alphen_archives] # Same extraction date (batch import) extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc) for inst in institutions: inst.provenance.extraction_date = extraction_date # Detect collisions collisions = collision_detector.detect_collisions(institutions) # May or may not collide depending on abbreviation uniqueness # This test validates the detection logic works with real data assert isinstance(collisions, dict) class TestRealDenHaagOfficialInstitutions: """ Test collision with 27 real Den Haag official institutions: - Government ministries (15+) - Research centers (Planbureau, SCP, etc.) - Heritage institutions (RKD, Mauritshuis, etc.) Largest collision group in dataset - excellent stress test """ def test_den_haag_official_institutions_batch_size( self, isil_csv_path: Path, isil_parser: ISILRegistryParser ): """Verify Den Haag has 27+ official institutions""" records = isil_parser.parse_file(isil_csv_path) # Filter for Den Haag institutions (non-museum, non-archive, non-library) den_haag_official = [ r for r in records if r.plaats == "Den Haag" and not any( keyword in r.instelling.lower() for keyword in ["museum", "archief", "archive", "bibliotheek", "library"] ) ] # Should find 27+ institutions assert len(den_haag_official) >= 27, ( f"Expected 27+ official institutions in Den Haag, found {len(den_haag_official)}" ) def test_den_haag_collision_stress_test( self, isil_csv_path: Path, isil_parser: ISILRegistryParser, collision_detector: GHCIDCollisionDetector ): """Stress test: 27 institutions in same city with same type""" records = isil_parser.parse_file(isil_csv_path) # Get Den Haag official institutions den_haag_official = [ r for r in records if r.plaats == "Den Haag" and not any( keyword in r.instelling.lower() for keyword in ["museum", "archief", "archive", "bibliotheek", "library"] ) ] # Convert to HeritageCustodian institutions = [isil_parser.to_heritage_custodian(r) for r in den_haag_official[:10]] # Test with subset # Same extraction date (batch import) extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc) for inst in institutions: inst.provenance.extraction_date = extraction_date # Detect collisions - should handle large group collisions = collision_detector.detect_collisions(institutions) # Should complete without errors assert isinstance(collisions, dict) # Resolve collisions - should handle large group resolved = collision_detector.resolve_collisions(institutions) # Should have same number of institutions assert len(resolved) == len(institutions) class TestRealHistoricalAddition: """ Test historical addition scenario using real institutions Scenario: Rijksmuseum extracted in Nov 2025, then Joods Historisch Museum discovered in Dec 2025. Both Amsterdam museums. Expected: - Rijksmuseum GHCID preserved (if no collision) - JHM gets Q-number if collision occurs """ def test_historical_addition_with_real_museums(self): """Historical addition: New museum added later with collision""" # Published dataset (November 2025) rijksmuseum = HeritageCustodian( id="rijksmuseum-001", name="Rijksmuseum", institution_type=InstitutionType.MUSEUM, identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-AsdRM", identifier_url="https://isil.nl/NL-AsdRM" ), Identifier( identifier_scheme="Wikidata", identifier_value="Q190804", identifier_url="https://www.wikidata.org/wiki/Q190804" ) ], locations=[Location(city="Amsterdam", country="NL")], provenance=Provenance( data_source=DataSource.DUTCH_ORG_CSV, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc), extraction_method="ISILRegistryParser", confidence_score=1.0 ), ghcid="NL-NH-AMS-M-RM", # Already published ghcid_numeric=123456789012 ) # New institution (December 2025) jhm = HeritageCustodian( id="jhm-001", name="Joods Historisch Museum", institution_type=InstitutionType.MUSEUM, identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-AsdJHM", identifier_url="https://isil.nl/NL-AsdJHM" ), Identifier( identifier_scheme="Wikidata", identifier_value="Q924335", identifier_url="https://www.wikidata.org/wiki/Q924335" ) ], locations=[Location(city="Amsterdam", country="NL")], provenance=Provenance( data_source=DataSource.DUTCH_ORG_CSV, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime(2025, 12, 1, 14, 0, 0, tzinfo=timezone.utc), extraction_method="ISILRegistryParser - historical addition", confidence_score=1.0 ) ) # Generate GHCID for JHM (new institution needs GHCID before collision detection) generate_ghcid_for_institution(jhm) # Create collision detector with published dataset collision_detector = GHCIDCollisionDetector(published_dataset=[rijksmuseum]) # Resolve collision - new institution only resolved = collision_detector.resolve_collisions([jhm]) # Rijksmuseum GHCID should be unchanged (PID stability) assert rijksmuseum.ghcid == "NL-NH-AMS-M-RM" # JHM should have GHCID (may have Q-number if collision occurred) assert resolved[0].ghcid is not None # If collision occurred, only JHM gets Q-number if '-Q' in resolved[0].ghcid: assert 'Q924335' in resolved[0].ghcid assert resolved[0].ghcid_history is not None assert len(resolved[0].ghcid_history) >= 1 class TestRealCrossDatasetIntegration: """ Test integration between ISIL registry and Dutch organizations CSV Both datasets contain overlapping institutions - test collision resolution when merging datasets. """ def test_cross_dataset_collision_detection( self, isil_csv_path: Path, isil_parser: ISILRegistryParser, collision_detector: GHCIDCollisionDetector ): """Detect collisions when merging ISIL registry with Dutch orgs CSV""" # Parse ISIL registry (sample) isil_records = isil_parser.parse_file(isil_csv_path) isil_institutions = [ isil_parser.to_heritage_custodian(r) for r in isil_records[:50] # Test with first 50 ] # Simulate Dutch orgs CSV data (same institutions, later extraction) # In real scenario, would parse from voorbeeld_lijst_organisaties CSV # Set extraction dates extraction_date_isil = datetime(2025, 8, 1, 0, 0, 0, tzinfo=timezone.utc) extraction_date_orgs = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc) for inst in isil_institutions: inst.provenance.extraction_date = extraction_date_isil # Detect collisions within ISIL dataset collisions = collision_detector.detect_collisions(isil_institutions) # Should complete without errors assert isinstance(collisions, dict) class TestRealSyntheticQNumbers: """ Test synthetic Q-number generation for real institutions without Wikidata IDs Many smaller institutions in ISIL registry lack Wikidata entries. Verify synthetic Q-numbers are generated correctly. """ def test_synthetic_q_number_for_local_heritage_society( self, collision_detector: GHCIDCollisionDetector ): """Generate synthetic Q-number for institution without Wikidata""" # Real institution from ISIL registry: Heemkunde Vereniging Borne # No Wikidata ID available institutions = [ HeritageCustodian( id="hvb-001", name="Heemkunde Vereniging Borne", institution_type=InstitutionType.COLLECTING_SOCIETY, identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-BneHVB", identifier_url="https://isil.nl/NL-BneHVB" ) ], locations=[Location(city="Borne", country="NL")], provenance=Provenance( data_source=DataSource.ISIL_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc), extraction_method="ISILRegistryParser", confidence_score=1.0 ) ), HeritageCustodian( id="gemeente-borne-001", name="Gemeente Borne", institution_type=InstitutionType.OFFICIAL_INSTITUTION, identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-BneGB", identifier_url="https://isil.nl/NL-BneGB" ) ], locations=[Location(city="Borne", country="NL")], provenance=Provenance( data_source=DataSource.ISIL_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc), extraction_method="ISILRegistryParser", confidence_score=1.0 ) ) ] # Generate GHCIDs for institutions for inst in institutions: generate_ghcid_for_institution(inst) # Resolve collisions - both extracted same day (first batch) resolved = collision_detector.resolve_collisions(institutions) # Both should have GHCIDs assert all(inst.ghcid is not None for inst in resolved) # If collision occurred and no Wikidata, should have synthetic Q-numbers for inst in resolved: if inst.ghcid and '-Q' in inst.ghcid: # Extract Q-number q_part = inst.ghcid.split('-Q')[1] assert q_part.startswith('Q') assert q_part[1:].isdigit(), "Synthetic Q-number should be numeric" class TestRealDataQualityValidation: """ Validate data quality after collision resolution with real ISIL data Ensures: - All institutions retain valid ISIL codes - Provenance metadata preserved - GHCID history correctly tracks changes - No data loss during collision resolution """ def test_data_integrity_after_collision_resolution( self, isil_csv_path: Path, isil_parser: ISILRegistryParser, collision_detector: GHCIDCollisionDetector ): """Verify data integrity after resolving collisions""" # Parse sample of ISIL registry records = isil_parser.parse_file(isil_csv_path) institutions = [ isil_parser.to_heritage_custodian(r) for r in records[:30] # Test with first 30 ] # Set extraction date extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc) for inst in institutions: inst.provenance.extraction_date = extraction_date # Store original ISIL codes for verification original_isil_codes = { inst.id: [ ident.identifier_value for ident in (inst.identifiers or []) if ident.identifier_scheme == "ISIL" ][0] if inst.identifiers else None for inst in institutions } # Resolve collisions resolved = collision_detector.resolve_collisions(institutions) # Verify data integrity assert len(resolved) == len(institutions), "No institutions lost" for inst in resolved: # ISIL code preserved if inst.id in original_isil_codes and original_isil_codes[inst.id]: current_isil = [ ident.identifier_value for ident in (inst.identifiers or []) if ident.identifier_scheme == "ISIL" ] assert len(current_isil) > 0, f"ISIL code lost for {inst.name}" assert current_isil[0] == original_isil_codes[inst.id], "ISIL code changed" # Provenance preserved assert inst.provenance is not None assert inst.provenance.data_source == DataSource.ISIL_REGISTRY assert inst.provenance.data_tier == DataTier.TIER_1_AUTHORITATIVE # If GHCID assigned, should have history if inst.ghcid: assert inst.ghcid_numeric is not None, f"Missing ghcid_numeric for {inst.name}"