""" Tests for Institution Deduplicator Test coverage: - Name normalization - Match key generation - Tier-based priority selection - Metadata merging - Edge cases (missing fields, whitespace variants) """ from datetime import datetime, timezone import pytest from glam_extractor.models import ( DataSource, DataTier, DigitalPlatform, DigitalPlatformType, HeritageCustodian, Identifier, InstitutionType, Location, Provenance, ) from glam_extractor.parsers.deduplicator import InstitutionDeduplicator class TestNameNormalization: """Test name normalization logic""" def test_normalize_basic(self): """Test basic normalization (lowercase, strip)""" assert InstitutionDeduplicator.normalize_name("Rijksmuseum") == "rijksmuseum" assert InstitutionDeduplicator.normalize_name(" Amsterdam Museum ") == "amsterdam museum" def test_normalize_whitespace(self): """Test whitespace collapsing""" assert InstitutionDeduplicator.normalize_name("Museum van Loon") == "museum van loon" assert InstitutionDeduplicator.normalize_name("Museum\t\tName") == "museum name" def test_normalize_punctuation(self): """Test punctuation removal""" assert InstitutionDeduplicator.normalize_name("Museum (Amsterdam)") == "museum amsterdam" assert InstitutionDeduplicator.normalize_name("Rijksmuseum, Amsterdam") == "rijksmuseum amsterdam" assert InstitutionDeduplicator.normalize_name("Museum!") == "museum" def test_normalize_hyphens_preserved(self): """Test that hyphens in names are preserved""" assert InstitutionDeduplicator.normalize_name("Anne-Frank-Huis") == "anne-frank-huis" class TestMatchKeyGeneration: """Test match key generation""" @pytest.fixture def sample_institution(self): """Create sample institution for testing""" return HeritageCustodian( id="test-001", name="Rijksmuseum", institution_type=InstitutionType.MUSEUM, locations=[Location( city="Amsterdam", country="NL", location_type="primary", postal_code=None, street_address=None, region=None, latitude=None, longitude=None, geonames_id=None, is_primary=True )], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc), extraction_method="test", confidence_score=1.0, conversation_id=None, source_url=None, verified_date=None, verified_by=None ), organization_status=None, parent_organization=None, founded_date=None, closed_date=None, homepage=None, description=None, alternative_names=[], identifiers=[], digital_platforms=[], collections=[], change_history=[], ghcid_uuid=None, ghcid_uuid_sha256=None, record_id=None, ghcid_numeric=None, ghcid=None, ghcid_original=None, ghcid_history=None, contact_info=None ) def test_match_key_basic(self, sample_institution): """Test basic match key generation (name + city only, no type)""" key = InstitutionDeduplicator.get_match_key(sample_institution) assert key == "rijksmuseum|amsterdam" def test_match_key_missing_city(self, sample_institution): """Test match key generation when city is missing""" sample_institution.locations[0].city = None key = InstitutionDeduplicator.get_match_key(sample_institution) assert key is None def test_match_key_no_locations(self, sample_institution): """Test match key generation when locations list is empty""" sample_institution.locations = [] key = InstitutionDeduplicator.get_match_key(sample_institution) assert key is None def test_match_key_whitespace_variants(self, sample_institution): """Test that whitespace variants generate same key""" sample_institution.name = "Rijksmuseum Amsterdam" key1 = InstitutionDeduplicator.get_match_key(sample_institution) sample_institution.name = " Rijksmuseum Amsterdam " key2 = InstitutionDeduplicator.get_match_key(sample_institution) assert key1 == key2 class TestTierPriority: """Test data tier priority selection""" @pytest.fixture def create_institution(self): """Factory function to create institutions with different tiers""" def _create(name: str, tier: DataTier, num_identifiers: int = 0): identifiers = [ Identifier( identifier_scheme="ISIL", identifier_value=f"NL-{i}", identifier_url=None, assigned_date=None ) for i in range(num_identifiers) ] return HeritageCustodian( id=f"test-{name}", name=name, institution_type=InstitutionType.MUSEUM, locations=[Location( city="Amsterdam", country="NL", location_type="primary", postal_code=None, street_address=None, region=None, latitude=None, longitude=None, geonames_id=None, is_primary=True )], identifiers=identifiers, provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=tier, extraction_date=datetime.now(timezone.utc), extraction_method="test", confidence_score=1.0, conversation_id=None, source_url=None, verified_date=None, verified_by=None ), organization_status=None, parent_organization=None, founded_date=None, closed_date=None, homepage=None, description=None, alternative_names=[], digital_platforms=[], collections=[], change_history=[], ghcid_uuid=None, ghcid_uuid_sha256=None, record_id=None, ghcid_numeric=None, ghcid=None, ghcid_original=None, ghcid_history=None, contact_info=None ) return _create def test_tier_1_wins_over_tier_4(self, create_institution): """Test TIER_1 selected over TIER_4""" tier1_inst = create_institution("Museum A", DataTier.TIER_1_AUTHORITATIVE) tier4_inst = create_institution("Museum A", DataTier.TIER_4_INFERRED) primary = InstitutionDeduplicator.select_primary_record([tier4_inst, tier1_inst]) assert primary is tier1_inst def test_tier_2_wins_over_tier_3(self, create_institution): """Test TIER_2 selected over TIER_3""" tier2_inst = create_institution("Museum B", DataTier.TIER_2_VERIFIED) tier3_inst = create_institution("Museum B", DataTier.TIER_3_CROWD_SOURCED) primary = InstitutionDeduplicator.select_primary_record([tier3_inst, tier2_inst]) assert primary is tier2_inst def test_metadata_completeness_as_tiebreaker(self, create_institution): """Test that metadata completeness breaks tier ties""" inst_few_ids = create_institution("Museum C", DataTier.TIER_1_AUTHORITATIVE, num_identifiers=1) inst_many_ids = create_institution("Museum C", DataTier.TIER_1_AUTHORITATIVE, num_identifiers=5) primary = InstitutionDeduplicator.select_primary_record([inst_few_ids, inst_many_ids]) assert primary is inst_many_ids class TestMetadataMerging: """Test metadata merging from duplicates""" @pytest.fixture def create_institution_with_metadata(self): """Factory to create institutions with various metadata""" def _create( name: str, tier: DataTier, isil_code: str = None, platform_name: str = None, description: str = None ): identifiers = [] if isil_code: identifiers.append(Identifier( identifier_scheme="ISIL", identifier_value=isil_code, identifier_url=None, assigned_date=None )) platforms = [] if platform_name: platforms.append(DigitalPlatform( platform_name=platform_name, platform_url=None, platform_type=DigitalPlatformType.COLLECTION_MANAGEMENT, vendor=None, metadata_standards=None )) return HeritageCustodian( id=f"test-{name}", name=name, institution_type=InstitutionType.MUSEUM, locations=[Location( city="Amsterdam", country="NL", location_type="primary", postal_code=None, street_address=None, region=None, latitude=None, longitude=None, geonames_id=None, is_primary=True )], identifiers=identifiers, digital_platforms=platforms, description=description, provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=tier, extraction_date=datetime.now(timezone.utc), extraction_method="test", confidence_score=1.0, conversation_id=None, source_url=None, verified_date=None, verified_by=None ), organization_status=None, parent_organization=None, founded_date=None, closed_date=None, homepage=None, alternative_names=[], collections=[], change_history=[], ghcid_uuid=None, ghcid_uuid_sha256=None, record_id=None, ghcid_numeric=None, ghcid=None, ghcid_original=None, ghcid_history=None, contact_info=None ) return _create def test_merge_identifiers(self, create_institution_with_metadata): """Test merging unique identifiers from duplicates""" inst1 = create_institution_with_metadata( "Museum", DataTier.TIER_1_AUTHORITATIVE, isil_code="NL-001" ) inst2 = create_institution_with_metadata( "Museum", DataTier.TIER_4_INFERRED, isil_code="NL-002" ) merged = InstitutionDeduplicator.merge_metadata(inst1, [inst1, inst2]) assert len(merged.identifiers) == 2 isil_codes = {i.identifier_value for i in merged.identifiers} assert isil_codes == {"NL-001", "NL-002"} def test_merge_platforms(self, create_institution_with_metadata): """Test merging unique platforms from duplicates""" inst1 = create_institution_with_metadata( "Museum", DataTier.TIER_1_AUTHORITATIVE, platform_name="Atlantis" ) inst2 = create_institution_with_metadata( "Museum", DataTier.TIER_4_INFERRED, platform_name="MAIS" ) merged = InstitutionDeduplicator.merge_metadata(inst1, [inst1, inst2]) assert len(merged.digital_platforms) == 2 platform_names = {p.platform_name for p in merged.digital_platforms} assert platform_names == {"Atlantis", "MAIS"} def test_merge_descriptions(self, create_institution_with_metadata): """Test merging unique descriptions""" inst1 = create_institution_with_metadata( "Museum", DataTier.TIER_1_AUTHORITATIVE, description="Note A" ) inst2 = create_institution_with_metadata( "Museum", DataTier.TIER_4_INFERRED, description="Note B" ) merged = InstitutionDeduplicator.merge_metadata(inst1, [inst1, inst2]) assert "Note A" in merged.description assert "Note B" in merged.description def test_no_duplicate_identifiers(self, create_institution_with_metadata): """Test that duplicate identifiers are not added twice""" inst1 = create_institution_with_metadata( "Museum", DataTier.TIER_1_AUTHORITATIVE, isil_code="NL-001" ) inst2 = create_institution_with_metadata( "Museum", DataTier.TIER_4_INFERRED, isil_code="NL-001" ) merged = InstitutionDeduplicator.merge_metadata(inst1, [inst1, inst2]) # Should only have one NL-001 identifier assert len(merged.identifiers) == 1 assert merged.identifiers[0].identifier_value == "NL-001" class TestTypeResolution: """Test institution type resolution logic""" @pytest.fixture def create_institution_with_type(self): """Factory to create institutions with specific types""" def _create(name: str, inst_type: InstitutionType, tier: DataTier): return HeritageCustodian( id=f"test-{name}-{inst_type}", name=name, institution_type=inst_type, locations=[Location( city="Amsterdam", country="NL", location_type="primary", postal_code=None, street_address=None, region=None, latitude=None, longitude=None, geonames_id=None, is_primary=True )], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=tier, extraction_date=datetime.now(timezone.utc), extraction_method="test", confidence_score=1.0, conversation_id=None, source_url=None, verified_date=None, verified_by=None ), organization_status=None, parent_organization=None, founded_date=None, closed_date=None, homepage=None, description=None, alternative_names=[], identifiers=[], digital_platforms=[], collections=[], change_history=[], ghcid_uuid=None, ghcid_uuid_sha256=None, record_id=None, ghcid_numeric=None, ghcid=None, ghcid_original=None, ghcid_history=None, contact_info=None ) return _create def test_resolve_type_all_match(self, create_institution_with_type): """Test type resolution when all types match""" records = [ create_institution_with_type("Museum", InstitutionType.MUSEUM, DataTier.TIER_1_AUTHORITATIVE), create_institution_with_type("Museum", InstitutionType.MUSEUM, DataTier.TIER_4_INFERRED) ] resolved = InstitutionDeduplicator.resolve_institution_type(records) assert resolved == InstitutionType.MUSEUM def test_resolve_type_mixed_vs_specific(self, create_institution_with_type): """Test type resolution: MIXED + MUSEUM → MUSEUM (prefer specific)""" records = [ create_institution_with_type("Museum", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE), create_institution_with_type("Museum", InstitutionType.MUSEUM, DataTier.TIER_4_INFERRED) ] resolved = InstitutionDeduplicator.resolve_institution_type(records) assert resolved == InstitutionType.MUSEUM def test_resolve_type_mixed_vs_library(self, create_institution_with_type): """Test type resolution: MIXED + LIBRARY → LIBRARY (prefer specific)""" records = [ create_institution_with_type("Library", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE), create_institution_with_type("Library", InstitutionType.LIBRARY, DataTier.TIER_4_INFERRED) ] resolved = InstitutionDeduplicator.resolve_institution_type(records) assert resolved == InstitutionType.LIBRARY def test_resolve_type_conflicting_use_tier(self, create_institution_with_type): """Test type resolution: conflicting types (MUSEUM vs LIBRARY) → use highest tier""" records = [ create_institution_with_type("Org", InstitutionType.MUSEUM, DataTier.TIER_1_AUTHORITATIVE), create_institution_with_type("Org", InstitutionType.LIBRARY, DataTier.TIER_4_INFERRED) ] resolved = InstitutionDeduplicator.resolve_institution_type(records) # Should use MUSEUM because TIER_1 > TIER_4 assert resolved == InstitutionType.MUSEUM def test_resolve_type_all_mixed(self, create_institution_with_type): """Test type resolution: all MIXED → return MIXED""" records = [ create_institution_with_type("Org", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE), create_institution_with_type("Org", InstitutionType.MIXED, DataTier.TIER_4_INFERRED) ] resolved = InstitutionDeduplicator.resolve_institution_type(records) assert resolved == InstitutionType.MIXED class TestDeduplication: """Test full deduplication workflow""" @pytest.fixture def create_duplicate_set(self): """Create set of institutions with duplicates""" def _create(): institutions = [] # Duplicate group 1: Rijksmuseum (3 instances, different tiers) for i, tier in enumerate([ DataTier.TIER_1_AUTHORITATIVE, DataTier.TIER_4_INFERRED, DataTier.TIER_4_INFERRED ]): institutions.append(HeritageCustodian( id=f"rijks-{i}", name="Rijksmuseum" if i == 0 else " Rijksmuseum ", # Test whitespace institution_type=InstitutionType.MUSEUM, locations=[Location( city="Amsterdam", country="NL", location_type="primary", postal_code=None, street_address=None, region=None, latitude=None, longitude=None, geonames_id=None, is_primary=True )], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=tier, extraction_date=datetime.now(timezone.utc), extraction_method="test", confidence_score=1.0, conversation_id=None, source_url=None, verified_date=None, verified_by=None ), organization_status=None, parent_organization=None, founded_date=None, closed_date=None, homepage=None, description=None, alternative_names=[], identifiers=[], digital_platforms=[], collections=[], change_history=[], ghcid_uuid=None, ghcid_uuid_sha256=None, record_id=None, ghcid_numeric=None, ghcid=None, ghcid_original=None, ghcid_history=None, contact_info=None )) # Duplicate group 2: Van Gogh Museum (2 instances) for i in range(2): institutions.append(HeritageCustodian( id=f"vg-{i}", name="Van Gogh Museum", institution_type=InstitutionType.MUSEUM, locations=[Location( city="Amsterdam", country="NL", location_type="primary", postal_code=None, street_address=None, region=None, latitude=None, longitude=None, geonames_id=None, is_primary=True )], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc), extraction_method="test", confidence_score=1.0, conversation_id=None, source_url=None, verified_date=None, verified_by=None ), organization_status=None, parent_organization=None, founded_date=None, closed_date=None, homepage=None, description=None, alternative_names=[], identifiers=[], digital_platforms=[], collections=[], change_history=[], ghcid_uuid=None, ghcid_uuid_sha256=None, record_id=None, ghcid_numeric=None, ghcid=None, ghcid_original=None, ghcid_history=None, contact_info=None )) # Unique institution: Stedelijk Museum institutions.append(HeritageCustodian( id="stedelijk-1", name="Stedelijk Museum", institution_type=InstitutionType.MUSEUM, locations=[Location( city="Amsterdam", country="NL", location_type="primary", postal_code=None, street_address=None, region=None, latitude=None, longitude=None, geonames_id=None, is_primary=True )], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc), extraction_method="test", confidence_score=1.0, conversation_id=None, source_url=None, verified_date=None, verified_by=None ), organization_status=None, parent_organization=None, founded_date=None, closed_date=None, homepage=None, description=None, alternative_names=[], identifiers=[], digital_platforms=[], collections=[], change_history=[], ghcid_uuid=None, ghcid_uuid_sha256=None, record_id=None, ghcid_numeric=None, ghcid=None, ghcid_original=None, ghcid_history=None, contact_info=None )) return institutions return _create def test_deduplicate_basic(self, create_duplicate_set): """Test basic deduplication workflow""" institutions = create_duplicate_set() deduplicator = InstitutionDeduplicator() unique = deduplicator.deduplicate(institutions) # 6 input institutions: 3 Rijks + 2 Van Gogh + 1 Stedelijk # Should dedupe to: 1 Rijks + 1 Van Gogh + 1 Stedelijk = 3 unique assert len(unique) == 3 assert deduplicator.duplicates_removed == 3 # (3-1) + (2-1) + (1-1) = 3 assert len(deduplicator.duplicate_groups) == 2 # Rijks and Van Gogh def test_deduplicate_stats(self, create_duplicate_set): """Test deduplication statistics""" institutions = create_duplicate_set() deduplicator = InstitutionDeduplicator() deduplicator.deduplicate(institutions) assert deduplicator.records_processed == 6 assert len(deduplicator.duplicate_groups) == 2 # Check duplicate group sizes group_sizes = [len(group) for group in deduplicator.duplicate_groups] assert sorted(group_sizes) == [2, 3] def test_deduplicate_report(self, create_duplicate_set): """Test deduplication report generation""" institutions = create_duplicate_set() deduplicator = InstitutionDeduplicator() deduplicator.deduplicate(institutions) report = deduplicator.get_deduplication_report() assert "Records processed: 6" in report assert "Duplicates removed: 3" in report assert "Rijksmuseum" in report assert "Van Gogh Museum" in report def test_deduplicate_without_metadata_merge(self, create_duplicate_set): """Test deduplication without metadata merging""" institutions = create_duplicate_set() deduplicator = InstitutionDeduplicator() unique = deduplicator.deduplicate(institutions, merge_metadata=False) # Should still deduplicate, just without merging assert len(unique) == 3 class TestEdgeCases: """Test edge cases discovered during Dutch dataset analysis""" def test_rotterdam_name_change_via_isil(self): """ Test Rotterdam case: Het Nieuwe Instituut → Nieuwe Instituut Same ISIL code → should merge as name change """ record1 = HeritageCustodian( id="test-rotterdam-1", name="Het Nieuwe Instituut", institution_type=InstitutionType.MUSEUM, locations=[Location(city="Rotterdam", country="NL")], identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-RtHNI" ) ], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) record2 = HeritageCustodian( id="test-rotterdam-2", name="Nieuwe Instituut", institution_type=InstitutionType.MUSEUM, locations=[Location(city="Rotterdam", country="NL")], identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-RtHNI" # SAME ISIL ) ], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) deduplicator = InstitutionDeduplicator() result = deduplicator.deduplicate([record1, record2]) # Should merge to one record (same ISIL code) assert len(result) == 1 # Should store old name in alternative_names assert result[0].alternative_names is not None assert len(result[0].alternative_names) > 0 # Check that both names are represented names = {result[0].name} | set(result[0].alternative_names) assert "Het Nieuwe Instituut" in names assert "Nieuwe Instituut" in names def test_losser_article_normalization(self): """ Test Losser case: de Historische Kring Losser vs Historische Kring Losser Dutch article "de" should be stripped for matching """ record1 = HeritageCustodian( id="test-losser-1", name="de Historische Kring Losser", institution_type=InstitutionType.COLLECTING_SOCIETY, locations=[Location(city="Losser", country="NL")], identifiers=[ Identifier( identifier_scheme="ISIL", identifier_value="NL-LsHKL" ) ], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) record2 = HeritageCustodian( id="test-losser-2", name="Historische Kring Losser", institution_type=InstitutionType.COLLECTING_SOCIETY, locations=[Location(city="Losser", country="NL")], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) deduplicator = InstitutionDeduplicator() result = deduplicator.deduplicate([record1, record2]) # Should merge (articles ignored in matching) assert len(result) == 1 # ISIL version should win (TIER_1 with more metadata) assert result[0].name == "de Historische Kring Losser" # Other name should be in alternatives assert result[0].alternative_names is not None assert "Historische Kring Losser" in result[0].alternative_names def test_article_normalization_het(self): """Test that 'het' article is also stripped""" record1 = HeritageCustodian( id="test-het-1", name="Het Gemeentearchief", institution_type=InstitutionType.ARCHIVE, locations=[Location(city="Amsterdam", country="NL")], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) record2 = HeritageCustodian( id="test-het-2", name="Gemeentearchief", institution_type=InstitutionType.ARCHIVE, locations=[Location(city="Amsterdam", country="NL")], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) deduplicator = InstitutionDeduplicator() result = deduplicator.deduplicate([record1, record2]) # Should merge (articles ignored) assert len(result) == 1 def test_municipality_vs_archive_not_merged(self): """ Test that Gemeente X and Gemeentearchief X are NOT merged (different institutions despite similar names) """ gemeente = HeritageCustodian( id="test-borne-gemeente", name="Gemeente Borne", institution_type=InstitutionType.OFFICIAL_INSTITUTION, locations=[Location(city="Borne", country="NL")], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) archief = HeritageCustodian( id="test-borne-archief", name="Gemeentearchief Borne", institution_type=InstitutionType.ARCHIVE, locations=[Location(city="Borne", country="NL")], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) deduplicator = InstitutionDeduplicator() result = deduplicator.deduplicate([gemeente, archief]) # Should NOT merge (different institutions) assert len(result) == 2 def test_isil_helper_methods(self): """Test ISIL code extraction and comparison helpers""" institution_with_isil = HeritageCustodian( id="test-isil-1", name="Test Museum", institution_type=InstitutionType.MUSEUM, locations=[Location(city="Amsterdam", country="NL")], identifiers=[ Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdTM"), Identifier(identifier_scheme="Wikidata", identifier_value="Q12345") ], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) institution_without_isil = HeritageCustodian( id="test-isil-2", name="Test Museum 2", institution_type=InstitutionType.MUSEUM, locations=[Location(city="Amsterdam", country="NL")], identifiers=[ Identifier(identifier_scheme="Wikidata", identifier_value="Q67890") ], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) # Test ISIL extraction assert InstitutionDeduplicator.get_isil_code(institution_with_isil) == "NL-AsdTM" assert InstitutionDeduplicator.get_isil_code(institution_without_isil) is None # Test ISIL comparison same_isil = HeritageCustodian( id="test-isil-3", name="Test Museum (different name)", institution_type=InstitutionType.MUSEUM, locations=[Location(city="Amsterdam", country="NL")], identifiers=[ Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdTM") ], provenance=Provenance( data_source=DataSource.CSV_REGISTRY, data_tier=DataTier.TIER_1_AUTHORITATIVE, extraction_date=datetime.now(timezone.utc) ) ) assert InstitutionDeduplicator.has_same_isil(institution_with_isil, same_isil) is True assert InstitutionDeduplicator.has_same_isil(institution_with_isil, institution_without_isil) is False