glam/tests/parsers/test_deduplicator.py
2025-12-05 15:30:23 +01:00

932 lines
37 KiB
Python

"""
Tests for Institution Deduplicator
Test coverage:
- Name normalization
- Match key generation
- Tier-based priority selection
- Metadata merging
- Edge cases (missing fields, whitespace variants)
"""
from datetime import datetime, timezone
import pytest
from glam_extractor.models import (
DataSource,
DataTier,
DigitalPlatform,
DigitalPlatformType,
HeritageCustodian,
Identifier,
InstitutionType,
Location,
Provenance,
)
from glam_extractor.parsers.deduplicator import InstitutionDeduplicator
class TestNameNormalization:
    """Test name normalization logic.

    Every test feeds a raw institution name to
    ``InstitutionDeduplicator.normalize_name`` and pins the normalized string.
    """

    def test_normalize_basic(self):
        """Test basic normalization (lowercase, strip)."""
        assert InstitutionDeduplicator.normalize_name("Rijksmuseum") == "rijksmuseum"
        assert InstitutionDeduplicator.normalize_name(" Amsterdam Museum ") == "amsterdam museum"

    def test_normalize_whitespace(self):
        """Test whitespace collapsing."""
        # Already-normalized spacing must pass through unchanged.
        assert InstitutionDeduplicator.normalize_name("Museum van Loon") == "museum van loon"
        # Runs of plain spaces must collapse to one space, just like the tab
        # run below; without this case only tabs exercised the collapsing.
        assert InstitutionDeduplicator.normalize_name("Museum  van   Loon") == "museum van loon"
        assert InstitutionDeduplicator.normalize_name("Museum\t\tName") == "museum name"

    def test_normalize_punctuation(self):
        """Test punctuation removal."""
        assert InstitutionDeduplicator.normalize_name("Museum (Amsterdam)") == "museum amsterdam"
        assert InstitutionDeduplicator.normalize_name("Rijksmuseum, Amsterdam") == "rijksmuseum amsterdam"
        assert InstitutionDeduplicator.normalize_name("Museum!") == "museum"

    def test_normalize_hyphens_preserved(self):
        """Test that hyphens in names are preserved."""
        assert InstitutionDeduplicator.normalize_name("Anne-Frank-Huis") == "anne-frank-huis"
class TestMatchKeyGeneration:
    """Checks for the match key produced by ``InstitutionDeduplicator.get_match_key``."""

    @pytest.fixture
    def sample_institution(self):
        """Provide a TIER_1 "Rijksmuseum" museum record located in Amsterdam."""
        amsterdam = Location(
            city="Amsterdam",
            country="NL",
            location_type="primary",
            postal_code=None,
            street_address=None,
            region=None,
            latitude=None,
            longitude=None,
            geonames_id=None,
            is_primary=True,
        )
        registry_provenance = Provenance(
            data_source=DataSource.CSV_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=datetime.now(timezone.utc),
            extraction_method="test",
            confidence_score=1.0,
            conversation_id=None,
            source_url=None,
            verified_date=None,
            verified_by=None,
        )
        return HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-001",
            name="Rijksmuseum",
            institution_type=InstitutionType.MUSEUM,
            locations=[amsterdam],
            provenance=registry_provenance,
            organization_status=None,
            parent_organization=None,
            founded_date=None,
            closed_date=None,
            homepage=None,
            description=None,
            alternative_names=[],
            identifiers=[],
            digital_platforms=[],
            collections=[],
            change_history=[],
            ghcid_uuid=None,
            ghcid_uuid_sha256=None,
            record_id=None,
            ghcid_numeric=None,
            ghcid_current=None,
            ghcid_original=None,
            ghcid_history=None,
            contact_info=None,
        )

    def test_match_key_basic(self, sample_institution):
        """The key is built from name + city only (no institution type)."""
        match_key = InstitutionDeduplicator.get_match_key(sample_institution)
        assert match_key == "rijksmuseum|amsterdam"

    def test_match_key_missing_city(self, sample_institution):
        """A record whose first location has no city yields no key."""
        primary_location = sample_institution.locations[0]
        primary_location.city = None
        assert InstitutionDeduplicator.get_match_key(sample_institution) is None

    def test_match_key_no_locations(self, sample_institution):
        """A record with an empty locations list yields no key."""
        sample_institution.locations = []
        assert InstitutionDeduplicator.get_match_key(sample_institution) is None

    def test_match_key_whitespace_variants(self, sample_institution):
        """Whitespace variants of the same name must produce the same key."""
        produced_keys = []
        for variant in ("Rijksmuseum Amsterdam", " Rijksmuseum Amsterdam "):
            sample_institution.name = variant
            produced_keys.append(InstitutionDeduplicator.get_match_key(sample_institution))
        plain_key, padded_key = produced_keys
        assert plain_key == padded_key
class TestTierPriority:
    """Checks how ``select_primary_record`` chooses among duplicate records."""

    @pytest.fixture
    def create_institution(self):
        """Return a factory that builds Amsterdam museum records for a given tier.

        The factory takes the record name, its ``DataTier`` and an optional
        number of ISIL identifiers (``NL-0``, ``NL-1``, ...) to attach.
        """

        def _build(name: str, tier: DataTier, num_identifiers: int = 0):
            import re

            # Reduce the name to a lowercase, hyphen-separated slug so it can
            # be embedded in the record's URI-style id.
            slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')

            isil_identifiers = []
            for index in range(num_identifiers):
                isil_identifiers.append(
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value=f"NL-{index}",
                        identifier_url=None,
                        assigned_date=None,
                    )
                )

            amsterdam = Location(
                city="Amsterdam",
                country="NL",
                location_type="primary",
                postal_code=None,
                street_address=None,
                region=None,
                latitude=None,
                longitude=None,
                geonames_id=None,
                is_primary=True,
            )
            tiered_provenance = Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=tier,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="test",
                confidence_score=1.0,
                conversation_id=None,
                source_url=None,
                verified_date=None,
                verified_by=None,
            )
            return HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/nl/test-{slug}",
                name=name,
                institution_type=InstitutionType.MUSEUM,
                locations=[amsterdam],
                identifiers=isil_identifiers,
                provenance=tiered_provenance,
                organization_status=None,
                parent_organization=None,
                founded_date=None,
                closed_date=None,
                homepage=None,
                description=None,
                alternative_names=[],
                digital_platforms=[],
                collections=[],
                change_history=[],
                ghcid_uuid=None,
                ghcid_uuid_sha256=None,
                record_id=None,
                ghcid_numeric=None,
                ghcid_current=None,
                ghcid_original=None,
                ghcid_history=None,
                contact_info=None,
            )

        return _build

    def test_tier_1_wins_over_tier_4(self, create_institution):
        """A TIER_1 record is chosen over a TIER_4 record."""
        authoritative = create_institution("Museum A", DataTier.TIER_1_AUTHORITATIVE)
        inferred = create_institution("Museum A", DataTier.TIER_4_INFERRED)

        # The lower-priority record is listed first on purpose.
        chosen = InstitutionDeduplicator.select_primary_record([inferred, authoritative])

        assert chosen is authoritative

    def test_tier_2_wins_over_tier_3(self, create_institution):
        """A TIER_2 record is chosen over a TIER_3 record."""
        verified = create_institution("Museum B", DataTier.TIER_2_VERIFIED)
        crowd_sourced = create_institution("Museum B", DataTier.TIER_3_CROWD_SOURCED)

        chosen = InstitutionDeduplicator.select_primary_record([crowd_sourced, verified])

        assert chosen is verified

    def test_metadata_completeness_as_tiebreaker(self, create_institution):
        """With equal tiers, the record carrying more identifiers is chosen."""
        sparse = create_institution("Museum C", DataTier.TIER_1_AUTHORITATIVE, num_identifiers=1)
        rich = create_institution("Museum C", DataTier.TIER_1_AUTHORITATIVE, num_identifiers=5)

        chosen = InstitutionDeduplicator.select_primary_record([sparse, rich])

        assert chosen is rich
class TestMetadataMerging:
    """Checks that ``merge_metadata`` combines metadata from duplicate records."""

    @pytest.fixture
    def create_institution_with_metadata(self):
        """Return a factory for Amsterdam museum records with optional metadata.

        The factory accepts an optional ISIL code, digital platform name and
        description; omitted (or falsy) ISIL/platform values leave the
        corresponding list empty.
        """

        def _build(
            name: str,
            tier: DataTier,
            isil_code: str = None,
            platform_name: str = None,
            description: str = None
        ):
            import re

            # Lowercase, hyphen-separated slug for the URI-style record id.
            slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')

            isil_identifiers = [
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value=isil_code,
                    identifier_url=None,
                    assigned_date=None,
                )
            ] if isil_code else []

            platform_records = [
                DigitalPlatform(
                    platform_name=platform_name,
                    platform_url=None,
                    platform_type=DigitalPlatformType.COLLECTION_MANAGEMENT,
                    vendor=None,
                    # The field is named implemented_standards
                    # (previously passed as metadata_standards).
                    implemented_standards=None,
                )
            ] if platform_name else []

            amsterdam = Location(
                city="Amsterdam",
                country="NL",
                location_type="primary",
                postal_code=None,
                street_address=None,
                region=None,
                latitude=None,
                longitude=None,
                geonames_id=None,
                is_primary=True,
            )
            tiered_provenance = Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=tier,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="test",
                confidence_score=1.0,
                conversation_id=None,
                source_url=None,
                verified_date=None,
                verified_by=None,
            )
            return HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/nl/test-{slug}",
                name=name,
                institution_type=InstitutionType.MUSEUM,
                locations=[amsterdam],
                identifiers=isil_identifiers,
                digital_platforms=platform_records,
                description=description,
                provenance=tiered_provenance,
                organization_status=None,
                parent_organization=None,
                founded_date=None,
                closed_date=None,
                homepage=None,
                alternative_names=[],
                collections=[],
                change_history=[],
                ghcid_uuid=None,
                ghcid_uuid_sha256=None,
                record_id=None,
                ghcid_numeric=None,
                ghcid_current=None,
                ghcid_original=None,
                ghcid_history=None,
                contact_info=None,
            )

        return _build

    def test_merge_identifiers(self, create_institution_with_metadata):
        """Distinct identifiers from both duplicates end up on the merged record."""
        primary = create_institution_with_metadata(
            "Museum", DataTier.TIER_1_AUTHORITATIVE, isil_code="NL-001"
        )
        secondary = create_institution_with_metadata(
            "Museum", DataTier.TIER_4_INFERRED, isil_code="NL-002"
        )

        merged = InstitutionDeduplicator.merge_metadata(primary, [primary, secondary])

        assert len(merged.identifiers) == 2
        merged_values = {identifier.identifier_value for identifier in merged.identifiers}
        assert merged_values == {"NL-001", "NL-002"}

    def test_merge_platforms(self, create_institution_with_metadata):
        """Distinct digital platforms from both duplicates are merged."""
        primary = create_institution_with_metadata(
            "Museum", DataTier.TIER_1_AUTHORITATIVE, platform_name="Atlantis"
        )
        secondary = create_institution_with_metadata(
            "Museum", DataTier.TIER_4_INFERRED, platform_name="MAIS"
        )

        merged = InstitutionDeduplicator.merge_metadata(primary, [primary, secondary])

        assert len(merged.digital_platforms) == 2
        merged_names = {platform.platform_name for platform in merged.digital_platforms}
        assert merged_names == {"Atlantis", "MAIS"}

    def test_merge_descriptions(self, create_institution_with_metadata):
        """Both duplicates' descriptions appear in the merged description."""
        primary = create_institution_with_metadata(
            "Museum", DataTier.TIER_1_AUTHORITATIVE, description="Note A"
        )
        secondary = create_institution_with_metadata(
            "Museum", DataTier.TIER_4_INFERRED, description="Note B"
        )

        merged = InstitutionDeduplicator.merge_metadata(primary, [primary, secondary])

        for expected_note in ("Note A", "Note B"):
            assert expected_note in merged.description

    def test_no_duplicate_identifiers(self, create_institution_with_metadata):
        """An identifier shared by both duplicates is kept only once."""
        primary = create_institution_with_metadata(
            "Museum", DataTier.TIER_1_AUTHORITATIVE, isil_code="NL-001"
        )
        secondary = create_institution_with_metadata(
            "Museum", DataTier.TIER_4_INFERRED, isil_code="NL-001"
        )

        merged = InstitutionDeduplicator.merge_metadata(primary, [primary, secondary])

        # Exactly one NL-001 identifier must survive the merge.
        assert len(merged.identifiers) == 1
        sole_identifier = merged.identifiers[0]
        assert sole_identifier.identifier_value == "NL-001"
class TestTypeResolution:
    """Checks for ``InstitutionDeduplicator.resolve_institution_type``."""

    @pytest.fixture
    def create_institution_with_type(self):
        """Return a factory building Amsterdam records of a given type and tier."""

        def _build(name: str, inst_type: InstitutionType, tier: DataTier):
            import re

            # The slug combines name and type so records that differ only in
            # type still receive distinct URI-style ids.
            raw_label = f"{name}-{inst_type}".lower()
            slug = re.sub(r'[^a-z0-9]+', '-', raw_label).strip('-')

            amsterdam = Location(
                city="Amsterdam",
                country="NL",
                location_type="primary",
                postal_code=None,
                street_address=None,
                region=None,
                latitude=None,
                longitude=None,
                geonames_id=None,
                is_primary=True,
            )
            tiered_provenance = Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=tier,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="test",
                confidence_score=1.0,
                conversation_id=None,
                source_url=None,
                verified_date=None,
                verified_by=None,
            )
            return HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/nl/test-{slug}",
                name=name,
                institution_type=inst_type,
                locations=[amsterdam],
                provenance=tiered_provenance,
                organization_status=None,
                parent_organization=None,
                founded_date=None,
                closed_date=None,
                homepage=None,
                description=None,
                alternative_names=[],
                identifiers=[],
                digital_platforms=[],
                collections=[],
                change_history=[],
                ghcid_uuid=None,
                ghcid_uuid_sha256=None,
                record_id=None,
                ghcid_numeric=None,
                ghcid_current=None,
                ghcid_original=None,
                ghcid_history=None,
                contact_info=None,
            )

        return _build

    def test_resolve_type_all_match(self, create_institution_with_type):
        """Identical types on every record resolve to that type."""
        make = create_institution_with_type
        candidates = [
            make("Museum", InstitutionType.MUSEUM, DataTier.TIER_1_AUTHORITATIVE),
            make("Museum", InstitutionType.MUSEUM, DataTier.TIER_4_INFERRED),
        ]

        outcome = InstitutionDeduplicator.resolve_institution_type(candidates)

        assert str(outcome) == 'MUSEUM'

    def test_resolve_type_mixed_vs_specific(self, create_institution_with_type):
        """MIXED + MUSEUM resolves to MUSEUM (the specific type is preferred)."""
        make = create_institution_with_type
        candidates = [
            make("Museum", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE),
            make("Museum", InstitutionType.MUSEUM, DataTier.TIER_4_INFERRED),
        ]

        outcome = InstitutionDeduplicator.resolve_institution_type(candidates)

        assert str(outcome) == 'MUSEUM'

    def test_resolve_type_mixed_vs_library(self, create_institution_with_type):
        """MIXED + LIBRARY resolves to LIBRARY (the specific type is preferred)."""
        make = create_institution_with_type
        candidates = [
            make("Library", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE),
            make("Library", InstitutionType.LIBRARY, DataTier.TIER_4_INFERRED),
        ]

        outcome = InstitutionDeduplicator.resolve_institution_type(candidates)

        assert str(outcome) == 'LIBRARY'

    def test_resolve_type_conflicting_use_tier(self, create_institution_with_type):
        """Conflicting specific types (MUSEUM vs LIBRARY) defer to the highest tier."""
        make = create_institution_with_type
        candidates = [
            make("Org", InstitutionType.MUSEUM, DataTier.TIER_1_AUTHORITATIVE),
            make("Org", InstitutionType.LIBRARY, DataTier.TIER_4_INFERRED),
        ]

        outcome = InstitutionDeduplicator.resolve_institution_type(candidates)

        # MUSEUM is expected because it comes from the TIER_1 record.
        assert str(outcome) == 'MUSEUM'

    def test_resolve_type_all_mixed(self, create_institution_with_type):
        """When every record is MIXED the result stays MIXED."""
        make = create_institution_with_type
        candidates = [
            make("Org", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE),
            make("Org", InstitutionType.MIXED, DataTier.TIER_4_INFERRED),
        ]

        outcome = InstitutionDeduplicator.resolve_institution_type(candidates)

        assert str(outcome) == 'MIXED'
class TestDeduplication:
    """Test full deduplication workflow.

    All tests run ``InstitutionDeduplicator.deduplicate`` over the same
    six-record data set: three "Rijksmuseum" records, two "Van Gogh Museum"
    records and one unique "Stedelijk Museum" record, all in Amsterdam.
    """

    @pytest.fixture
    def create_duplicate_set(self):
        """Return a zero-argument factory producing the six-record data set."""
        id_prefix = "https://w3id.org/heritage/custodian/nl"

        def _museum(record_id: str, name: str, tier: DataTier):
            """Build one Amsterdam museum record; only id, name and tier vary."""
            return HeritageCustodian(
                id=record_id,
                name=name,
                institution_type=InstitutionType.MUSEUM,
                locations=[Location(
                    city="Amsterdam",
                    country="NL",
                    location_type="primary",
                    postal_code=None,
                    street_address=None,
                    region=None,
                    latitude=None,
                    longitude=None,
                    geonames_id=None,
                    is_primary=True,
                )],
                provenance=Provenance(
                    data_source=DataSource.CSV_REGISTRY,
                    data_tier=tier,
                    extraction_date=datetime.now(timezone.utc),
                    extraction_method="test",
                    confidence_score=1.0,
                    conversation_id=None,
                    source_url=None,
                    verified_date=None,
                    verified_by=None,
                ),
                organization_status=None,
                parent_organization=None,
                founded_date=None,
                closed_date=None,
                homepage=None,
                description=None,
                alternative_names=[],
                identifiers=[],
                digital_platforms=[],
                collections=[],
                change_history=[],
                ghcid_uuid=None,
                ghcid_uuid_sha256=None,
                record_id=None,
                ghcid_numeric=None,
                ghcid_current=None,
                ghcid_original=None,
                ghcid_history=None,
                contact_info=None,
            )

        def _create():
            # Duplicate group 1: Rijksmuseum (3 instances, different tiers).
            # Only the first record has the clean name; the others carry
            # surrounding whitespace to exercise name normalization.
            rijks_tiers = [
                DataTier.TIER_1_AUTHORITATIVE,
                DataTier.TIER_4_INFERRED,
                DataTier.TIER_4_INFERRED,
            ]
            institutions = [
                _museum(
                    f"{id_prefix}/rijks-{i}",
                    "Rijksmuseum" if i == 0 else " Rijksmuseum ",
                    tier,
                )
                for i, tier in enumerate(rijks_tiers)
            ]

            # Duplicate group 2: Van Gogh Museum (2 identical instances).
            institutions.extend(
                _museum(
                    f"{id_prefix}/vg-{i}",
                    "Van Gogh Museum",
                    DataTier.TIER_1_AUTHORITATIVE,
                )
                for i in range(2)
            )

            # Unique institution: Stedelijk Museum.
            institutions.append(
                _museum(
                    f"{id_prefix}/stedelijk-1",
                    "Stedelijk Museum",
                    DataTier.TIER_1_AUTHORITATIVE,
                )
            )
            return institutions

        return _create

    def test_deduplicate_basic(self, create_duplicate_set):
        """Test basic deduplication workflow."""
        institutions = create_duplicate_set()
        deduplicator = InstitutionDeduplicator()
        unique = deduplicator.deduplicate(institutions)
        # 6 input institutions: 3 Rijks + 2 Van Gogh + 1 Stedelijk
        # Should dedupe to: 1 Rijks + 1 Van Gogh + 1 Stedelijk = 3 unique
        assert len(unique) == 3
        assert deduplicator.duplicates_removed == 3  # (3-1) + (2-1) + (1-1) = 3
        assert len(deduplicator.duplicate_groups) == 2  # Rijks and Van Gogh

    def test_deduplicate_stats(self, create_duplicate_set):
        """Test deduplication statistics."""
        institutions = create_duplicate_set()
        deduplicator = InstitutionDeduplicator()
        deduplicator.deduplicate(institutions)
        assert deduplicator.records_processed == 6
        assert len(deduplicator.duplicate_groups) == 2
        # Check duplicate group sizes (order of groups is not asserted).
        group_sizes = [len(group) for group in deduplicator.duplicate_groups]
        assert sorted(group_sizes) == [2, 3]

    def test_deduplicate_report(self, create_duplicate_set):
        """Test deduplication report generation."""
        institutions = create_duplicate_set()
        deduplicator = InstitutionDeduplicator()
        deduplicator.deduplicate(institutions)
        report = deduplicator.get_deduplication_report()
        assert "Records processed: 6" in report
        assert "Duplicates removed: 3" in report
        assert "Rijksmuseum" in report
        assert "Van Gogh Museum" in report

    def test_deduplicate_without_metadata_merge(self, create_duplicate_set):
        """Test deduplication without metadata merging."""
        institutions = create_duplicate_set()
        deduplicator = InstitutionDeduplicator()
        unique = deduplicator.deduplicate(institutions, merge_metadata=False)
        # Should still deduplicate, just without merging
        assert len(unique) == 3
class TestEdgeCases:
    """Edge cases discovered during Dutch dataset analysis."""

    @staticmethod
    def _tier1_provenance():
        """Build a fresh CSV-registry, TIER_1 provenance stamped with the current UTC time."""
        return Provenance(
            data_source=DataSource.CSV_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=datetime.now(timezone.utc),
        )

    def test_rotterdam_name_change_via_isil(self):
        """
        Rotterdam case: Het Nieuwe Instituut → Nieuwe Instituut.

        Two records sharing one ISIL code should merge as a name change.
        """
        old_name_record = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-rotterdam-1",
            name="Het Nieuwe Instituut",
            institution_type=InstitutionType.MUSEUM,
            locations=[Location(city="Rotterdam", country="NL")],
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-RtHNI"),
            ],
            provenance=self._tier1_provenance(),
        )
        new_name_record = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-rotterdam-2",
            name="Nieuwe Instituut",
            institution_type=InstitutionType.MUSEUM,
            locations=[Location(city="Rotterdam", country="NL")],
            identifiers=[
                # Same ISIL code as the first record.
                Identifier(identifier_scheme="ISIL", identifier_value="NL-RtHNI"),
            ],
            provenance=self._tier1_provenance(),
        )

        deduplicator = InstitutionDeduplicator()
        survivors = deduplicator.deduplicate([old_name_record, new_name_record])

        # The shared ISIL code must collapse both records into one.
        assert len(survivors) == 1
        merged = survivors[0]
        # The non-primary name has to be retained as an alternative name.
        assert merged.alternative_names is not None
        assert len(merged.alternative_names) > 0
        # Between primary and alternative names, both spellings are present.
        known_names = {merged.name} | set(merged.alternative_names)
        assert "Het Nieuwe Instituut" in known_names
        assert "Nieuwe Instituut" in known_names

    def test_losser_article_normalization(self):
        """
        Losser case: de Historische Kring Losser vs Historische Kring Losser.

        The Dutch article "de" should be stripped for matching.
        """
        with_article = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-losser-1",
            name="de Historische Kring Losser",
            institution_type=InstitutionType.COLLECTING_SOCIETY,
            locations=[Location(city="Losser", country="NL")],
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-LsHKL"),
            ],
            provenance=self._tier1_provenance(),
        )
        without_article = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-losser-2",
            name="Historische Kring Losser",
            institution_type=InstitutionType.COLLECTING_SOCIETY,
            locations=[Location(city="Losser", country="NL")],
            provenance=self._tier1_provenance(),
        )

        deduplicator = InstitutionDeduplicator()
        survivors = deduplicator.deduplicate([with_article, without_article])

        # Articles are ignored in matching, so one record remains.
        assert len(survivors) == 1
        merged = survivors[0]
        # The record carrying the ISIL (same tier, more metadata) should win.
        assert merged.name == "de Historische Kring Losser"
        # The losing spelling is kept as an alternative name.
        assert merged.alternative_names is not None
        assert "Historische Kring Losser" in merged.alternative_names

    def test_article_normalization_het(self):
        """The article 'het' is stripped for matching as well."""
        with_article = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-het-1",
            name="Het Gemeentearchief",
            institution_type=InstitutionType.ARCHIVE,
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=self._tier1_provenance(),
        )
        without_article = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-het-2",
            name="Gemeentearchief",
            institution_type=InstitutionType.ARCHIVE,
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=self._tier1_provenance(),
        )

        deduplicator = InstitutionDeduplicator()
        survivors = deduplicator.deduplicate([with_article, without_article])

        # Articles are ignored, so the two records merge.
        assert len(survivors) == 1

    def test_municipality_vs_archive_not_merged(self):
        """
        Gemeente X and Gemeentearchief X must NOT be merged.

        They are different institutions despite their similar names.
        """
        municipality = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-borne-gemeente",
            name="Gemeente Borne",
            institution_type=InstitutionType.OFFICIAL_INSTITUTION,
            locations=[Location(city="Borne", country="NL")],
            provenance=self._tier1_provenance(),
        )
        municipal_archive = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-borne-archief",
            name="Gemeentearchief Borne",
            institution_type=InstitutionType.ARCHIVE,
            locations=[Location(city="Borne", country="NL")],
            provenance=self._tier1_provenance(),
        )

        deduplicator = InstitutionDeduplicator()
        survivors = deduplicator.deduplicate([municipality, municipal_archive])

        # Both records must survive untouched.
        assert len(survivors) == 2

    def test_isil_helper_methods(self):
        """Exercise the ISIL extraction and comparison helpers."""
        with_isil = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-isil-1",
            name="Test Museum",
            institution_type=InstitutionType.MUSEUM,
            locations=[Location(city="Amsterdam", country="NL")],
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdTM"),
                Identifier(identifier_scheme="Wikidata", identifier_value="Q12345"),
            ],
            provenance=self._tier1_provenance(),
        )
        without_isil = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-isil-2",
            name="Test Museum 2",
            institution_type=InstitutionType.MUSEUM,
            locations=[Location(city="Amsterdam", country="NL")],
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q67890"),
            ],
            provenance=self._tier1_provenance(),
        )

        # Extraction: only the ISIL-scheme identifier counts.
        assert InstitutionDeduplicator.get_isil_code(with_isil) == "NL-AsdTM"
        assert InstitutionDeduplicator.get_isil_code(without_isil) is None

        # Comparison: a differently named record with the same ISIL code.
        renamed_same_isil = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-isil-3",
            name="Test Museum (different name)",
            institution_type=InstitutionType.MUSEUM,
            locations=[Location(city="Amsterdam", country="NL")],
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdTM"),
            ],
            provenance=self._tier1_provenance(),
        )
        assert InstitutionDeduplicator.has_same_isil(with_isil, renamed_same_isil) is True
        assert InstitutionDeduplicator.has_same_isil(with_isil, without_isil) is False