932 lines
37 KiB
Python
932 lines
37 KiB
Python
"""
Tests for Institution Deduplicator

Test coverage:
- Name normalization
- Match key generation
- Tier-based priority selection
- Metadata merging
- Edge cases (missing fields, whitespace variants)
"""
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
import pytest
|
|
|
|
from glam_extractor.models import (
|
|
DataSource,
|
|
DataTier,
|
|
DigitalPlatform,
|
|
DigitalPlatformType,
|
|
HeritageCustodian,
|
|
Identifier,
|
|
InstitutionType,
|
|
Location,
|
|
Provenance,
|
|
)
|
|
from glam_extractor.parsers.deduplicator import InstitutionDeduplicator
|
|
|
|
|
|
class TestNameNormalization:
    """Tests for ``InstitutionDeduplicator.normalize_name``.

    Each test pins one aspect of the normalized output: lowercasing and
    trimming, whitespace collapsing, punctuation removal, and hyphen
    preservation.
    """

    def test_normalize_basic(self):
        """Names are lowercased and surrounding whitespace is stripped."""
        assert InstitutionDeduplicator.normalize_name("Rijksmuseum") == "rijksmuseum"
        assert InstitutionDeduplicator.normalize_name(" Amsterdam Museum ") == "amsterdam museum"

    def test_normalize_whitespace(self):
        """Runs of internal whitespace collapse to a single space."""
        # The input must contain *repeated* spaces: a single-spaced name would
        # normalize to the expected value even if collapsing were broken.
        assert InstitutionDeduplicator.normalize_name("Museum  van   Loon") == "museum van loon"
        assert InstitutionDeduplicator.normalize_name("Museum\t\tName") == "museum name"

    def test_normalize_punctuation(self):
        """Parentheses, commas and exclamation marks are removed."""
        assert InstitutionDeduplicator.normalize_name("Museum (Amsterdam)") == "museum amsterdam"
        assert InstitutionDeduplicator.normalize_name("Rijksmuseum, Amsterdam") == "rijksmuseum amsterdam"
        assert InstitutionDeduplicator.normalize_name("Museum!") == "museum"

    def test_normalize_hyphens_preserved(self):
        """Hyphens inside a name survive normalization."""
        assert InstitutionDeduplicator.normalize_name("Anne-Frank-Huis") == "anne-frank-huis"
|
|
|
|
|
|
class TestMatchKeyGeneration:
    """Checks for ``InstitutionDeduplicator.get_match_key``."""

    @pytest.fixture
    def sample_institution(self):
        """Provide a fully specified Rijksmuseum record located in Amsterdam."""
        amsterdam = Location(
            city="Amsterdam",
            country="NL",
            location_type="primary",
            postal_code=None,
            street_address=None,
            region=None,
            latitude=None,
            longitude=None,
            geonames_id=None,
            is_primary=True,
        )
        registry_provenance = Provenance(
            data_source=DataSource.CSV_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=datetime.now(timezone.utc),
            extraction_method="test",
            confidence_score=1.0,
            conversation_id=None,
            source_url=None,
            verified_date=None,
            verified_by=None,
        )
        # Scalar fields that are explicitly passed as None.
        unset_fields = dict.fromkeys((
            "organization_status",
            "parent_organization",
            "founded_date",
            "closed_date",
            "homepage",
            "description",
            "ghcid_uuid",
            "ghcid_uuid_sha256",
            "record_id",
            "ghcid_numeric",
            "ghcid_current",
            "ghcid_original",
            "ghcid_history",
            "contact_info",
        ))
        return HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/test-001",
            name="Rijksmuseum",
            institution_type=InstitutionType.MUSEUM,
            locations=[amsterdam],
            provenance=registry_provenance,
            alternative_names=[],
            identifiers=[],
            digital_platforms=[],
            collections=[],
            change_history=[],
            **unset_fields,
        )

    def test_match_key_basic(self, sample_institution):
        """The key is built from name and city only; the type is not included."""
        match_key = InstitutionDeduplicator.get_match_key(sample_institution)
        assert match_key == "rijksmuseum|amsterdam"

    def test_match_key_missing_city(self, sample_institution):
        """A location without a city yields no match key."""
        primary_location = sample_institution.locations[0]
        primary_location.city = None
        assert InstitutionDeduplicator.get_match_key(sample_institution) is None

    def test_match_key_no_locations(self, sample_institution):
        """An empty locations list yields no match key."""
        sample_institution.locations = []
        assert InstitutionDeduplicator.get_match_key(sample_institution) is None

    def test_match_key_whitespace_variants(self, sample_institution):
        """Whitespace variants of the same name produce the same key."""
        observed_keys = []
        for name_variant in ("Rijksmuseum Amsterdam", " Rijksmuseum Amsterdam "):
            sample_institution.name = name_variant
            observed_keys.append(InstitutionDeduplicator.get_match_key(sample_institution))

        plain_key, padded_key = observed_keys
        assert plain_key == padded_key
|
|
|
|
|
|
class TestTierPriority:
    """Checks for ``InstitutionDeduplicator.select_primary_record``."""

    @pytest.fixture
    def create_institution(self):
        """Return a factory for Amsterdam museum records of a chosen tier.

        The factory accepts ``name``, ``tier`` and an optional
        ``num_identifiers`` that controls how many ISIL identifiers
        (``NL-0``, ``NL-1``, ...) are attached to the record.
        """
        import re

        def _factory(name: str, tier: DataTier, num_identifiers: int = 0):
            # The record id embeds a URI-safe slug derived from the name.
            slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')

            isil_identifiers = []
            for index in range(num_identifiers):
                isil_identifiers.append(
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value=f"NL-{index}",
                        identifier_url=None,
                        assigned_date=None,
                    )
                )

            amsterdam = Location(
                city="Amsterdam",
                country="NL",
                location_type="primary",
                postal_code=None,
                street_address=None,
                region=None,
                latitude=None,
                longitude=None,
                geonames_id=None,
                is_primary=True,
            )
            tiered_provenance = Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=tier,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="test",
                confidence_score=1.0,
                conversation_id=None,
                source_url=None,
                verified_date=None,
                verified_by=None,
            )
            return HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/nl/test-{slug}",
                name=name,
                institution_type=InstitutionType.MUSEUM,
                locations=[amsterdam],
                identifiers=isil_identifiers,
                provenance=tiered_provenance,
                organization_status=None,
                parent_organization=None,
                founded_date=None,
                closed_date=None,
                homepage=None,
                description=None,
                alternative_names=[],
                digital_platforms=[],
                collections=[],
                change_history=[],
                ghcid_uuid=None,
                ghcid_uuid_sha256=None,
                record_id=None,
                ghcid_numeric=None,
                ghcid_current=None,
                ghcid_original=None,
                ghcid_history=None,
                contact_info=None,
            )

        return _factory

    def test_tier_1_wins_over_tier_4(self, create_institution):
        """A TIER_1 record is selected ahead of a TIER_4 record."""
        authoritative = create_institution("Museum A", DataTier.TIER_1_AUTHORITATIVE)
        inferred = create_institution("Museum A", DataTier.TIER_4_INFERRED)

        # The weaker record is listed first so list order alone cannot pass.
        chosen = InstitutionDeduplicator.select_primary_record([inferred, authoritative])
        assert chosen is authoritative

    def test_tier_2_wins_over_tier_3(self, create_institution):
        """A TIER_2 record is selected ahead of a TIER_3 record."""
        verified = create_institution("Museum B", DataTier.TIER_2_VERIFIED)
        crowd_sourced = create_institution("Museum B", DataTier.TIER_3_CROWD_SOURCED)

        chosen = InstitutionDeduplicator.select_primary_record([crowd_sourced, verified])
        assert chosen is verified

    def test_metadata_completeness_as_tiebreaker(self, create_institution):
        """Between equal tiers, the record with more identifiers is selected."""
        sparse = create_institution(
            "Museum C", DataTier.TIER_1_AUTHORITATIVE, num_identifiers=1
        )
        rich = create_institution(
            "Museum C", DataTier.TIER_1_AUTHORITATIVE, num_identifiers=5
        )

        chosen = InstitutionDeduplicator.select_primary_record([sparse, rich])
        assert chosen is rich
|
|
|
|
|
|
class TestMetadataMerging:
    """Checks for ``InstitutionDeduplicator.merge_metadata``."""

    @pytest.fixture
    def create_institution_with_metadata(self):
        """Return a factory for Amsterdam museum records with optional metadata.

        The factory accepts ``name`` and ``tier`` plus the optional keyword
        arguments ``isil_code``, ``platform_name`` and ``description``. An
        ISIL identifier or a digital platform is attached only when the
        corresponding argument is truthy.
        """
        import re

        def _factory(
            name: str,
            tier: DataTier,
            isil_code: str = None,
            platform_name: str = None,
            description: str = None
        ):
            # The record id embeds a URI-safe slug derived from the name.
            slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')

            isil_identifiers = [
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value=isil_code,
                    identifier_url=None,
                    assigned_date=None,
                )
            ] if isil_code else []

            platform_entries = [
                DigitalPlatform(
                    platform_name=platform_name,
                    platform_url=None,
                    platform_type=DigitalPlatformType.COLLECTION_MANAGEMENT,
                    vendor=None,
                    implemented_standards=None,
                )
            ] if platform_name else []

            amsterdam = Location(
                city="Amsterdam",
                country="NL",
                location_type="primary",
                postal_code=None,
                street_address=None,
                region=None,
                latitude=None,
                longitude=None,
                geonames_id=None,
                is_primary=True,
            )
            tiered_provenance = Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=tier,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="test",
                confidence_score=1.0,
                conversation_id=None,
                source_url=None,
                verified_date=None,
                verified_by=None,
            )
            return HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/nl/test-{slug}",
                name=name,
                institution_type=InstitutionType.MUSEUM,
                locations=[amsterdam],
                identifiers=isil_identifiers,
                digital_platforms=platform_entries,
                description=description,
                provenance=tiered_provenance,
                organization_status=None,
                parent_organization=None,
                founded_date=None,
                closed_date=None,
                homepage=None,
                alternative_names=[],
                collections=[],
                change_history=[],
                ghcid_uuid=None,
                ghcid_uuid_sha256=None,
                record_id=None,
                ghcid_numeric=None,
                ghcid_current=None,
                ghcid_original=None,
                ghcid_history=None,
                contact_info=None,
            )

        return _factory

    def test_merge_identifiers(self, create_institution_with_metadata):
        """Distinct identifiers from both duplicates end up on the merged record."""
        primary = create_institution_with_metadata(
            "Museum", DataTier.TIER_1_AUTHORITATIVE, isil_code="NL-001"
        )
        duplicate = create_institution_with_metadata(
            "Museum", DataTier.TIER_4_INFERRED, isil_code="NL-002"
        )

        merged = InstitutionDeduplicator.merge_metadata(primary, [primary, duplicate])

        assert len(merged.identifiers) == 2
        merged_values = {entry.identifier_value for entry in merged.identifiers}
        assert merged_values == {"NL-001", "NL-002"}

    def test_merge_platforms(self, create_institution_with_metadata):
        """Distinct platforms from both duplicates end up on the merged record."""
        primary = create_institution_with_metadata(
            "Museum", DataTier.TIER_1_AUTHORITATIVE, platform_name="Atlantis"
        )
        duplicate = create_institution_with_metadata(
            "Museum", DataTier.TIER_4_INFERRED, platform_name="MAIS"
        )

        merged = InstitutionDeduplicator.merge_metadata(primary, [primary, duplicate])

        assert len(merged.digital_platforms) == 2
        merged_names = {entry.platform_name for entry in merged.digital_platforms}
        assert merged_names == {"Atlantis", "MAIS"}

    def test_merge_descriptions(self, create_institution_with_metadata):
        """Both duplicates' descriptions appear in the merged description."""
        primary = create_institution_with_metadata(
            "Museum", DataTier.TIER_1_AUTHORITATIVE, description="Note A"
        )
        duplicate = create_institution_with_metadata(
            "Museum", DataTier.TIER_4_INFERRED, description="Note B"
        )

        merged = InstitutionDeduplicator.merge_metadata(primary, [primary, duplicate])

        assert "Note A" in merged.description
        assert "Note B" in merged.description

    def test_no_duplicate_identifiers(self, create_institution_with_metadata):
        """An identifier shared by both duplicates is kept only once."""
        primary = create_institution_with_metadata(
            "Museum", DataTier.TIER_1_AUTHORITATIVE, isil_code="NL-001"
        )
        duplicate = create_institution_with_metadata(
            "Museum", DataTier.TIER_4_INFERRED, isil_code="NL-001"
        )

        merged = InstitutionDeduplicator.merge_metadata(primary, [primary, duplicate])

        # Exactly one NL-001 entry must remain after merging.
        assert len(merged.identifiers) == 1
        only_identifier = merged.identifiers[0]
        assert only_identifier.identifier_value == "NL-001"
|
|
|
|
|
|
class TestTypeResolution:
    """Checks for ``InstitutionDeduplicator.resolve_institution_type``."""

    @pytest.fixture
    def create_institution_with_type(self):
        """Return a factory for Amsterdam records of a given type and tier.

        The factory takes ``name``, ``inst_type`` and ``tier`` positionally.
        """
        import re

        def _factory(name: str, inst_type: InstitutionType, tier: DataTier):
            # The slug combines name and type before being made URI-safe.
            slug = re.sub(r'[^a-z0-9]+', '-', f"{name}-{inst_type}".lower()).strip('-')

            amsterdam = Location(
                city="Amsterdam",
                country="NL",
                location_type="primary",
                postal_code=None,
                street_address=None,
                region=None,
                latitude=None,
                longitude=None,
                geonames_id=None,
                is_primary=True,
            )
            tiered_provenance = Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=tier,
                extraction_date=datetime.now(timezone.utc),
                extraction_method="test",
                confidence_score=1.0,
                conversation_id=None,
                source_url=None,
                verified_date=None,
                verified_by=None,
            )
            return HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/nl/test-{slug}",
                name=name,
                institution_type=inst_type,
                locations=[amsterdam],
                provenance=tiered_provenance,
                organization_status=None,
                parent_organization=None,
                founded_date=None,
                closed_date=None,
                homepage=None,
                description=None,
                alternative_names=[],
                identifiers=[],
                digital_platforms=[],
                collections=[],
                change_history=[],
                ghcid_uuid=None,
                ghcid_uuid_sha256=None,
                record_id=None,
                ghcid_numeric=None,
                ghcid_current=None,
                ghcid_original=None,
                ghcid_history=None,
                contact_info=None,
            )

        return _factory

    def test_resolve_type_all_match(self, create_institution_with_type):
        """When every record is a MUSEUM the resolved type is MUSEUM."""
        authoritative = create_institution_with_type(
            "Museum", InstitutionType.MUSEUM, DataTier.TIER_1_AUTHORITATIVE
        )
        inferred = create_institution_with_type(
            "Museum", InstitutionType.MUSEUM, DataTier.TIER_4_INFERRED
        )

        outcome = InstitutionDeduplicator.resolve_institution_type([authoritative, inferred])
        assert str(outcome) == 'MUSEUM'

    def test_resolve_type_mixed_vs_specific(self, create_institution_with_type):
        """MIXED + MUSEUM → MUSEUM (the specific type is preferred)."""
        authoritative = create_institution_with_type(
            "Museum", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE
        )
        inferred = create_institution_with_type(
            "Museum", InstitutionType.MUSEUM, DataTier.TIER_4_INFERRED
        )

        outcome = InstitutionDeduplicator.resolve_institution_type([authoritative, inferred])
        assert str(outcome) == 'MUSEUM'

    def test_resolve_type_mixed_vs_library(self, create_institution_with_type):
        """MIXED + LIBRARY → LIBRARY (the specific type is preferred)."""
        authoritative = create_institution_with_type(
            "Library", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE
        )
        inferred = create_institution_with_type(
            "Library", InstitutionType.LIBRARY, DataTier.TIER_4_INFERRED
        )

        outcome = InstitutionDeduplicator.resolve_institution_type([authoritative, inferred])
        assert str(outcome) == 'LIBRARY'

    def test_resolve_type_conflicting_use_tier(self, create_institution_with_type):
        """Conflicting specific types (MUSEUM vs LIBRARY) → the highest tier wins."""
        authoritative = create_institution_with_type(
            "Org", InstitutionType.MUSEUM, DataTier.TIER_1_AUTHORITATIVE
        )
        inferred = create_institution_with_type(
            "Org", InstitutionType.LIBRARY, DataTier.TIER_4_INFERRED
        )

        outcome = InstitutionDeduplicator.resolve_institution_type([authoritative, inferred])
        # MUSEUM is expected because it comes from the TIER_1 record.
        assert str(outcome) == 'MUSEUM'

    def test_resolve_type_all_mixed(self, create_institution_with_type):
        """When every record is MIXED the resolved type stays MIXED."""
        authoritative = create_institution_with_type(
            "Org", InstitutionType.MIXED, DataTier.TIER_1_AUTHORITATIVE
        )
        inferred = create_institution_with_type(
            "Org", InstitutionType.MIXED, DataTier.TIER_4_INFERRED
        )

        outcome = InstitutionDeduplicator.resolve_institution_type([authoritative, inferred])
        assert str(outcome) == 'MIXED'
|
|
|
|
|
|
class TestDeduplication:
    """End-to-end checks for ``InstitutionDeduplicator.deduplicate``."""

    @pytest.fixture
    def create_duplicate_set(self):
        """Return a factory producing six Amsterdam museums with duplicates.

        The list holds, in order: three Rijksmuseum records (one TIER_1, two
        TIER_4, the latter with padded names), two Van Gogh Museum records and
        one Stedelijk Museum record.
        """

        def _amsterdam_museum(record_id, name, tier):
            # Every record in the set differs only in id, name and tier.
            return HeritageCustodian(
                id=record_id,
                name=name,
                institution_type=InstitutionType.MUSEUM,
                locations=[Location(
                    city="Amsterdam",
                    country="NL",
                    location_type="primary",
                    postal_code=None,
                    street_address=None,
                    region=None,
                    latitude=None,
                    longitude=None,
                    geonames_id=None,
                    is_primary=True,
                )],
                provenance=Provenance(
                    data_source=DataSource.CSV_REGISTRY,
                    data_tier=tier,
                    extraction_date=datetime.now(timezone.utc),
                    extraction_method="test",
                    confidence_score=1.0,
                    conversation_id=None,
                    source_url=None,
                    verified_date=None,
                    verified_by=None,
                ),
                organization_status=None,
                parent_organization=None,
                founded_date=None,
                closed_date=None,
                homepage=None,
                description=None,
                alternative_names=[],
                identifiers=[],
                digital_platforms=[],
                collections=[],
                change_history=[],
                ghcid_uuid=None,
                ghcid_uuid_sha256=None,
                record_id=None,
                ghcid_numeric=None,
                ghcid_current=None,
                ghcid_original=None,
                ghcid_history=None,
                contact_info=None,
            )

        def _build():
            # Duplicate group 1: Rijksmuseum (3 instances, different tiers).
            rijks_tiers = (
                DataTier.TIER_1_AUTHORITATIVE,
                DataTier.TIER_4_INFERRED,
                DataTier.TIER_4_INFERRED,
            )
            records = [
                _amsterdam_museum(
                    f"rijks-{position}",
                    # Only the first copy is unpadded, to exercise whitespace.
                    "Rijksmuseum" if position == 0 else " Rijksmuseum ",
                    tier,
                )
                for position, tier in enumerate(rijks_tiers)
            ]

            # Duplicate group 2: Van Gogh Museum (2 instances).
            records.extend(
                _amsterdam_museum(
                    f"vg-{position}",
                    "Van Gogh Museum",
                    DataTier.TIER_1_AUTHORITATIVE,
                )
                for position in range(2)
            )

            # Unique institution: Stedelijk Museum.
            records.append(
                _amsterdam_museum(
                    "https://w3id.org/heritage/custodian/nl/stedelijk-1",
                    "Stedelijk Museum",
                    DataTier.TIER_1_AUTHORITATIVE,
                )
            )
            return records

        return _build

    def test_deduplicate_basic(self, create_duplicate_set):
        """Six inputs collapse to three unique records."""
        engine = InstitutionDeduplicator()

        survivors = engine.deduplicate(create_duplicate_set())

        # 3 Rijks + 2 Van Gogh + 1 Stedelijk -> one record per institution.
        assert len(survivors) == 3
        # Removed: (3-1) + (2-1) + (1-1) = 3.
        assert engine.duplicates_removed == 3
        # Only Rijks and Van Gogh form duplicate groups.
        assert len(engine.duplicate_groups) == 2

    def test_deduplicate_stats(self, create_duplicate_set):
        """Processed count and duplicate group sizes are recorded."""
        engine = InstitutionDeduplicator()

        engine.deduplicate(create_duplicate_set())

        assert engine.records_processed == 6
        assert len(engine.duplicate_groups) == 2

        # One group of two (Van Gogh) and one of three (Rijks).
        sizes = sorted(len(members) for members in engine.duplicate_groups)
        assert sizes == [2, 3]

    def test_deduplicate_report(self, create_duplicate_set):
        """The report mentions the counts and both duplicated names."""
        engine = InstitutionDeduplicator()

        engine.deduplicate(create_duplicate_set())
        summary = engine.get_deduplication_report()

        assert "Records processed: 6" in summary
        assert "Duplicates removed: 3" in summary
        assert "Rijksmuseum" in summary
        assert "Van Gogh Museum" in summary

    def test_deduplicate_without_metadata_merge(self, create_duplicate_set):
        """Deduplication still yields three records with merging disabled."""
        engine = InstitutionDeduplicator()

        survivors = engine.deduplicate(create_duplicate_set(), merge_metadata=False)

        # Duplicates are still removed; only the metadata merge is skipped.
        assert len(survivors) == 3
|
|
|
|
|
|
class TestEdgeCases:
    """Edge cases discovered during Dutch dataset analysis."""

    @staticmethod
    def _tier1_record(record_id, name, institution_type, city, **optional_fields):
        """Build a TIER_1 CSV-registry record located in a Dutch city.

        ``optional_fields`` (for example ``identifiers``) are forwarded to
        ``HeritageCustodian`` only when supplied, so omitted fields are left
        for the model to fill in.
        """
        return HeritageCustodian(
            id=record_id,
            name=name,
            institution_type=institution_type,
            locations=[Location(city=city, country="NL")],
            provenance=Provenance(
                data_source=DataSource.CSV_REGISTRY,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime.now(timezone.utc),
            ),
            **optional_fields,
        )

    def test_rotterdam_name_change_via_isil(self):
        """
        Rotterdam case: Het Nieuwe Instituut → Nieuwe Instituut.

        Both records carry the same ISIL code, so they should merge as a
        name change.
        """
        old_name_record = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-rotterdam-1",
            "Het Nieuwe Instituut",
            InstitutionType.MUSEUM,
            "Rotterdam",
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-RtHNI")
            ],
        )
        new_name_record = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-rotterdam-2",
            "Nieuwe Instituut",
            InstitutionType.MUSEUM,
            "Rotterdam",
            identifiers=[
                # Same ISIL as the first record.
                Identifier(identifier_scheme="ISIL", identifier_value="NL-RtHNI")
            ],
        )

        engine = InstitutionDeduplicator()
        survivors = engine.deduplicate([old_name_record, new_name_record])

        # The shared ISIL code must collapse both records into one.
        assert len(survivors) == 1
        merged = survivors[0]

        # The name that did not win has to be kept as an alternative.
        assert merged.alternative_names is not None
        assert len(merged.alternative_names) > 0

        # Between primary and alternative names, both spellings are present.
        known_names = set(merged.alternative_names)
        known_names.add(merged.name)
        assert "Het Nieuwe Instituut" in known_names
        assert "Nieuwe Instituut" in known_names

    def test_losser_article_normalization(self):
        """
        Losser case: de Historische Kring Losser vs Historische Kring Losser.

        The Dutch article "de" should be stripped for matching.
        """
        with_article = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-losser-1",
            "de Historische Kring Losser",
            InstitutionType.COLLECTING_SOCIETY,
            "Losser",
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-LsHKL")
            ],
        )
        without_article = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-losser-2",
            "Historische Kring Losser",
            InstitutionType.COLLECTING_SOCIETY,
            "Losser",
        )

        engine = InstitutionDeduplicator()
        survivors = engine.deduplicate([with_article, without_article])

        # Articles are ignored when matching, so the two records merge.
        assert len(survivors) == 1
        merged = survivors[0]

        # The record carrying the ISIL (TIER_1 with more metadata) should win.
        assert merged.name == "de Historische Kring Losser"

        # The losing spelling is retained as an alternative name.
        assert merged.alternative_names is not None
        assert "Historische Kring Losser" in merged.alternative_names

    def test_article_normalization_het(self):
        """The article 'het' is stripped for matching as well."""
        with_article = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-het-1",
            "Het Gemeentearchief",
            InstitutionType.ARCHIVE,
            "Amsterdam",
        )
        without_article = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-het-2",
            "Gemeentearchief",
            InstitutionType.ARCHIVE,
            "Amsterdam",
        )

        engine = InstitutionDeduplicator()
        survivors = engine.deduplicate([with_article, without_article])

        # Articles are ignored, so only one record remains.
        assert len(survivors) == 1

    def test_municipality_vs_archive_not_merged(self):
        """
        Gemeente X and Gemeentearchief X must NOT be merged.

        They are different institutions despite their similar names.
        """
        municipality = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-borne-gemeente",
            "Gemeente Borne",
            InstitutionType.OFFICIAL_INSTITUTION,
            "Borne",
        )
        municipal_archive = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-borne-archief",
            "Gemeentearchief Borne",
            InstitutionType.ARCHIVE,
            "Borne",
        )

        engine = InstitutionDeduplicator()
        survivors = engine.deduplicate([municipality, municipal_archive])

        # Both records must survive as separate institutions.
        assert len(survivors) == 2

    def test_isil_helper_methods(self):
        """Exercise the ISIL extraction and comparison helpers."""
        has_isil = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-isil-1",
            "Test Museum",
            InstitutionType.MUSEUM,
            "Amsterdam",
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdTM"),
                Identifier(identifier_scheme="Wikidata", identifier_value="Q12345"),
            ],
        )
        lacks_isil = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-isil-2",
            "Test Museum 2",
            InstitutionType.MUSEUM,
            "Amsterdam",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q67890")
            ],
        )

        # Extraction: only the ISIL-scheme identifier counts.
        assert InstitutionDeduplicator.get_isil_code(has_isil) == "NL-AsdTM"
        assert InstitutionDeduplicator.get_isil_code(lacks_isil) is None

        # Comparison: a differently named record sharing the ISIL code.
        shares_isil = self._tier1_record(
            "https://w3id.org/heritage/custodian/nl/test-isil-3",
            "Test Museum (different name)",
            InstitutionType.MUSEUM,
            "Amsterdam",
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdTM")
            ],
        )

        assert InstitutionDeduplicator.has_same_isil(has_isil, shares_isil) is True
        assert InstitutionDeduplicator.has_same_isil(has_isil, lacks_isil) is False
|