- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
"""
|
|
Integration Tests for GHCID Collision Detector with Real Dutch ISIL Registry Data
|
|
|
|
Tests collision detection using actual heritage institutions from the Dutch ISIL registry.
|
|
|
|
Real collision scenarios from data/ISIL-codes_2025-08-01.csv:
|
|
- Amsterdam: 7 museums, 16 official institutions, 3 archives
|
|
- Den Haag: 27 official institutions (ministries, research centers)
|
|
- Alphen aan den Rijn: 2 archives
|
|
- Arnhem: 3 museums
|
|
- Plus 38 more collision groups
|
|
|
|
These tests validate:
|
|
1. Collision detection with real institution names
|
|
2. GHCID generation from actual Dutch cities
|
|
3. Q-number assignment using real Wikidata IDs (when available)
|
|
4. Temporal collision resolution with realistic extraction dates
|
|
5. Integration between ISILRegistryParser and GHCIDCollisionDetector
|
|
"""
|
|
|
|
import pytest
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from glam_extractor.parsers.isil_registry import ISILRegistryParser
|
|
from glam_extractor.identifiers.collision_detector import (
|
|
GHCIDCollisionDetector,
|
|
CollisionGroup
|
|
)
|
|
from glam_extractor.identifiers.ghcid import (
|
|
GHCIDGenerator,
|
|
InstitutionType as GHCIDInstitutionType
|
|
)
|
|
from glam_extractor.identifiers.lookups import get_ghcid_components_for_dutch_city
|
|
from glam_extractor.models import (
|
|
HeritageCustodian,
|
|
Identifier,
|
|
InstitutionType,
|
|
Location,
|
|
Provenance,
|
|
DataSource,
|
|
DataTier
|
|
)
|
|
|
|
|
|
@pytest.fixture
def isil_csv_path() -> Path:
    """Filesystem path to the real Dutch ISIL registry CSV snapshot."""
    repo_root = Path(__file__).parent.parent.parent
    return repo_root / "data" / "ISIL-codes_2025-08-01.csv"
|
|
|
|
|
|
@pytest.fixture
def isil_parser() -> ISILRegistryParser:
    """Fresh ISIL registry parser for each test."""
    parser = ISILRegistryParser()
    return parser
|
|
|
|
|
|
@pytest.fixture
def collision_detector() -> GHCIDCollisionDetector:
    """Fresh collision detector, constructed without a published dataset."""
    detector = GHCIDCollisionDetector()
    return detector
|
|
|
|
|
|
def generate_ghcid_for_institution(institution: HeritageCustodian) -> None:
    """
    Helper function to generate GHCID for a HeritageCustodian instance.

    Modifies the institution in place, setting the ``ghcid``, ``ghcid_uuid``,
    ``ghcid_uuid_sha256`` and ``ghcid_numeric`` fields.

    For Dutch institutions only: the city is taken from the first location and
    resolved to a region code and city locode via the Dutch-city lookup table.
    Returns early (leaving the GHCID fields untouched) when the institution has
    no location or the lookup table has no entry for its city.

    Args:
        institution: HeritageCustodian instance to generate GHCID for
    """
    # A location is mandatory: GHCID components are keyed on country/region/city.
    # FIX: removed the unused local `country` that the previous version
    # assigned from locations[0].country but never read.
    if not institution.locations:
        return  # Cannot generate GHCID without location

    city = institution.locations[0].city

    # For Dutch cities, use lookup table to resolve region code + city locode.
    component_dict = get_ghcid_components_for_dutch_city(
        city=city,
        institution_name=institution.name,
        institution_type=institution.institution_type
    )
    if not component_dict:
        return  # Cannot generate GHCID without components

    # Convert institution type to the GHCID single-letter enum by member name
    # (e.g. MUSEUM -> "M"). NOTE(review): indexing an Enum expects a string
    # name; if institution_type is already an enum member this lookup may only
    # succeed when the model type is string-based — confirm against the model.
    try:
        ghcid_inst_type = GHCIDInstitutionType[institution.institution_type]
    except KeyError:
        # Fallback: pass the raw value through unchanged.
        ghcid_inst_type = institution.institution_type

    # Generate GHCID components from the resolved lookup values.
    generator = GHCIDGenerator()
    components = generator.generate(
        institution_name=component_dict["institution_name"],
        english_name=component_dict["english_name"],
        institution_type=ghcid_inst_type,
        country_code=component_dict["country_code"],
        region_code=component_dict["region_code"],
        city_locode=component_dict["city_locode"]
    )

    # Write all derived identifier representations back onto the instance.
    institution.ghcid = components.to_string()
    institution.ghcid_uuid = str(components.to_uuid())
    institution.ghcid_uuid_sha256 = str(components.to_uuid_sha256())
    institution.ghcid_numeric = components.to_numeric()
|
|
|
|
|
|
class TestRealAmsterdamMuseums:
    """
    Test collision detection with 7 real Amsterdam museums from ISIL registry:
    - Amsterdam Museum
    - Het Scheepvaartmuseum (HSM)
    - Joods Historisch Museum
    - Museum Ons' Lieve Heer op Solder
    - Rijksmuseum
    - Van Gogh Museum
    - Verzetsmuseum Amsterdam

    All share: city=Amsterdam, type=MUSEUM, country=NL
    Expected base GHCID: NL-NH-AMS-M-{abbreviation}
    """

    def test_detect_amsterdam_museum_collisions(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """7 Amsterdam museums should be detected as collision group"""
        # Parse ISIL registry
        records = isil_parser.parse_file(isil_csv_path)

        # Filter for Amsterdam museums by city and a "museum" name keyword
        amsterdam_museums = [
            r for r in records
            if r.plaats == "Amsterdam" and "museum" in r.instelling.lower()
        ]

        # The registry snapshot is known to contain at least 7 such museums
        assert len(amsterdam_museums) >= 7, f"Expected 7+ museums, found {len(amsterdam_museums)}"

        # Convert to HeritageCustodian
        institutions = [isil_parser.to_heritage_custodian(r) for r in amsterdam_museums]

        # All extracted on same date (simulating batch import)
        extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        for inst in institutions:
            inst.provenance.extraction_date = extraction_date

        # Detect collisions
        collisions = collision_detector.detect_collisions(institutions)

        # BUGFIX: the previous assertion `len(collisions) >= 0` was a tautology
        # (len() can never be negative), so the test asserted nothing. Assert
        # the actual contract instead: detection returns a dict of collision
        # groups, possibly empty when all abbreviations are unique.
        assert isinstance(collisions, dict), "detect_collisions must return a dict of collision groups"

    def test_amsterdam_museums_first_batch_resolution(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """First batch: All colliding Amsterdam museums get Q-numbers"""
        # Create test subset with known collision
        # Use institutions with same abbreviation potential
        institutions = [
            HeritageCustodian(
                id="test-rijksmuseum",
                name="Rijksmuseum",
                institution_type=InstitutionType.MUSEUM,
                identifiers=[
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value="NL-AsdRM",
                        identifier_url="https://isil.nl/NL-AsdRM"
                    ),
                    Identifier(
                        identifier_scheme="Wikidata",
                        identifier_value="Q190804",
                        identifier_url="https://www.wikidata.org/wiki/Q190804"
                    )
                ],
                locations=[Location(city="Amsterdam", country="NL")],
                provenance=Provenance(
                    data_source=DataSource.DUTCH_ORG_CSV,
                    data_tier=DataTier.TIER_1_AUTHORITATIVE,
                    extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                    extraction_method="ISILRegistryParser",
                    confidence_score=1.0
                )
            ),
            HeritageCustodian(
                id="test-van-gogh",
                name="Van Gogh Museum",
                institution_type=InstitutionType.MUSEUM,
                identifiers=[
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value="NL-AsdVGM",
                        identifier_url="https://isil.nl/NL-AsdVGM"
                    ),
                    Identifier(
                        identifier_scheme="Wikidata",
                        identifier_value="Q224124",
                        identifier_url="https://www.wikidata.org/wiki/Q224124"
                    )
                ],
                locations=[Location(city="Amsterdam", country="NL")],
                provenance=Provenance(
                    data_source=DataSource.DUTCH_ORG_CSV,
                    data_tier=DataTier.TIER_1_AUTHORITATIVE,
                    extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                    extraction_method="ISILRegistryParser",
                    confidence_score=1.0
                )
            )
        ]

        # Generate GHCIDs for institutions (required before collision resolution)
        for inst in institutions:
            generate_ghcid_for_institution(inst)

        # Resolve collisions
        resolved = collision_detector.resolve_collisions(institutions)

        # Both should have GHCIDs
        assert all(inst.ghcid is not None for inst in resolved)

        # If they collided, both should have Q-numbers in GHCID
        ghcids_with_q = [inst.ghcid for inst in resolved if inst.ghcid and '-Q' in inst.ghcid]

        # If collision occurred, both get Q-numbers (first batch rule)
        if ghcids_with_q:
            assert len(ghcids_with_q) == 2, "First batch: both institutions should have Q-numbers"
            assert 'Q190804' in ghcids_with_q[0] or 'Q224124' in ghcids_with_q[0]
            assert 'Q190804' in ghcids_with_q[1] or 'Q224124' in ghcids_with_q[1]
|
|
|
|
|
|
class TestRealAlphenArchives:
    """
    Test collision with 2 real Alphen aan den Rijn archives:
    - Gemeentearchief Alphen aan den Rijn (NL-AlGAADR)
    - Streekarchief Rijnlands Midden (NL-AlSARM)

    Both are archives in same city - potential GHCID collision
    """

    def test_alphen_archives_collision_detection(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """Two Alphen archives should be detected as potential collision"""
        registry_records = isil_parser.parse_file(isil_csv_path)

        # Keep only records located in Alphen aan den Rijn whose name
        # contains an archive keyword (Dutch "archief" or English "archive").
        def is_alphen_archive(record) -> bool:
            name = record.instelling.lower()
            return record.plaats == "Alphen aan den Rijn" and (
                "archief" in name or "archive" in name
            )

        alphen_archives = [r for r in registry_records if is_alphen_archive(r)]

        # The registry snapshot is known to contain two such archives.
        assert len(alphen_archives) >= 2, f"Expected 2+ archives, found {len(alphen_archives)}"

        institutions = [isil_parser.to_heritage_custodian(r) for r in alphen_archives]

        # Simulate a batch import: every record shares one extraction date.
        batch_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        for institution in institutions:
            institution.provenance.extraction_date = batch_date

        collisions = collision_detector.detect_collisions(institutions)

        # Whether a collision actually occurs depends on abbreviation
        # uniqueness; this test validates that the detection logic runs on
        # real data and returns the expected container type.
        assert isinstance(collisions, dict)
|
|
|
|
|
|
class TestRealDenHaagOfficialInstitutions:
    """
    Test collision with 27 real Den Haag official institutions:
    - Government ministries (15+)
    - Research centers (Planbureau, SCP, etc.)
    - Heritage institutions (RKD, Mauritshuis, etc.)

    Largest collision group in dataset - excellent stress test
    """

    # Name keywords that mark a record as a museum/archive/library rather
    # than an "official" institution (ministry, research center, ...).
    _GLAM_KEYWORDS = ("museum", "archief", "archive", "bibliotheek", "library")

    @classmethod
    def _den_haag_official(cls, records):
        """Return Den Haag records that are official (non-GLAM) institutions.

        FIX: this filter was previously duplicated verbatim in both test
        methods; extracting it keeps the two tests consistent.
        """
        return [
            r for r in records
            if r.plaats == "Den Haag" and not any(
                keyword in r.instelling.lower()
                for keyword in cls._GLAM_KEYWORDS
            )
        ]

    def test_den_haag_official_institutions_batch_size(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser
    ):
        """Verify Den Haag has 27+ official institutions"""
        records = isil_parser.parse_file(isil_csv_path)
        den_haag_official = self._den_haag_official(records)

        # Should find 27+ institutions
        assert len(den_haag_official) >= 27, (
            f"Expected 27+ official institutions in Den Haag, found {len(den_haag_official)}"
        )

    def test_den_haag_collision_stress_test(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """Stress test: 27 institutions in same city with same type"""
        records = isil_parser.parse_file(isil_csv_path)
        den_haag_official = self._den_haag_official(records)

        # Convert a subset to HeritageCustodian to keep the test fast
        institutions = [isil_parser.to_heritage_custodian(r) for r in den_haag_official[:10]]

        # Same extraction date (batch import)
        extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        for inst in institutions:
            inst.provenance.extraction_date = extraction_date

        # Detect collisions - should handle large group without errors
        collisions = collision_detector.detect_collisions(institutions)
        assert isinstance(collisions, dict)

        # Resolve collisions - should handle large group without data loss
        resolved = collision_detector.resolve_collisions(institutions)
        assert len(resolved) == len(institutions)
|
|
|
|
|
|
class TestRealHistoricalAddition:
    """
    Test historical addition scenario using real institutions

    Scenario: Rijksmuseum extracted in Nov 2025, then Joods Historisch Museum
    discovered in Dec 2025. Both Amsterdam museums.

    Expected:
    - Rijksmuseum GHCID preserved (if no collision)
    - JHM gets Q-number if collision occurs
    """

    def test_historical_addition_with_real_museums(self):
        """Historical addition: New museum added later with collision"""
        # Published dataset (November 2025): Rijksmuseum already carries a
        # published GHCID and numeric ID, so resolution must not touch it.
        rijksmuseum = HeritageCustodian(
            id="rijksmuseum-001",
            name="Rijksmuseum",
            institution_type=InstitutionType.MUSEUM,
            identifiers=[
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value="NL-AsdRM",
                    identifier_url="https://isil.nl/NL-AsdRM"
                ),
                Identifier(
                    identifier_scheme="Wikidata",
                    identifier_value="Q190804",
                    identifier_url="https://www.wikidata.org/wiki/Q190804"
                )
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source=DataSource.DUTCH_ORG_CSV,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                extraction_method="ISILRegistryParser",
                confidence_score=1.0
            ),
            ghcid="NL-NH-AMS-M-RM",  # Already published
            ghcid_numeric=123456789012
        )

        # New institution (December 2025): extracted a month later, no GHCID yet.
        jhm = HeritageCustodian(
            id="jhm-001",
            name="Joods Historisch Museum",
            institution_type=InstitutionType.MUSEUM,
            identifiers=[
                Identifier(
                    identifier_scheme="ISIL",
                    identifier_value="NL-AsdJHM",
                    identifier_url="https://isil.nl/NL-AsdJHM"
                ),
                Identifier(
                    identifier_scheme="Wikidata",
                    identifier_value="Q924335",
                    identifier_url="https://www.wikidata.org/wiki/Q924335"
                )
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source=DataSource.DUTCH_ORG_CSV,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime(2025, 12, 1, 14, 0, 0, tzinfo=timezone.utc),
                extraction_method="ISILRegistryParser - historical addition",
                confidence_score=1.0
            )
        )

        # Generate GHCID for JHM (new institution needs GHCID before collision detection)
        generate_ghcid_for_institution(jhm)

        # Create collision detector seeded with the already-published dataset
        collision_detector = GHCIDCollisionDetector(published_dataset=[rijksmuseum])

        # Resolve collision - new institution only (published entries stay fixed)
        resolved = collision_detector.resolve_collisions([jhm])

        # Rijksmuseum GHCID should be unchanged (PID stability)
        assert rijksmuseum.ghcid == "NL-NH-AMS-M-RM"

        # JHM should have GHCID (may have Q-number if collision occurred)
        assert resolved[0].ghcid is not None

        # If collision occurred, only JHM gets Q-number, and the change must
        # be recorded in its GHCID history.
        if '-Q' in resolved[0].ghcid:
            assert 'Q924335' in resolved[0].ghcid
            assert resolved[0].ghcid_history is not None
            assert len(resolved[0].ghcid_history) >= 1
|
|
|
|
|
|
class TestRealCrossDatasetIntegration:
    """
    Test integration between ISIL registry and Dutch organizations CSV

    Both datasets contain overlapping institutions - test collision
    resolution when merging datasets.
    """

    def test_cross_dataset_collision_detection(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """Detect collisions when merging ISIL registry with Dutch orgs CSV"""
        # Parse ISIL registry; the first 50 records keep the test fast while
        # still covering several cities.
        isil_records = isil_parser.parse_file(isil_csv_path)
        isil_institutions = [
            isil_parser.to_heritage_custodian(r)
            for r in isil_records[:50]
        ]

        # NOTE: merging with the Dutch orgs CSV (voorbeeld_lijst_organisaties)
        # is not implemented yet; this test currently only exercises collision
        # detection within the ISIL dataset.
        # FIX: removed the unused local `extraction_date_orgs` that the earlier
        # version assigned but never read.
        extraction_date_isil = datetime(2025, 8, 1, 0, 0, 0, tzinfo=timezone.utc)
        for inst in isil_institutions:
            inst.provenance.extraction_date = extraction_date_isil

        # Detect collisions within ISIL dataset
        collisions = collision_detector.detect_collisions(isil_institutions)

        # Should complete without errors and return a dict of groups
        assert isinstance(collisions, dict)
|
|
|
|
|
|
class TestRealSyntheticQNumbers:
    """
    Test synthetic Q-number generation for real institutions without Wikidata IDs

    Many smaller institutions in ISIL registry lack Wikidata entries.
    Verify synthetic Q-numbers are generated correctly.
    """

    def test_synthetic_q_number_for_local_heritage_society(
        self,
        collision_detector: GHCIDCollisionDetector
    ):
        """Generate synthetic Q-number for institution without Wikidata"""
        # Real institution from ISIL registry: Heemkunde Vereniging Borne
        # No Wikidata ID available - only an ISIL identifier
        institutions = [
            HeritageCustodian(
                id="hvb-001",
                name="Heemkunde Vereniging Borne",
                institution_type=InstitutionType.COLLECTING_SOCIETY,
                identifiers=[
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value="NL-BneHVB",
                        identifier_url="https://isil.nl/NL-BneHVB"
                    )
                ],
                locations=[Location(city="Borne", country="NL")],
                provenance=Provenance(
                    data_source=DataSource.ISIL_REGISTRY,
                    data_tier=DataTier.TIER_1_AUTHORITATIVE,
                    extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                    extraction_method="ISILRegistryParser",
                    confidence_score=1.0
                )
            ),
            HeritageCustodian(
                id="gemeente-borne-001",
                name="Gemeente Borne",
                institution_type=InstitutionType.OFFICIAL_INSTITUTION,
                identifiers=[
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value="NL-BneGB",
                        identifier_url="https://isil.nl/NL-BneGB"
                    )
                ],
                locations=[Location(city="Borne", country="NL")],
                provenance=Provenance(
                    data_source=DataSource.ISIL_REGISTRY,
                    data_tier=DataTier.TIER_1_AUTHORITATIVE,
                    extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                    extraction_method="ISILRegistryParser",
                    confidence_score=1.0
                )
            )
        ]

        # Generate GHCIDs for institutions
        for inst in institutions:
            generate_ghcid_for_institution(inst)

        # Resolve collisions - both extracted same day (first batch)
        resolved = collision_detector.resolve_collisions(institutions)

        # Both should have GHCIDs
        assert all(inst.ghcid is not None for inst in resolved)

        # If collision occurred and no Wikidata, should have synthetic Q-numbers
        for inst in resolved:
            if inst.ghcid and '-Q' in inst.ghcid:
                # BUGFIX: the previous code extracted `ghcid.split('-Q')[1]`
                # and then asserted the result starts with 'Q' - impossible,
                # because the 'Q' is consumed by the '-Q' separator, so the
                # assertion could never pass. Take the last '-'-delimited
                # segment instead, which is the complete Q-number suffix.
                q_part = inst.ghcid.split('-')[-1]
                assert q_part.startswith('Q')
                assert q_part[1:].isdigit(), "Synthetic Q-number should be numeric"
|
|
|
|
|
|
class TestRealDataQualityValidation:
    """
    Validate data quality after collision resolution with real ISIL data

    Ensures:
    - All institutions retain valid ISIL codes
    - Provenance metadata preserved
    - GHCID history correctly tracks changes
    - No data loss during collision resolution
    """

    def test_data_integrity_after_collision_resolution(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """Verify data integrity after resolving collisions"""
        # Parse sample of ISIL registry (first 30 records keeps the test fast)
        records = isil_parser.parse_file(isil_csv_path)
        institutions = [
            isil_parser.to_heritage_custodian(r)
            for r in records[:30]
        ]

        # Set extraction date (simulated batch import)
        extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        for inst in institutions:
            inst.provenance.extraction_date = extraction_date

        # Store original ISIL codes for verification after resolution.
        # BUGFIX: the previous `[...][0] if inst.identifiers else None` raised
        # IndexError whenever an institution had identifiers but none with
        # scheme "ISIL"; next(..., None) yields None in that case instead.
        original_isil_codes = {
            inst.id: next(
                (
                    ident.identifier_value
                    for ident in (inst.identifiers or [])
                    if ident.identifier_scheme == "ISIL"
                ),
                None
            )
            for inst in institutions
        }

        # Resolve collisions
        resolved = collision_detector.resolve_collisions(institutions)

        # Verify data integrity: nothing dropped during resolution
        assert len(resolved) == len(institutions), "No institutions lost"

        for inst in resolved:
            # ISIL code preserved and unchanged
            if inst.id in original_isil_codes and original_isil_codes[inst.id]:
                current_isil = [
                    ident.identifier_value
                    for ident in (inst.identifiers or [])
                    if ident.identifier_scheme == "ISIL"
                ]
                assert len(current_isil) > 0, f"ISIL code lost for {inst.name}"
                assert current_isil[0] == original_isil_codes[inst.id], "ISIL code changed"

            # Provenance preserved
            assert inst.provenance is not None
            assert inst.provenance.data_source == DataSource.ISIL_REGISTRY
            assert inst.provenance.data_tier == DataTier.TIER_1_AUTHORITATIVE

            # If GHCID assigned, the numeric companion field must be set too
            if inst.ghcid:
                assert inst.ghcid_numeric is not None, f"Missing ghcid_numeric for {inst.name}"
|