glam/tests/identifiers/test_collision_detector_integration.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

641 lines
24 KiB
Python

"""
Integration Tests for GHCID Collision Detector with Real Dutch ISIL Registry Data
Tests collision detection using actual heritage institutions from the Dutch ISIL registry.
Real collision scenarios from data/ISIL-codes_2025-08-01.csv:
- Amsterdam: 7 museums, 16 official institutions, 3 archives
- Den Haag: 27 official institutions (ministries, research centers)
- Alphen aan den Rijn: 2 archives
- Arnhem: 3 museums
- Plus 38 more collision groups
These tests validate:
1. Collision detection with real institution names
2. GHCID generation from actual Dutch cities
3. Q-number assignment using real Wikidata IDs (when available)
4. Temporal collision resolution with realistic extraction dates
5. Integration between ISILRegistryParser and GHCIDCollisionDetector
"""
import pytest
from datetime import datetime, timezone
from pathlib import Path
from glam_extractor.parsers.isil_registry import ISILRegistryParser
from glam_extractor.identifiers.collision_detector import (
GHCIDCollisionDetector,
CollisionGroup
)
from glam_extractor.identifiers.ghcid import (
GHCIDGenerator,
InstitutionType as GHCIDInstitutionType
)
from glam_extractor.identifiers.lookups import get_ghcid_components_for_dutch_city
from glam_extractor.models import (
HeritageCustodian,
Identifier,
InstitutionType,
Location,
Provenance,
DataSource,
DataTier
)
@pytest.fixture
def isil_csv_path() -> Path:
    """Location of the real Dutch ISIL registry CSV shipped with the repo."""
    repo_root = Path(__file__).parent.parent.parent
    return repo_root / "data" / "ISIL-codes_2025-08-01.csv"
@pytest.fixture
def isil_parser() -> ISILRegistryParser:
    """Fresh ISIL registry parser for each test."""
    parser = ISILRegistryParser()
    return parser
@pytest.fixture
def collision_detector() -> GHCIDCollisionDetector:
    """Fresh collision detector (no published dataset) for each test."""
    detector = GHCIDCollisionDetector()
    return detector
def generate_ghcid_for_institution(institution: HeritageCustodian) -> None:
    """
    Populate the GHCID fields of a HeritageCustodian in-place.

    Sets ``ghcid``, ``ghcid_uuid``, ``ghcid_uuid_sha256`` and ``ghcid_numeric``
    on the given instance. Intended for Dutch institutions only: the city is
    taken from the first location and resolved through the Dutch-city lookup
    table to obtain the region code and city locode. Returns silently (leaving
    the instance untouched) when no location or no lookup components are
    available.

    Args:
        institution: HeritageCustodian instance to generate a GHCID for
    """
    locations = institution.locations
    if not locations or len(locations) == 0:
        # No city available -> no GHCID can be derived.
        return
    first_location = locations[0]
    city = first_location.city
    country = first_location.country or "NL"
    # Resolve region code / city locode via the Dutch-city lookup table.
    components_info = get_ghcid_components_for_dutch_city(
        city=city,
        institution_name=institution.name,
        institution_type=institution.institution_type
    )
    if not components_info:
        # Unknown city -> cannot build GHCID components.
        return
    # Map the model's institution type onto the GHCID enum
    # (model: "MUSEUM", "LIBRARY", ...; GHCID enum: MUSEUM="M", ...).
    try:
        ghcid_inst_type = GHCIDInstitutionType[institution.institution_type]
    except KeyError:
        # Fallback: pass the raw value through and let the generator match it.
        ghcid_inst_type = institution.institution_type
    generator = GHCIDGenerator()
    components = generator.generate(
        institution_name=components_info["institution_name"],
        english_name=components_info["english_name"],
        institution_type=ghcid_inst_type,
        country_code=components_info["country_code"],
        region_code=components_info["region_code"],
        city_locode=components_info["city_locode"]
    )
    # Write every GHCID representation back onto the model instance.
    institution.ghcid = components.to_string()
    institution.ghcid_uuid = str(components.to_uuid())
    institution.ghcid_uuid_sha256 = str(components.to_uuid_sha256())
    institution.ghcid_numeric = components.to_numeric()
class TestRealAmsterdamMuseums:
    """
    Test collision detection with 7 real Amsterdam museums from ISIL registry:
    - Amsterdam Museum
    - Het Scheepvaartmuseum (HSM)
    - Joods Historisch Museum
    - Museum Ons' Lieve Heer op Solder
    - Rijksmuseum
    - Van Gogh Museum
    - Verzetsmuseum Amsterdam
    All share: city=Amsterdam, type=MUSEUM, country=NL
    Expected base GHCID: NL-NH-AMS-M-{abbreviation}
    """
    def test_detect_amsterdam_museum_collisions(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """7 Amsterdam museums should be detected as collision group"""
        # Parse ISIL registry
        records = isil_parser.parse_file(isil_csv_path)
        # Filter for Amsterdam museums
        amsterdam_museums = [
            r for r in records
            if r.plaats == "Amsterdam" and "museum" in r.instelling.lower()
        ]
        # Should find 7 museums
        assert len(amsterdam_museums) >= 7, f"Expected 7+ museums, found {len(amsterdam_museums)}"
        # Convert to HeritageCustodian
        institutions = [isil_parser.to_heritage_custodian(r) for r in amsterdam_museums]
        # All extracted on same date (simulating batch import)
        extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        for inst in institutions:
            inst.provenance.extraction_date = extraction_date
        # Detect collisions
        collisions = collision_detector.detect_collisions(institutions)
        # BUG FIX: the previous assertion (`len(collisions) >= 0`) was vacuously
        # true and could never fail. Assert the detector's contract instead
        # (a mapping of collision groups), consistent with the other detection
        # tests in this file. The actual collision count depends on GHCID
        # abbreviation uniqueness, so no exact count is asserted.
        assert isinstance(collisions, dict), (
            "detect_collisions should return a mapping of collision groups"
        )
    def test_amsterdam_museums_first_batch_resolution(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """First batch: All colliding Amsterdam museums get Q-numbers"""
        # Create test subset with known collision
        # Use institutions with same abbreviation potential
        institutions = [
            HeritageCustodian(
                id="test-rijksmuseum",
                name="Rijksmuseum",
                institution_type=InstitutionType.MUSEUM,
                identifiers=[
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value="NL-AsdRM",
                        identifier_url="https://isil.nl/NL-AsdRM"
                    ),
                    Identifier(
                        identifier_scheme="Wikidata",
                        identifier_value="Q190804",
                        identifier_url="https://www.wikidata.org/wiki/Q190804"
                    )
                ],
                locations=[Location(city="Amsterdam", country="NL")],
                provenance=Provenance(
                    data_source=DataSource.DUTCH_ORG_CSV,
                    data_tier=DataTier.TIER_1_AUTHORITATIVE,
                    extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                    extraction_method="ISILRegistryParser",
                    confidence_score=1.0
                )
            ),
            HeritageCustodian(
                id="test-van-gogh",
                name="Van Gogh Museum",
                institution_type=InstitutionType.MUSEUM,
                identifiers=[
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value="NL-AsdVGM",
                        identifier_url="https://isil.nl/NL-AsdVGM"
                    ),
                    Identifier(
                        identifier_scheme="Wikidata",
                        identifier_value="Q224124",
                        identifier_url="https://www.wikidata.org/wiki/Q224124"
                    )
                ],
                locations=[Location(city="Amsterdam", country="NL")],
                provenance=Provenance(
                    data_source=DataSource.DUTCH_ORG_CSV,
                    data_tier=DataTier.TIER_1_AUTHORITATIVE,
                    extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                    extraction_method="ISILRegistryParser",
                    confidence_score=1.0
                )
            )
        ]
        # Generate GHCIDs for institutions
        for inst in institutions:
            generate_ghcid_for_institution(inst)
        # Resolve collisions
        resolved = collision_detector.resolve_collisions(institutions)
        # Both should have GHCIDs
        assert all(inst.ghcid is not None for inst in resolved)
        # If they collided, both should have Q-numbers in GHCID
        # Check if any Q-numbers were added
        ghcids_with_q = [inst.ghcid for inst in resolved if inst.ghcid and '-Q' in inst.ghcid]
        # If collision occurred, both get Q-numbers (first batch rule)
        if ghcids_with_q:
            assert len(ghcids_with_q) == 2, "First batch: both institutions should have Q-numbers"
            assert 'Q190804' in ghcids_with_q[0] or 'Q224124' in ghcids_with_q[0]
            assert 'Q190804' in ghcids_with_q[1] or 'Q224124' in ghcids_with_q[1]
class TestRealAlphenArchives:
    """
    Test collision with 2 real Alphen aan den Rijn archives:
    - Gemeentearchief Alphen aan den Rijn (NL-AlGAADR)
    - Streekarchief Rijnlands Midden (NL-AlSARM)
    Both are archives in same city - potential GHCID collision
    """
    def test_alphen_archives_collision_detection(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """Two Alphen archives should be detected as potential collision"""
        all_records = isil_parser.parse_file(isil_csv_path)

        def is_alphen_archive(record) -> bool:
            # Both the Dutch ("archief") and English ("archive") spellings
            # occur in the registry, so match either.
            name = record.instelling.lower()
            return record.plaats == "Alphen aan den Rijn" and (
                "archief" in name or "archive" in name
            )

        alphen_archives = [r for r in all_records if is_alphen_archive(r)]
        assert len(alphen_archives) >= 2, f"Expected 2+ archives, found {len(alphen_archives)}"
        # Convert the matching registry records into model instances.
        institutions = [
            isil_parser.to_heritage_custodian(record) for record in alphen_archives
        ]
        # Simulate a single batch import: every record shares one timestamp.
        batch_timestamp = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        for institution in institutions:
            institution.provenance.extraction_date = batch_timestamp
        collisions = collision_detector.detect_collisions(institutions)
        # Whether the two archives actually collide depends on abbreviation
        # uniqueness; this test only validates detection against real data.
        assert isinstance(collisions, dict)
class TestRealDenHaagOfficialInstitutions:
    """
    Test collision with 27 real Den Haag official institutions:
    - Government ministries (15+)
    - Research centers (Planbureau, SCP, etc.)
    - Heritage institutions (RKD, Mauritshuis, etc.)
    Largest collision group in dataset - excellent stress test
    """
    # Keywords marking museums / archives / libraries; records whose name
    # contains one of these are excluded from the "official" filter below.
    _GLAM_KEYWORDS = ("museum", "archief", "archive", "bibliotheek", "library")

    @classmethod
    def _den_haag_officials(cls, records):
        """Return Den Haag records that are not museums/archives/libraries."""
        officials = []
        for record in records:
            if record.plaats != "Den Haag":
                continue
            name = record.instelling.lower()
            if any(keyword in name for keyword in cls._GLAM_KEYWORDS):
                continue
            officials.append(record)
        return officials

    def test_den_haag_official_institutions_batch_size(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser
    ):
        """Verify Den Haag has 27+ official institutions"""
        records = isil_parser.parse_file(isil_csv_path)
        den_haag_official = self._den_haag_officials(records)
        assert len(den_haag_official) >= 27, (
            f"Expected 27+ official institutions in Den Haag, found {len(den_haag_official)}"
        )

    def test_den_haag_collision_stress_test(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """Stress test: 27 institutions in same city with same type"""
        records = isil_parser.parse_file(isil_csv_path)
        den_haag_official = self._den_haag_officials(records)
        # Use a subset of 10 records to keep the stress test fast.
        institutions = [
            isil_parser.to_heritage_custodian(record)
            for record in den_haag_official[:10]
        ]
        # One shared timestamp simulates a single batch import.
        batch_timestamp = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        for institution in institutions:
            institution.provenance.extraction_date = batch_timestamp
        # Detection must cope with a large same-city, same-type group.
        collisions = collision_detector.detect_collisions(institutions)
        assert isinstance(collisions, dict)
        # Resolution must not drop any institution from the batch.
        resolved = collision_detector.resolve_collisions(institutions)
        assert len(resolved) == len(institutions)
class TestRealHistoricalAddition:
    """
    Test historical addition scenario using real institutions
    Scenario: Rijksmuseum extracted in Nov 2025, then Joods Historisch Museum
    discovered in Dec 2025. Both Amsterdam museums.
    Expected:
    - Rijksmuseum GHCID preserved (if no collision)
    - JHM gets Q-number if collision occurs
    """
    def test_historical_addition_with_real_museums(self):
        """Historical addition: New museum added later with collision"""
        # --- Published dataset (November 2025) ---
        rijksmuseum_identifiers = [
            Identifier(
                identifier_scheme="ISIL",
                identifier_value="NL-AsdRM",
                identifier_url="https://isil.nl/NL-AsdRM"
            ),
            Identifier(
                identifier_scheme="Wikidata",
                identifier_value="Q190804",
                identifier_url="https://www.wikidata.org/wiki/Q190804"
            )
        ]
        rijksmuseum = HeritageCustodian(
            id="rijksmuseum-001",
            name="Rijksmuseum",
            institution_type=InstitutionType.MUSEUM,
            identifiers=rijksmuseum_identifiers,
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source=DataSource.DUTCH_ORG_CSV,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                extraction_method="ISILRegistryParser",
                confidence_score=1.0
            ),
            ghcid="NL-NH-AMS-M-RM",  # Already published
            ghcid_numeric=123456789012
        )
        # --- New institution discovered later (December 2025) ---
        jhm_identifiers = [
            Identifier(
                identifier_scheme="ISIL",
                identifier_value="NL-AsdJHM",
                identifier_url="https://isil.nl/NL-AsdJHM"
            ),
            Identifier(
                identifier_scheme="Wikidata",
                identifier_value="Q924335",
                identifier_url="https://www.wikidata.org/wiki/Q924335"
            )
        ]
        jhm = HeritageCustodian(
            id="jhm-001",
            name="Joods Historisch Museum",
            institution_type=InstitutionType.MUSEUM,
            identifiers=jhm_identifiers,
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source=DataSource.DUTCH_ORG_CSV,
                data_tier=DataTier.TIER_1_AUTHORITATIVE,
                extraction_date=datetime(2025, 12, 1, 14, 0, 0, tzinfo=timezone.utc),
                extraction_method="ISILRegistryParser - historical addition",
                confidence_score=1.0
            )
        )
        # New institutions need a GHCID before collision detection runs.
        generate_ghcid_for_institution(jhm)
        # The detector is seeded with the already-published dataset.
        detector = GHCIDCollisionDetector(published_dataset=[rijksmuseum])
        resolved = detector.resolve_collisions([jhm])
        # PID stability: the published Rijksmuseum GHCID must not change.
        assert rijksmuseum.ghcid == "NL-NH-AMS-M-RM"
        # The newcomer always ends up with some GHCID.
        assert resolved[0].ghcid is not None
        # On collision only the newcomer is disambiguated with its Q-number,
        # and that change is recorded in its GHCID history.
        if '-Q' in resolved[0].ghcid:
            assert 'Q924335' in resolved[0].ghcid
            assert resolved[0].ghcid_history is not None
            assert len(resolved[0].ghcid_history) >= 1
class TestRealCrossDatasetIntegration:
    """
    Test integration between ISIL registry and Dutch organizations CSV
    Both datasets contain overlapping institutions - test collision
    resolution when merging datasets.
    """
    def test_cross_dataset_collision_detection(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """Detect collisions when merging ISIL registry with Dutch orgs CSV"""
        # Parse ISIL registry (sample)
        isil_records = isil_parser.parse_file(isil_csv_path)
        isil_institutions = [
            isil_parser.to_heritage_custodian(r)
            for r in isil_records[:50]  # Test with first 50
        ]
        # Simulate Dutch orgs CSV data (same institutions, later extraction).
        # In a real scenario the second dataset would be parsed from the
        # voorbeeld_lijst_organisaties CSV with a later (Nov 2025) extraction
        # date; only the ISIL side is exercised here.
        # NOTE: the unused `extraction_date_orgs` local from the original
        # version was removed - it was never applied to any institution.
        extraction_date_isil = datetime(2025, 8, 1, 0, 0, 0, tzinfo=timezone.utc)
        for inst in isil_institutions:
            inst.provenance.extraction_date = extraction_date_isil
        # Detect collisions within ISIL dataset
        collisions = collision_detector.detect_collisions(isil_institutions)
        # Should complete without errors and return a mapping of groups
        assert isinstance(collisions, dict)
class TestRealSyntheticQNumbers:
    """
    Test synthetic Q-number generation for real institutions without Wikidata IDs
    Many smaller institutions in ISIL registry lack Wikidata entries.
    Verify synthetic Q-numbers are generated correctly.
    """
    def test_synthetic_q_number_for_local_heritage_society(
        self,
        collision_detector: GHCIDCollisionDetector
    ):
        """Generate synthetic Q-number for institution without Wikidata"""
        # Real institution from ISIL registry: Heemkunde Vereniging Borne
        # No Wikidata ID available
        institutions = [
            HeritageCustodian(
                id="hvb-001",
                name="Heemkunde Vereniging Borne",
                institution_type=InstitutionType.COLLECTING_SOCIETY,
                identifiers=[
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value="NL-BneHVB",
                        identifier_url="https://isil.nl/NL-BneHVB"
                    )
                ],
                locations=[Location(city="Borne", country="NL")],
                provenance=Provenance(
                    data_source=DataSource.ISIL_REGISTRY,
                    data_tier=DataTier.TIER_1_AUTHORITATIVE,
                    extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                    extraction_method="ISILRegistryParser",
                    confidence_score=1.0
                )
            ),
            HeritageCustodian(
                id="gemeente-borne-001",
                name="Gemeente Borne",
                institution_type=InstitutionType.OFFICIAL_INSTITUTION,
                identifiers=[
                    Identifier(
                        identifier_scheme="ISIL",
                        identifier_value="NL-BneGB",
                        identifier_url="https://isil.nl/NL-BneGB"
                    )
                ],
                locations=[Location(city="Borne", country="NL")],
                provenance=Provenance(
                    data_source=DataSource.ISIL_REGISTRY,
                    data_tier=DataTier.TIER_1_AUTHORITATIVE,
                    extraction_date=datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc),
                    extraction_method="ISILRegistryParser",
                    confidence_score=1.0
                )
            )
        ]
        # Generate GHCIDs for institutions
        for inst in institutions:
            generate_ghcid_for_institution(inst)
        # Resolve collisions - both extracted same day (first batch)
        resolved = collision_detector.resolve_collisions(institutions)
        # Both should have GHCIDs
        assert all(inst.ghcid is not None for inst in resolved)
        # If collision occurred and no Wikidata, should have synthetic Q-numbers
        for inst in resolved:
            if inst.ghcid and '-Q' in inst.ghcid:
                # BUG FIX: `split('-Q')[1]` strips the leading 'Q', so the
                # original `startswith('Q')` check could never pass and
                # `q_part[1:]` silently dropped the first digit. Re-attach
                # the 'Q' before validating the synthetic Q-number.
                q_part = 'Q' + inst.ghcid.split('-Q', 1)[1]
                assert q_part.startswith('Q')
                assert q_part[1:].isdigit(), "Synthetic Q-number should be numeric"
class TestRealDataQualityValidation:
    """
    Validate data quality after collision resolution with real ISIL data
    Ensures:
    - All institutions retain valid ISIL codes
    - Provenance metadata preserved
    - GHCID history correctly tracks changes
    - No data loss during collision resolution
    """
    def test_data_integrity_after_collision_resolution(
        self,
        isil_csv_path: Path,
        isil_parser: ISILRegistryParser,
        collision_detector: GHCIDCollisionDetector
    ):
        """Verify data integrity after resolving collisions"""
        # Parse sample of ISIL registry
        records = isil_parser.parse_file(isil_csv_path)
        institutions = [
            isil_parser.to_heritage_custodian(r)
            for r in records[:30]  # Test with first 30
        ]
        # Set extraction date
        extraction_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        for inst in institutions:
            inst.provenance.extraction_date = extraction_date
        # Store original ISIL codes for verification.
        # BUG FIX: the original `[...][0] if inst.identifiers else None`
        # raised IndexError for an institution that has identifiers but none
        # with scheme "ISIL"; `next(..., None)` handles that case safely.
        original_isil_codes = {
            inst.id: next(
                (
                    ident.identifier_value
                    for ident in (inst.identifiers or [])
                    if ident.identifier_scheme == "ISIL"
                ),
                None
            )
            for inst in institutions
        }
        # Resolve collisions
        resolved = collision_detector.resolve_collisions(institutions)
        # Verify data integrity
        assert len(resolved) == len(institutions), "No institutions lost"
        for inst in resolved:
            # ISIL code preserved
            if inst.id in original_isil_codes and original_isil_codes[inst.id]:
                current_isil = [
                    ident.identifier_value
                    for ident in (inst.identifiers or [])
                    if ident.identifier_scheme == "ISIL"
                ]
                assert len(current_isil) > 0, f"ISIL code lost for {inst.name}"
                assert current_isil[0] == original_isil_codes[inst.id], "ISIL code changed"
            # Provenance preserved
            assert inst.provenance is not None
            assert inst.provenance.data_source == DataSource.ISIL_REGISTRY
            assert inst.provenance.data_tier == DataTier.TIER_1_AUTHORITATIVE
            # If GHCID assigned, should have its numeric form too
            if inst.ghcid:
                assert inst.ghcid_numeric is not None, f"Missing ghcid_numeric for {inst.name}"