glam/tests/identifiers/test_collision_detector.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

812 lines
30 KiB
Python

"""
Unit tests for GHCID collision detection and resolution.
This module tests the GHCIDCollisionDetector class that implements temporal
collision resolution for Global Heritage Custodian Identifiers (GHCIDs).
Test Coverage:
- First batch collision resolution (all get Q-numbers)
- Historical addition collision resolution (only new gets Q-number)
- Q-number assignment (Wikidata preferred, synthetic fallback)
- GHCID history tracking with temporal validity
- PID stability guarantees (published GHCIDs never modified)
References:
- Implementation: src/glam_extractor/identifiers/collision_detector.py
- Specification: docs/PERSISTENT_IDENTIFIERS.md
- Algorithm: docs/plan/global_glam/07-ghcid-collision-resolution.md
"""
import pytest
from datetime import datetime, timezone
from glam_extractor.identifiers.collision_detector import (
GHCIDCollisionDetector,
CollisionGroup
)
from glam_extractor.models import HeritageCustodian, Identifier, Provenance, Location
class TestFirstBatchCollision:
    """
    First-batch collisions: institutions discovered together share a base GHCID.

    All records in these tests carry the same extraction_date, so none of them
    has temporal precedence. The tests therefore expect every colliding record
    to end up with a Q-number suffix.
    """

    def test_two_institutions_same_base_ghcid_same_date(self) -> None:
        """
        Resolve two same-date institutions that share one base GHCID.

        Expected: each one gets its own Wikidata Q-number appended and a
        two-entry GHCID history (current entry first, base entry second).
        """
        # Arrange
        batch_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        # Both records use the same provenance values, including the date.
        nlp_provenance = {
            "data_source": "CONVERSATION_NLP",
            "data_tier": "TIER_4_INFERRED",
            "extraction_date": batch_date,
            "extraction_method": "AI agent NER",
        }
        stedelijk_museum = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/stedelijk-ams",
            name="Stedelijk Museum Amsterdam",
            ghcid="NL-NH-AMS-M-SM",
            ghcid_numeric=123456789012,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q621531"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(**nlp_provenance),
        )
        science_museum = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/science-ams",
            name="Science Museum Amsterdam",
            ghcid="NL-NH-AMS-M-SM",
            ghcid_numeric=987654321098,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q98765432"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(**nlp_provenance),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        outcome = detector.resolve_collisions([stedelijk_museum, science_museum])

        # Assert
        assert len(outcome) == 2

        # Each record must carry its own Wikidata suffix.
        resolved_ids = {custodian.ghcid for custodian in outcome}
        assert "NL-NH-AMS-M-SM-Q621531" in resolved_ids
        assert "NL-NH-AMS-M-SM-Q98765432" in resolved_ids

        # Each record must document the transition in its history.
        accepted_suffixes = ('-Q621531', '-Q98765432')
        for custodian in outcome:
            history = custodian.ghcid_history
            assert history is not None
            assert len(history) == 2

            # Index 0 is the current (suffixed) identifier.
            newest = history[0]
            assert newest.ghcid.endswith(accepted_suffixes)
            assert newest.valid_to is None
            assert "first batch collision" in newest.reason

            # Index 1 is the base identifier, closed at the extraction date.
            oldest = history[1]
            assert oldest.ghcid == "NL-NH-AMS-M-SM"
            assert oldest.valid_to == batch_date

    def test_three_institutions_same_base_ghcid_same_date(self) -> None:
        """
        Resolve three same-date institutions that share one base GHCID.

        Expected: all three receive a Q-number suffix.
        """
        # Arrange
        batch_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        wikidata_ids = ["Q111111", "Q222222", "Q333333"]
        names = ["Museum A", "Museum B", "Museum C"]
        batch = [
            HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/nl/museum-{position}",
                name=museum_name,
                ghcid="NL-NH-UTR-M-HM",
                ghcid_numeric=100000000000 + position,
                institution_type="MUSEUM",
                identifiers=[
                    Identifier(identifier_scheme="Wikidata", identifier_value=wikidata_id),
                ],
                locations=[Location(city="Utrecht", country="NL")],
                provenance=Provenance(
                    data_source="CONVERSATION_NLP",
                    data_tier="TIER_4_INFERRED",
                    extraction_date=batch_date,
                    extraction_method="AI agent NER",
                ),
            )
            for position, (museum_name, wikidata_id) in enumerate(zip(names, wikidata_ids))
        ]
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        outcome = detector.resolve_collisions(batch)

        # Assert
        assert len(outcome) == 3

        resolved_ids = {custodian.ghcid for custodian in outcome}
        assert "NL-NH-UTR-M-HM-Q111111" in resolved_ids
        assert "NL-NH-UTR-M-HM-Q222222" in resolved_ids
        assert "NL-NH-UTR-M-HM-Q333333" in resolved_ids

        for custodian in outcome:
            assert len(custodian.ghcid_history) == 2
            # The first history entry is still open-ended, i.e. current.
            assert custodian.ghcid_history[0].valid_to is None

    def test_first_batch_collision_uses_synthetic_qnumber_when_no_wikidata(self) -> None:
        """
        Resolve a first-batch collision between records without Wikidata IDs.

        Expected: each suffix is a synthetic Q-number equal to
        ``ghcid_numeric % 100000000``.
        """
        # Arrange
        batch_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        csv_provenance = {
            "data_source": "DUTCH_ORG_CSV",
            "data_tier": "TIER_1_AUTHORITATIVE",
            "extraction_date": batch_date,
            "extraction_method": "CSV parser",
        }
        archive_a = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/archive-1",
            name="Archive A",
            ghcid="NL-NH-AMS-A-AA",
            ghcid_numeric=123456789012,  # Source of the synthetic Q-number
            institution_type="ARCHIVE",
            identifiers=[],  # Deliberately no Wikidata identifier
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(**csv_provenance),
        )
        archive_b = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/archive-2",
            name="Archive B",
            ghcid="NL-NH-AMS-A-AA",
            ghcid_numeric=987654321098,  # Different numeric value
            institution_type="ARCHIVE",
            identifiers=[],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(**csv_provenance),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        outcome = detector.resolve_collisions([archive_a, archive_b])

        # Assert
        assert len(outcome) == 2

        for custodian in outcome:
            assert custodian.ghcid.startswith("NL-NH-AMS-A-AA-Q")
            # The last dash-separated segment is the Q-number.
            suffix = custodian.ghcid.rsplit('-', 1)[-1]
            assert suffix.startswith('Q')
            assert suffix[1:].isdigit()  # Synthetic Q-numbers are purely numeric

        # The suffix must be reproducible from ghcid_numeric alone.
        expected_a = f"Q{archive_a.ghcid_numeric % 100000000}"
        expected_b = f"Q{archive_b.ghcid_numeric % 100000000}"
        resolved_ids = {custodian.ghcid for custodian in outcome}
        assert f"NL-NH-AMS-A-AA-{expected_a}" in resolved_ids
        assert f"NL-NH-AMS-A-AA-{expected_b}" in resolved_ids

    def test_detect_collisions_identifies_first_batch(self) -> None:
        """
        Detect a collision between two same-date records.

        Expected: one CollisionGroup with collision_type="FIRST_BATCH".
        """
        # Arrange
        batch_date = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        csv_provenance = {
            "data_source": "DUTCH_ORG_CSV",
            "data_tier": "TIER_1_AUTHORITATIVE",
            "extraction_date": batch_date,
            "extraction_method": "CSV parser",
        }
        library_one = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/lib-1",
            name="Library 1",
            ghcid="NL-NH-AMS-L-LB",
            ghcid_numeric=111111111111,
            institution_type="LIBRARY",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(**csv_provenance),
        )
        library_two = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/lib-2",
            name="Library 2",
            ghcid="NL-NH-AMS-L-LB",
            ghcid_numeric=222222222222,
            institution_type="LIBRARY",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(**csv_provenance),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        detected = detector.detect_collisions([library_one, library_two])

        # Assert
        assert len(detected) == 1
        assert "NL-NH-AMS-L-LB" in detected

        group = detected["NL-NH-AMS-L-LB"]
        assert group.collision_type == "FIRST_BATCH"
        assert group.base_ghcid == "NL-NH-AMS-L-LB"
        assert len(group.institutions) == 2
        assert group.earliest_extraction_date == batch_date
class TestHistoricalAdditionCollision:
    """
    Historical-addition collisions: a new record clashes with an older GHCID.

    The tests expect the earlier institution to keep its base GHCID untouched
    (PID stability) while only the newcomer receives a Q-number suffix.
    """

    def test_new_institution_collides_with_published_ghcid(self) -> None:
        """
        Resolve a newcomer whose base GHCID is already published.

        Expected: the published GHCID stays as-is; the newcomer gets its
        Wikidata Q-number appended plus a two-entry history.
        """
        # Arrange
        first_published = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        discovered_later = datetime(2025, 11, 15, 14, 30, 0, tzinfo=timezone.utc)

        # Already part of the published dataset, so it holds the bare base GHCID.
        hermitage = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/hermitage-ams",
            name="Hermitage Amsterdam",
            ghcid="NL-NH-AMS-M-HM",
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=first_published,
                extraction_method="CSV parser",
            ),
        )

        # Extracted two weeks later with the very same base GHCID.
        newcomer = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/historical-ams",
            name="Historical Museum Amsterdam",
            ghcid="NL-NH-AMS-M-HM",
            ghcid_numeric=200000000000,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q17339437"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=discovered_later,
                extraction_method="AI agent NER",
            ),
        )

        detector = GHCIDCollisionDetector(published_dataset=[hermitage])

        # Act
        outcome = detector.resolve_collisions([newcomer])

        # Assert
        assert len(outcome) == 1
        resolved_newcomer = outcome[0]

        # Only the newcomer is suffixed.
        assert resolved_newcomer.ghcid == "NL-NH-AMS-M-HM-Q17339437"
        # The published identifier is left alone.
        assert hermitage.ghcid == "NL-NH-AMS-M-HM"

        assert len(resolved_newcomer.ghcid_history) == 2

        # Index 0: the suffixed identifier, still open-ended.
        newest = resolved_newcomer.ghcid_history[0]
        assert newest.ghcid == "NL-NH-AMS-M-HM-Q17339437"
        assert newest.valid_to is None
        assert "collision with existing" in newest.reason
        assert "Hermitage Amsterdam" in newest.reason

        # Index 1: the base identifier, closed at the newcomer's extraction date.
        oldest = resolved_newcomer.ghcid_history[1]
        assert oldest.ghcid == "NL-NH-AMS-M-HM"
        assert oldest.valid_to == discovered_later

    def test_multiple_new_institutions_collide_with_same_published_ghcid(self) -> None:
        """
        Resolve two newcomers, one call each, against one published GHCID.

        Expected: every newcomer gets its own Q-number; the published record
        is never altered.
        """
        # Arrange
        first_published = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)

        incumbent = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/museum-pub",
            name="Published Museum",
            ghcid="NL-UT-UTR-S-HK",
            ghcid_numeric=100000000000,
            institution_type="COLLECTING_SOCIETY",
            locations=[Location(city="Utrecht", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=first_published,
                extraction_method="CSV parser",
            ),
        )

        # The two newcomers carry different, later extraction dates.
        mid_november_society = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/society-1",
            name="Historical Society 1",
            ghcid="NL-UT-UTR-S-HK",
            ghcid_numeric=200000000000,
            institution_type="COLLECTING_SOCIETY",
            identifiers=[Identifier(identifier_scheme="Wikidata", identifier_value="Q111111")],
            locations=[Location(city="Utrecht", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=datetime(2025, 11, 15, tzinfo=timezone.utc),
                extraction_method="AI agent NER",
            ),
        )
        december_society = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/society-2",
            name="Historical Society 2",
            ghcid="NL-UT-UTR-S-HK",
            ghcid_numeric=300000000000,
            institution_type="COLLECTING_SOCIETY",
            identifiers=[Identifier(identifier_scheme="Wikidata", identifier_value="Q222222")],
            locations=[Location(city="Utrecht", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=datetime(2025, 12, 1, tzinfo=timezone.utc),
                extraction_method="AI agent NER",
            ),
        )

        detector = GHCIDCollisionDetector(published_dataset=[incumbent])

        # Act - one call per newcomer, to mimic discoveries spread over time
        first_outcome = detector.resolve_collisions([mid_november_society])
        second_outcome = detector.resolve_collisions([december_society])

        # Assert
        assert first_outcome[0].ghcid == "NL-UT-UTR-S-HK-Q111111"
        assert second_outcome[0].ghcid == "NL-UT-UTR-S-HK-Q222222"
        assert incumbent.ghcid == "NL-UT-UTR-S-HK"

    def test_detect_collisions_identifies_historical_addition(self) -> None:
        """
        Detect a collision between records with different extraction dates.

        Expected: one CollisionGroup with collision_type="HISTORICAL_ADDITION"
        whose earliest_extraction_date is the older of the two dates.
        """
        # Arrange
        older_date = datetime(2025, 11, 1, tzinfo=timezone.utc)
        newer_date = datetime(2025, 11, 15, tzinfo=timezone.utc)

        earlier_gallery = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/inst-1",
            name="Institution 1",
            ghcid="NL-NH-AMS-G-GA",
            ghcid_numeric=111111111111,
            institution_type="GALLERY",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=older_date,
                extraction_method="CSV parser",
            ),
        )
        later_gallery = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/inst-2",
            name="Institution 2",
            ghcid="NL-NH-AMS-G-GA",
            ghcid_numeric=222222222222,
            institution_type="GALLERY",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=newer_date,
                extraction_method="AI agent NER",
            ),
        )

        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        detected = detector.detect_collisions([earlier_gallery, later_gallery])

        # Assert
        assert len(detected) == 1
        group = detected["NL-NH-AMS-G-GA"]
        assert group.collision_type == "HISTORICAL_ADDITION"
        assert group.earliest_extraction_date == older_date
class TestQNumberAssignment:
    """Q-number selection: a Wikidata QID wins, otherwise a synthetic one is used."""

    def test_wikidata_qnumber_preferred_over_synthetic(self) -> None:
        """A record with a Wikidata identifier must be assigned that QID."""
        # Arrange
        extracted_on = datetime(2025, 11, 1, tzinfo=timezone.utc)
        rijksmuseum = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/rijksmuseum",
            name="Rijksmuseum",
            ghcid="NL-NH-AMS-M-RM",
            # With the modulo used by the synthetic-fallback test in this class
            # (ghcid_numeric % 100000000), the fallback would be Q56789012.
            ghcid_numeric=123456789012,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdRM"),
                # This is the identifier the detector is expected to pick.
                Identifier(identifier_scheme="Wikidata", identifier_value="Q190804"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=extracted_on,
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        assigned = detector._assign_qnumber(rijksmuseum)

        # Assert - the real Wikidata QID, not a synthetic value
        assert assigned == "Q190804"

    def test_synthetic_qnumber_when_no_wikidata(self) -> None:
        """A record without a Wikidata identifier must get a synthetic Q-number."""
        # Arrange
        local_archive = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/local-archive",
            name="Local Archive",
            ghcid="NL-NH-AMS-A-LA",
            ghcid_numeric=123456789012,
            institution_type="ARCHIVE",
            identifiers=[
                # An ISIL code only; no Wikidata entry is present.
                Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdLA"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        assigned = detector._assign_qnumber(local_archive)

        # Assert
        synthetic = f"Q{local_archive.ghcid_numeric % 100000000}"
        assert assigned == synthetic

    def test_extract_wikidata_qid_normalizes_format(self) -> None:
        """Extracted Wikidata QIDs must carry a Q prefix even if the raw value lacks one."""
        # Arrange
        csv_provenance = {
            "data_source": "DUTCH_ORG_CSV",
            "data_tier": "TIER_1_AUTHORITATIVE",
            "extraction_date": datetime(2025, 11, 1, tzinfo=timezone.utc),
            "extraction_method": "CSV parser",
        }
        prefixed = HeritageCustodian(
            id="https://example.org/1",
            name="Museum 1",
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q621531"),
            ],
            provenance=Provenance(**csv_provenance),
        )
        unprefixed = HeritageCustodian(
            id="https://example.org/2",
            name="Museum 2",
            institution_type="MUSEUM",
            identifiers=[
                # Raw value stored without the leading "Q".
                Identifier(identifier_scheme="Wikidata", identifier_value="621531"),
            ],
            provenance=Provenance(**csv_provenance),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        qid_from_prefixed = detector._extract_wikidata_qid(prefixed)
        qid_from_unprefixed = detector._extract_wikidata_qid(unprefixed)

        # Assert - both inputs yield the same normalized QID
        assert qid_from_prefixed == "Q621531"
        assert qid_from_unprefixed == "Q621531"
class TestGHCIDHistoryTracking:
    """GHCID history entries: what they record and how validity periods are set."""

    def test_ghcid_history_tracks_collision_resolution(self) -> None:
        """Resolution must leave a history of the base GHCID and its suffixed successor."""
        # Arrange
        discovered_at = datetime(2025, 11, 15, 14, 30, 0, tzinfo=timezone.utc)

        incumbent = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/pub",
            name="Published Institution",
            ghcid="NL-NH-AMS-M-PM",
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        newcomer = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/new",
            name="New Museum",
            ghcid="NL-NH-AMS-M-PM",
            ghcid_numeric=200000000000,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q12345"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=discovered_at,
                extraction_method="AI agent NER",
            ),
        )

        detector = GHCIDCollisionDetector(published_dataset=[incumbent])

        # Act
        outcome = detector.resolve_collisions([newcomer])

        # Assert
        resolved_newcomer = outcome[0]
        assert len(resolved_newcomer.ghcid_history) == 2

        # Index 0: the suffixed identifier, open-ended, with descriptive fields.
        newest = resolved_newcomer.ghcid_history[0]
        assert newest.ghcid == "NL-NH-AMS-M-PM-Q12345"
        assert newest.valid_from == discovered_at
        assert newest.valid_to is None
        assert newest.institution_name == "New Museum"
        assert newest.location_city == "Amsterdam"
        assert newest.location_country == "NL"

        # Index 1: the base identifier, opened and closed at the same instant.
        oldest = resolved_newcomer.ghcid_history[1]
        assert oldest.ghcid == "NL-NH-AMS-M-PM"
        assert oldest.valid_from == discovered_at
        assert oldest.valid_to == discovered_at

    def test_ghcid_history_handles_missing_location(self) -> None:
        """A record without locations must produce "Unknown" city and country."""
        # Arrange
        first_of_november = datetime(2025, 11, 1, tzinfo=timezone.utc)
        placeless_museum = HeritageCustodian(
            id="https://example.org/inst",
            name="Unknown Location Museum",
            ghcid="XX-XX-XXX-M-UM",
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[],  # Intentionally empty
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=first_of_november,
                extraction_method="AI agent NER",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        entry = detector._create_ghcid_history_entry(
            institution=placeless_museum,
            ghcid="XX-XX-XXX-M-UM",
            valid_from=first_of_november,
            valid_to=None,
            reason="Test entry",
        )

        # Assert
        assert entry.location_city == "Unknown"
        assert entry.location_country == "Unknown"
class TestPIDStabilityGuarantees:
    """PID stability: identifiers that are already settled must stay unchanged."""

    def test_published_ghcids_never_modified(self) -> None:
        """Resolving a colliding newcomer must not touch the published GHCID."""
        # Arrange
        stable_museum = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/stable",
            name="Stable Museum",
            ghcid="NL-NH-AMS-M-SM",
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        # Snapshot taken before resolution so any mutation would be noticed.
        ghcid_before = stable_museum.ghcid

        colliding_museum = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/new-collision",
            name="Colliding Museum",
            ghcid="NL-NH-AMS-M-SM",  # Same base GHCID as the published record
            ghcid_numeric=200000000000,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q99999"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=datetime(2025, 11, 15, tzinfo=timezone.utc),
                extraction_method="AI agent NER",
            ),
        )

        detector = GHCIDCollisionDetector(published_dataset=[stable_museum])

        # Act
        detector.resolve_collisions([colliding_museum])

        # Assert - same value as before, and still without a Q-number
        assert stable_museum.ghcid == ghcid_before
        assert stable_museum.ghcid == "NL-NH-AMS-M-SM"

    def test_no_collision_when_ghcid_already_has_qnumber(self) -> None:
        """A lone record whose GHCID already ends in a Q-number must pass through as-is."""
        # Arrange
        suffixed_museum = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/with-q",
            name="Museum with Q-number",
            ghcid="NL-NH-AMS-M-MQ-Q621531",  # Q-number suffix already present
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # Act
        outcome = detector.resolve_collisions([suffixed_museum])

        # Assert - identifier comes back unchanged
        assert outcome[0].ghcid == "NL-NH-AMS-M-MQ-Q621531"
if __name__ == "__main__":
    # Direct execution: hand this file to pytest with verbose reporting.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)