- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
812 lines
30 KiB
Python
812 lines
30 KiB
Python
"""
|
|
Unit tests for GHCID collision detection and resolution.

This module tests the GHCIDCollisionDetector class that implements temporal
collision resolution for Global Heritage Custodian Identifiers (GHCIDs).

Test Coverage:
- First batch collision resolution (all get Q-numbers)
- Historical addition collision resolution (only new gets Q-number)
- Q-number assignment (Wikidata preferred, synthetic fallback)
- GHCID history tracking with temporal validity
- PID stability guarantees (published GHCIDs never modified)

References:
- Implementation: src/glam_extractor/identifiers/collision_detector.py
- Specification: docs/PERSISTENT_IDENTIFIERS.md
- Algorithm: docs/plan/global_glam/07-ghcid-collision-resolution.md
|
|
"""
|
|
|
|
import pytest
|
|
from datetime import datetime, timezone
|
|
|
|
from glam_extractor.identifiers.collision_detector import (
|
|
GHCIDCollisionDetector,
|
|
CollisionGroup
|
|
)
|
|
from glam_extractor.models import HeritageCustodian, Identifier, Provenance, Location
|
|
|
|
|
|
class TestFirstBatchCollision:
    """
    First-batch collisions: same base GHCID, same extraction timestamp.

    When several institutions discovered in one run (identical
    extraction_date) share a base GHCID, these tests expect that:

    - every colliding institution receives a Q-number suffix
    - none of them keeps the bare base GHCID, because none came first
    """

    def test_two_institutions_same_base_ghcid_same_date(self):
        """
        A pair of same-instant institutions sharing one base GHCID.

        Expected: each one ends up with its own Q-number appended.
        """
        # Given: two museums extracted at exactly the same moment
        batch_time = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        shared_ghcid = "NL-NH-AMS-M-SM"

        stedelijk = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/stedelijk-ams",
            name="Stedelijk Museum Amsterdam",
            ghcid=shared_ghcid,
            ghcid_numeric=123456789012,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q621531"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=batch_time,
                extraction_method="AI agent NER",
            ),
        )
        science = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/science-ams",
            name="Science Museum Amsterdam",
            ghcid=shared_ghcid,
            ghcid_numeric=987654321098,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q98765432"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=batch_time,  # identical timestamp on purpose
                extraction_method="AI agent NER",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        resolved = detector.resolve_collisions([stedelijk, science])

        # Then: nothing is dropped and both GHCIDs carry a Q-number
        assert len(resolved) == 2
        final_ghcids = {custodian.ghcid for custodian in resolved}
        assert "NL-NH-AMS-M-SM-Q621531" in final_ghcids
        assert "NL-NH-AMS-M-SM-Q98765432" in final_ghcids

        # ...and each institution records the base -> suffixed transition
        q_suffixes = ('-Q621531', '-Q98765432')
        for custodian in resolved:
            history = custodian.ghcid_history
            assert history is not None
            assert len(history) == 2

            # Newest entry first: the suffixed GHCID, still open-ended
            newest = history[0]
            assert newest.ghcid.endswith(q_suffixes)
            assert newest.valid_to is None
            assert "first batch collision" in newest.reason

            # Older entry: the bare GHCID, closed at the extraction instant
            superseded = history[1]
            assert superseded.ghcid == "NL-NH-AMS-M-SM"
            assert superseded.valid_to == batch_time

    def test_three_institutions_same_base_ghcid_same_date(self):
        """
        A trio of same-instant institutions sharing one base GHCID.

        Expected: all three are suffixed with their Q-numbers.
        """
        # Given: three museums that differ only in name, QID and numeric id
        batch_time = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        labels = ["Museum A", "Museum B", "Museum C"]
        qids = ["Q111111", "Q222222", "Q333333"]

        institutions = [
            HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/nl/museum-{index}",
                name=label,
                ghcid="NL-NH-UTR-M-HM",
                ghcid_numeric=100000000000 + index,
                institution_type="MUSEUM",
                identifiers=[
                    Identifier(identifier_scheme="Wikidata", identifier_value=qid),
                ],
                locations=[Location(city="Utrecht", country="NL")],
                provenance=Provenance(
                    data_source="CONVERSATION_NLP",
                    data_tier="TIER_4_INFERRED",
                    extraction_date=batch_time,
                    extraction_method="AI agent NER",
                ),
            )
            for index, (label, qid) in enumerate(zip(labels, qids))
        ]
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        resolved = detector.resolve_collisions(institutions)

        # Then: three in, three out, each with its own suffix
        assert len(resolved) == 3
        final_ghcids = {custodian.ghcid for custodian in resolved}
        assert "NL-NH-UTR-M-HM-Q111111" in final_ghcids
        assert "NL-NH-UTR-M-HM-Q222222" in final_ghcids
        assert "NL-NH-UTR-M-HM-Q333333" in final_ghcids

        # Every institution has a two-step history whose head is still open
        for custodian in resolved:
            assert len(custodian.ghcid_history) == 2
            assert custodian.ghcid_history[0].valid_to is None

    def test_first_batch_collision_uses_synthetic_qnumber_when_no_wikidata(self):
        """
        Same-instant collision between institutions without Wikidata ids.

        Expected: each gets a synthetic, purely numeric Q-number equal to
        ``ghcid_numeric % 100000000``.
        """
        # Given: two archives with empty identifier lists
        batch_time = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)

        archive_a = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/archive-1",
            name="Archive A",
            ghcid="NL-NH-AMS-A-AA",
            ghcid_numeric=123456789012,  # seed for the synthetic Q-number
            institution_type="ARCHIVE",
            identifiers=[],  # deliberately no Wikidata entry
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=batch_time,
                extraction_method="CSV parser",
            ),
        )
        archive_b = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/archive-2",
            name="Archive B",
            ghcid="NL-NH-AMS-A-AA",
            ghcid_numeric=987654321098,  # different seed, different Q-number
            institution_type="ARCHIVE",
            identifiers=[],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=batch_time,
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        resolved = detector.resolve_collisions([archive_a, archive_b])

        # Then
        assert len(resolved) == 2

        # The final segment of every GHCID is "Q" followed by digits only
        for custodian in resolved:
            assert custodian.ghcid.startswith("NL-NH-AMS-A-AA-Q")
            suffix = custodian.ghcid.rsplit('-', 1)[-1]
            assert suffix.startswith('Q')
            assert suffix[1:].isdigit()

        # The suffix is reproducible from the numeric id alone
        synthetic_a = f"Q{archive_a.ghcid_numeric % 100000000}"
        synthetic_b = f"Q{archive_b.ghcid_numeric % 100000000}"

        final_ghcids = {custodian.ghcid for custodian in resolved}
        assert f"NL-NH-AMS-A-AA-{synthetic_a}" in final_ghcids
        assert f"NL-NH-AMS-A-AA-{synthetic_b}" in final_ghcids

    def test_detect_collisions_identifies_first_batch(self):
        """
        Detection step classifies a same-instant collision correctly.

        Expected: one CollisionGroup whose collision_type is "FIRST_BATCH".
        """
        # Given: two libraries with one base GHCID and one timestamp
        batch_time = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)

        library_one = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/lib-1",
            name="Library 1",
            ghcid="NL-NH-AMS-L-LB",
            ghcid_numeric=111111111111,
            institution_type="LIBRARY",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=batch_time,
                extraction_method="CSV parser",
            ),
        )
        library_two = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/lib-2",
            name="Library 2",
            ghcid="NL-NH-AMS-L-LB",
            ghcid_numeric=222222222222,
            institution_type="LIBRARY",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=batch_time,  # identical timestamp
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        collisions = detector.detect_collisions([library_one, library_two])

        # Then: exactly one group, keyed by the shared base GHCID
        assert len(collisions) == 1
        assert "NL-NH-AMS-L-LB" in collisions

        group = collisions["NL-NH-AMS-L-LB"]
        assert group.collision_type == "FIRST_BATCH"
        assert group.base_ghcid == "NL-NH-AMS-L-LB"
        assert len(group.institutions) == 2
        assert group.earliest_extraction_date == batch_time
|
|
|
|
|
|
class TestHistoricalAdditionCollision:
    """
    Historical-addition collisions: a newcomer clashes with an older GHCID.

    When an institution shows up AFTER another one already holds the same
    base GHCID, these tests expect that:

    - the earlier institution keeps its bare GHCID (PID stability)
    - only the newcomer is given a Q-number suffix
    """

    def test_new_institution_collides_with_published_ghcid(self):
        """
        A late arrival clashes with an already published base GHCID.

        Expected:
        - the published GHCID is left exactly as it was
        - the late arrival is suffixed with its Q-number
        """
        # Given
        first_seen = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)
        later_seen = datetime(2025, 11, 15, 14, 30, 0, tzinfo=timezone.utc)

        # Already in the published dataset, holding the bare GHCID
        hermitage = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/hermitage-ams",
            name="Hermitage Amsterdam",
            ghcid="NL-NH-AMS-M-HM",  # bare GHCID: it was published first
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=first_seen,
                extraction_method="CSV parser",
            ),
        )

        # Found two weeks later, mapping to the very same base GHCID
        historical = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/historical-ams",
            name="Historical Museum Amsterdam",
            ghcid="NL-NH-AMS-M-HM",  # clashes with the published one
            ghcid_numeric=200000000000,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q17339437"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=later_seen,  # the later timestamp
                extraction_method="AI agent NER",
            ),
        )

        # The detector is seeded with the published record
        detector = GHCIDCollisionDetector(published_dataset=[hermitage])

        # When
        resolved = detector.resolve_collisions([historical])

        # Then
        assert len(resolved) == 1
        newcomer = resolved[0]

        # The newcomer carries the suffix...
        assert newcomer.ghcid == "NL-NH-AMS-M-HM-Q17339437"

        # ...while the published identifier is untouched
        assert hermitage.ghcid == "NL-NH-AMS-M-HM"

        # The newcomer's history documents the transition
        assert len(newcomer.ghcid_history) == 2

        # Head of the history: suffixed GHCID, open-ended, naming the incumbent
        newest = newcomer.ghcid_history[0]
        assert newest.ghcid == "NL-NH-AMS-M-HM-Q17339437"
        assert newest.valid_to is None
        assert "collision with existing" in newest.reason
        assert "Hermitage Amsterdam" in newest.reason

        # Tail of the history: bare GHCID, closed at the newcomer's timestamp
        superseded = newcomer.ghcid_history[1]
        assert superseded.ghcid == "NL-NH-AMS-M-HM"
        assert superseded.valid_to == later_seen

    def test_multiple_new_institutions_collide_with_same_published_ghcid(self):
        """
        Several late arrivals, one after another, clash with one published GHCID.

        Expected: each late arrival is suffixed; the published GHCID never moves.
        """
        # Given
        first_seen = datetime(2025, 11, 1, 10, 0, 0, tzinfo=timezone.utc)

        # The incumbent
        published_inst = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/museum-pub",
            name="Published Museum",
            ghcid="NL-UT-UTR-S-HK",
            ghcid_numeric=100000000000,
            institution_type="COLLECTING_SOCIETY",
            locations=[Location(city="Utrecht", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=first_seen,
                extraction_method="CSV parser",
            ),
        )

        # Two newcomers with distinct, later extraction dates
        mid_november_arrival = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/society-1",
            name="Historical Society 1",
            ghcid="NL-UT-UTR-S-HK",
            ghcid_numeric=200000000000,
            institution_type="COLLECTING_SOCIETY",
            identifiers=[Identifier(identifier_scheme="Wikidata", identifier_value="Q111111")],
            locations=[Location(city="Utrecht", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=datetime(2025, 11, 15, tzinfo=timezone.utc),
                extraction_method="AI agent NER",
            ),
        )
        december_arrival = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/society-2",
            name="Historical Society 2",
            ghcid="NL-UT-UTR-S-HK",
            ghcid_numeric=300000000000,
            institution_type="COLLECTING_SOCIETY",
            identifiers=[Identifier(identifier_scheme="Wikidata", identifier_value="Q222222")],
            locations=[Location(city="Utrecht", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=datetime(2025, 12, 1, tzinfo=timezone.utc),
                extraction_method="AI agent NER",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[published_inst])

        # When: each newcomer is resolved in its own call, mimicking
        # discoveries that happen at different points in time
        first_result = detector.resolve_collisions([mid_november_arrival])
        second_result = detector.resolve_collisions([december_arrival])

        # Then
        assert first_result[0].ghcid == "NL-UT-UTR-S-HK-Q111111"
        assert second_result[0].ghcid == "NL-UT-UTR-S-HK-Q222222"

        # The incumbent still holds the bare GHCID
        assert published_inst.ghcid == "NL-UT-UTR-S-HK"

    def test_detect_collisions_identifies_historical_addition(self):
        """
        Detection step classifies a different-date collision correctly.

        Expected: a CollisionGroup whose collision_type is "HISTORICAL_ADDITION".
        """
        # Given: two galleries, same base GHCID, two weeks apart
        earlier = datetime(2025, 11, 1, tzinfo=timezone.utc)
        later = datetime(2025, 11, 15, tzinfo=timezone.utc)

        gallery_one = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/inst-1",
            name="Institution 1",
            ghcid="NL-NH-AMS-G-GA",
            ghcid_numeric=111111111111,
            institution_type="GALLERY",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=earlier,
                extraction_method="CSV parser",
            ),
        )
        gallery_two = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/inst-2",
            name="Institution 2",
            ghcid="NL-NH-AMS-G-GA",
            ghcid_numeric=222222222222,
            institution_type="GALLERY",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=later,
                extraction_method="AI agent NER",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        collisions = detector.detect_collisions([gallery_one, gallery_two])

        # Then: one group, typed as a historical addition, dated by the elder
        assert len(collisions) == 1
        group = collisions["NL-NH-AMS-G-GA"]
        assert group.collision_type == "HISTORICAL_ADDITION"
        assert group.earliest_extraction_date == earlier
|
|
|
|
|
|
class TestQNumberAssignment:
    """Q-number selection: a Wikidata QID wins, otherwise a synthetic one is used."""

    def test_wikidata_qnumber_preferred_over_synthetic(self):
        """A Wikidata QID on the institution takes priority over the synthetic value."""
        # Given: an institution carrying both an ISIL code and a Wikidata QID
        seen_at = datetime(2025, 11, 1, tzinfo=timezone.utc)

        rijksmuseum = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/rijksmuseum",
            name="Rijksmuseum",
            ghcid="NL-NH-AMS-M-RM",
            # Under the modulo rule asserted in the synthetic-fallback test
            # below, this numeric id would yield Q56789012.
            ghcid_numeric=123456789012,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdRM"),
                # This is the identifier the detector is expected to pick.
                Identifier(identifier_scheme="Wikidata", identifier_value="Q190804"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=seen_at,
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        assigned = detector._assign_qnumber(rijksmuseum)

        # Then: the real QID, not the modulo-derived one
        assert assigned == "Q190804"

    def test_synthetic_qnumber_when_no_wikidata(self):
        """Without a Wikidata QID the Q-number is derived from ghcid_numeric."""
        # Given: an institution whose only identifier is an ISIL code
        local_archive = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/local-archive",
            name="Local Archive",
            ghcid="NL-NH-AMS-A-LA",
            ghcid_numeric=123456789012,
            institution_type="ARCHIVE",
            identifiers=[
                # ISIL only; there is intentionally no Wikidata entry
                Identifier(identifier_scheme="ISIL", identifier_value="NL-AsdLA"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        assigned = detector._assign_qnumber(local_archive)

        # Then: "Q" + the numeric id modulo 100000000
        synthetic = f"Q{local_archive.ghcid_numeric % 100000000}"
        assert assigned == synthetic

    def test_extract_wikidata_qid_normalizes_format(self):
        """Extracted Wikidata QIDs always come back with the leading "Q"."""
        # Given: the same QID stored once with and once without its prefix
        prefixed = HeritageCustodian(
            id="https://example.org/1",
            name="Museum 1",
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q621531"),
            ],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        unprefixed = HeritageCustodian(
            id="https://example.org/2",
            name="Museum 2",
            institution_type="MUSEUM",
            identifiers=[
                # Bare digits: the "Q" is missing on purpose
                Identifier(identifier_scheme="Wikidata", identifier_value="621531"),
            ],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        from_prefixed = detector._extract_wikidata_qid(prefixed)
        from_unprefixed = detector._extract_wikidata_qid(unprefixed)

        # Then: both spellings normalise to the same QID
        assert from_prefixed == "Q621531"
        assert from_unprefixed == "Q621531"
|
|
|
|
|
|
class TestGHCIDHistoryTracking:
    """GHCID history entries: what gets recorded and over which validity window."""

    def test_ghcid_history_tracks_collision_resolution(self):
        """Resolution leaves a two-entry history: suffixed GHCID first, base GHCID second."""
        # Given: a published museum and a newcomer with the same base GHCID
        newcomer_seen = datetime(2025, 11, 15, 14, 30, 0, tzinfo=timezone.utc)

        incumbent = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/pub",
            name="Published Institution",
            ghcid="NL-NH-AMS-M-PM",
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        newcomer = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/new",
            name="New Museum",
            ghcid="NL-NH-AMS-M-PM",
            ghcid_numeric=200000000000,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q12345"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=newcomer_seen,
                extraction_method="AI agent NER",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[incumbent])

        # When
        resolved = detector.resolve_collisions([newcomer])

        # Then
        result = resolved[0]
        assert len(result.ghcid_history) == 2

        # Index 0: the suffixed GHCID, valid from extraction and still open,
        # with the institution's name and location copied in
        newest = result.ghcid_history[0]
        assert newest.ghcid == "NL-NH-AMS-M-PM-Q12345"
        assert newest.valid_from == newcomer_seen
        assert newest.valid_to is None
        assert newest.institution_name == "New Museum"
        assert newest.location_city == "Amsterdam"
        assert newest.location_country == "NL"

        # Index 1: the bare GHCID, opened and closed at the same instant
        superseded = result.ghcid_history[1]
        assert superseded.ghcid == "NL-NH-AMS-M-PM"
        assert superseded.valid_from == newcomer_seen
        assert superseded.valid_to == newcomer_seen

    def test_ghcid_history_handles_missing_location(self):
        """An institution with no locations yields "Unknown" city and country."""
        # Given: an institution with an empty locations list
        placeless = HeritageCustodian(
            id="https://example.org/inst",
            name="Unknown Location Museum",
            ghcid="XX-XX-XXX-M-UM",
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[],  # nothing to copy into the history entry
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="AI agent NER",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When: a history entry is built directly through the private helper
        entry = detector._create_ghcid_history_entry(
            institution=placeless,
            ghcid="XX-XX-XXX-M-UM",
            valid_from=datetime(2025, 11, 1, tzinfo=timezone.utc),
            valid_to=None,
            reason="Test entry",
        )

        # Then: both location fields fall back to the placeholder
        assert entry.location_city == "Unknown"
        assert entry.location_country == "Unknown"
|
|
|
|
|
|
class TestPIDStabilityGuarantees:
    """PID stability: a GHCID that has been published is never rewritten."""

    def test_published_ghcids_never_modified(self):
        """A collision with a newcomer leaves the published GHCID as it was."""
        # Given: a published museum...
        published = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/stable",
            name="Stable Museum",
            ghcid="NL-NH-AMS-M-SM",
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        ghcid_before = published.ghcid

        # ...and a later newcomer mapping to the same base GHCID
        challenger = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/new-collision",
            name="Colliding Museum",
            ghcid="NL-NH-AMS-M-SM",  # same base GHCID as the published one
            ghcid_numeric=200000000000,
            institution_type="MUSEUM",
            identifiers=[
                Identifier(identifier_scheme="Wikidata", identifier_value="Q99999"),
            ],
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="CONVERSATION_NLP",
                data_tier="TIER_4_INFERRED",
                extraction_date=datetime(2025, 11, 15, tzinfo=timezone.utc),
                extraction_method="AI agent NER",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[published])

        # When: only the side effect on `published` matters here
        detector.resolve_collisions([challenger])

        # Then: same value as before, and no suffix was attached
        assert published.ghcid == ghcid_before
        assert published.ghcid == "NL-NH-AMS-M-SM"

    def test_no_collision_when_ghcid_already_has_qnumber(self):
        """A lone institution whose GHCID already ends in a Q-number is left alone."""
        # Given: a single institution with a pre-suffixed GHCID
        already_suffixed = HeritageCustodian(
            id="https://w3id.org/heritage/custodian/nl/with-q",
            name="Museum with Q-number",
            ghcid="NL-NH-AMS-M-MQ-Q621531",  # suffix is already present
            ghcid_numeric=100000000000,
            institution_type="MUSEUM",
            locations=[Location(city="Amsterdam", country="NL")],
            provenance=Provenance(
                data_source="DUTCH_ORG_CSV",
                data_tier="TIER_1_AUTHORITATIVE",
                extraction_date=datetime(2025, 11, 1, tzinfo=timezone.utc),
                extraction_method="CSV parser",
            ),
        )
        detector = GHCIDCollisionDetector(published_dataset=[])

        # When
        resolved = detector.resolve_collisions([already_suffixed])

        # Then: the GHCID comes back verbatim
        assert resolved[0].ghcid == "NL-NH-AMS-M-MQ-Q621531"
|
|
|
|
|
|
if __name__ == "__main__":
    # Direct execution: hand this file to pytest with verbose output.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)
|