- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
287 lines
11 KiB
Python
287 lines
11 KiB
Python
"""
Test UUID v5 generation for GHCID persistent identifiers.

This test suite validates that GHCID supports three interoperable identifier formats:
1. UUID v5 (128-bit) - Primary for Europeana, DPLA, IIIF, Wikidata integration
2. Numeric (64-bit) - For database primary keys and CSV exports
3. Human-readable - For citations and references
"""
|
||
|
||
import uuid
|
||
import pytest
|
||
from glam_extractor.identifiers.ghcid import (
|
||
GHCIDComponents,
|
||
GHCIDGenerator,
|
||
InstitutionType,
|
||
GHCID_NAMESPACE,
|
||
)
|
||
|
||
|
||
class TestGHCIDUUIDGeneration:
    """Test UUID v5 generation from GHCID strings.

    Every test builds one or more ``GHCIDComponents`` and inspects the value
    returned by ``to_uuid()``; no fixtures or shared state are involved.
    """

    def test_uuid_deterministic(self):
        """UUID v5 should be deterministic - same input produces same UUID."""
        components1 = GHCIDComponents(
            country_code="US",
            region_code="CA",
            city_locode="SAN",
            institution_type="A",
            abbreviation="IA",
        )
        components2 = GHCIDComponents(
            country_code="US",
            region_code="CA",
            city_locode="SAN",
            institution_type="A",
            abbreviation="IA",
        )

        uuid1 = components1.to_uuid()
        uuid2 = components2.to_uuid()

        assert uuid1 == uuid2
        assert isinstance(uuid1, uuid.UUID)
        assert uuid1.version == 5  # RFC 4122 UUID v5

    def test_uuid_unique_per_ghcid(self):
        """Different GHCID strings should produce different UUIDs."""
        rijksmuseum = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        stedelijk = GHCIDComponents("NL", "NH", "AMS", "M", "SM")

        uuid_rm = rijksmuseum.to_uuid()
        uuid_sm = stedelijk.to_uuid()

        assert uuid_rm != uuid_sm
        assert str(uuid_rm) != str(uuid_sm)

    def test_uuid_format_rfc4122(self):
        """UUID should be valid RFC 4122 format.

        Checks the textual layout ``xxxxxxxx-xxxx-5xxx-yxxx-xxxxxxxxxxxx``:
        group lengths, lowercase hex alphabet, version nibble and variant
        nibble.
        """
        components = GHCIDComponents("BR", "RJ", "RIO", "L", "BNB")
        result_uuid = components.to_uuid()

        uuid_str = str(result_uuid)
        parts = uuid_str.split('-')

        # Five groups of 8-4-4-4-12 characters.
        assert [len(part) for part in parts] == [8, 4, 4, 4, 12]
        # Length alone would accept arbitrary text, so pin the alphabet too.
        assert all(c in '0123456789abcdef' for part in parts for c in part)
        assert parts[2][0] == '5'  # Version 5
        # The "y" nibble carries the RFC 4122 variant bits (binary 10xx).
        assert parts[3][0] in '89ab'
        assert result_uuid.variant == uuid.RFC_4122

    def test_uuid_collision_resolver_changes_uuid(self):
        """Adding Wikidata QID (collision resolver) should produce different UUID."""
        base = GHCIDComponents("NL", "NH", "AMS", "M", "SM")
        with_qid = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")

        uuid_base = base.to_uuid()
        uuid_qid = with_qid.to_uuid()

        assert uuid_base != uuid_qid
        assert base.to_string() == "NL-NH-AMS-M-SM"
        assert with_qid.to_string() == "NL-NH-AMS-M-SM-Q924335"

    def test_uuid_namespace_constant(self):
        """GHCID namespace should be consistent across all UUIDs."""
        components = GHCIDComponents("JP", "13", "TOK", "M", "TNM")

        # Manually generate UUID v5 with same namespace
        ghcid_str = components.to_string()
        expected_uuid = uuid.uuid5(GHCID_NAMESPACE, ghcid_str)

        assert components.to_uuid() == expected_uuid

    def test_uuid_string_representation(self):
        """UUID should have standard string representation."""
        components = GHCIDComponents("MX", "CMX", "MEX", "M", "MNA")
        result_uuid = components.to_uuid()

        # Should be lowercase hex with hyphens.  A bare ``islower()`` check
        # would accept any lowercase text, so test the exact alphabet.
        uuid_str = str(result_uuid)
        assert all(c in '0123456789abcdef-' for c in uuid_str)
        assert uuid_str.count('-') == 4
        # The text form must parse back to the same UUID.
        assert uuid.UUID(uuid_str) == result_uuid
|
||
|
||
|
||
class TestThreeIdentifierFormats:
    """Verify the UUID, numeric and human-readable renderings of a GHCID."""

    def test_all_three_formats_generated(self):
        """Each rendering is produced with the expected type and shape."""
        ghcid = GHCIDComponents("CL", "RM", "SAN", "M", "MNBA")

        as_uuid = ghcid.to_uuid()
        as_number = ghcid.to_numeric()
        as_text = ghcid.to_string()

        # Type of every rendering.
        assert isinstance(as_uuid, uuid.UUID)
        assert isinstance(as_number, int)
        assert isinstance(as_text, str)

        # Canonical UUID text is 32 hex digits plus 4 hyphens.
        canonical = str(as_uuid)
        assert len(canonical) == 36

        # The numeric form must fit in an unsigned 64-bit integer.
        assert as_number >= 0
        assert as_number < 2**64

        # The readable form is the hyphen-joined component string.
        assert as_text == "CL-RM-SAN-M-MNBA"

    def test_all_formats_deterministic(self):
        """Two equal component sets yield equal renderings in every format."""
        first = GHCIDComponents("AR", "C", "BUE", "M", "MALBA")
        second = GHCIDComponents("AR", "C", "BUE", "M", "MALBA")

        for render in ("to_uuid", "to_numeric", "to_string"):
            assert getattr(first, render)() == getattr(second, render)()

    def test_uuid_is_primary_for_interoperability(self):
        """Document the design decision that UUID v5 is the primary identifier.

        The stated motivation is RFC 4122 compliance, a 128-bit identifier
        and compatibility with Europeana, DPLA, IIIF and Wikidata.
        """
        primary = GHCIDComponents("NL", "NH", "AMS", "M", "RM").to_uuid()

        assert primary.version == 5
        assert isinstance(primary, uuid.UUID)

        # Every alternative view of the UUID must be non-empty / non-zero:
        # string, hex without hyphens, 128-bit integer and 16 raw bytes.
        views = (str(primary), primary.hex, primary.int, primary.bytes)
        for view in views:
            assert view
|
||
|
||
|
||
class TestUUIDCollisionResistance:
    """Check that distinct GHCIDs do not share a UUID v5."""

    def test_no_collisions_in_similar_institutions(self):
        """Institutions with similar names should have unique UUIDs."""
        abbreviations = (
            "MMA",   # Met Museum
            "MOMA",  # MoMA
            "AMN",   # American Museum Natural History
            "GM",    # Guggenheim
            "WCM",   # Whitney
        )

        generated = [
            GHCIDComponents("US", "NY", "NEW", "M", abbreviation).to_uuid()
            for abbreviation in abbreviations
        ]

        # A set drops duplicates, so equal sizes mean every UUID is distinct.
        assert len(set(generated)) == len(generated)

    def test_collision_probability_negligible(self):
        """For 1M institutions, collision probability should be negligible."""
        # The 1M figure is the design target; this test only samples 1000
        # synthetic identifiers as a quick sanity check.
        #
        # A UUID is 128 bits wide, of which RFC 4122 fixes 6 (version and
        # variant), leaving 122 hash-derived bits.  The birthday bound for
        # n = 1,000,000 is therefore about n² / (2·2^122) ≈ 10^-25.
        sample_size = 1000

        distinct = {
            GHCIDComponents("XX", "XX", "XXX", "M", f"INST{index:04d}").to_uuid()
            for index in range(sample_size)
        }

        # All should be unique
        assert len(distinct) == 1000
|
||
|
||
|
||
class TestInteroperabilityScenarios:
    """Test real-world interoperability scenarios.

    Each scenario embeds the value returned by ``to_uuid()`` in the kind of
    identifier an external system would carry, then recovers the UUID from
    that identifier and compares it with the original.  Asserting only on the
    literal prefix of a locally built string could never fail, so the checks
    round-trip through ``uuid.UUID`` instead.
    """

    def test_europeana_compatible(self):
        """UUID format compatible with Europeana data model."""
        # Europeana uses UUIDs for digital objects
        components = GHCIDComponents("IT", "RM", "ROM", "M", "VG")  # Vatican Museums
        result_uuid = components.to_uuid()

        # Europeana expects standard UUID format
        assert isinstance(result_uuid, uuid.UUID)
        assert result_uuid.version == 5

        # Can be serialized to RDF/JSON-LD; the hand-built URN must equal the
        # standard library's own URN rendering of the same UUID.
        europeana_id = f"urn:uuid:{result_uuid}"
        assert europeana_id == result_uuid.urn

    def test_dpla_compatible(self):
        """UUID format compatible with DPLA aggregation."""
        components = GHCIDComponents("US", "DC", "WAS", "L", "LOC")  # Library of Congress
        result_uuid = components.to_uuid()

        assert isinstance(result_uuid, uuid.UUID)
        assert result_uuid.version == 5

        # DPLA uses URIs with UUIDs; the last path segment must parse back
        # to the identifier we started from.
        dpla_uri = f"https://dp.la/api/items/{result_uuid}"
        item_id = dpla_uri.rsplit("/", 1)[-1]
        assert uuid.UUID(item_id) == result_uuid

    def test_iiif_manifest_identifier(self):
        """UUID can be used in IIIF manifest identifiers."""
        components = GHCIDComponents("FR", "IDF", "PAR", "M", "LOU")  # Louvre
        result_uuid = components.to_uuid()

        assert isinstance(result_uuid, uuid.UUID)

        # IIIF manifest ID format: the UUID is the segment directly before
        # "manifest.json".
        iiif_id = f"https://iiif.example.org/manifests/{result_uuid}/manifest.json"
        manifest_segment = iiif_id.split("/")[-2]
        assert uuid.UUID(manifest_segment) == result_uuid

    def test_wikidata_external_id(self):
        """UUID can be stored as Wikidata external identifier."""
        components = GHCIDComponents("GB", "ENG", "LON", "M", "BM")  # British Museum
        result_uuid = components.to_uuid()

        assert isinstance(result_uuid, uuid.UUID)

        # Wikidata can store UUIDs as qualifier values
        wikidata_statement = {
            "property": "P12345",  # Hypothetical GHCID property
            "value": str(result_uuid),
            "type": "external-id"
        }
        # The stored text must parse back to the original UUID.
        assert uuid.UUID(wikidata_statement["value"]) == result_uuid
|
||
|
||
|
||
class TestBackwardCompatibility:
    """Guard the numeric and human-readable GHCID forms alongside the UUID."""

    def test_numeric_still_available(self):
        """The numeric form is still produced and fits in unsigned 64 bits."""
        legacy_id = GHCIDComponents("NL", "NH", "AMS", "M", "RM").to_numeric()

        assert isinstance(legacy_id, int)
        assert legacy_id >= 0
        assert legacy_id < 2**64

    def test_human_readable_still_available(self):
        """The human-readable form is still the hyphen-joined components."""
        readable = GHCIDComponents("NL", "NH", "AMS", "M", "RM").to_string()

        assert readable == "NL-NH-AMS-M-RM"

    def test_all_formats_consistent(self):
        """UUID and numeric forms are both derived from the GHCID string."""
        import hashlib

        components = GHCIDComponents("JP", "13", "TOK", "M", "TNM")
        canonical = components.to_string()

        # UUID: a version-5 UUID of the canonical string in the GHCID namespace.
        assert components.to_uuid() == uuid.uuid5(GHCID_NAMESPACE, canonical)

        # Numeric: the first 8 bytes of the SHA-256 digest of the canonical
        # string, read as an unsigned big-endian integer.
        digest = hashlib.sha256(canonical.encode('utf-8')).digest()
        leading_bytes = digest[:8]
        assert components.to_numeric() == int.from_bytes(leading_bytes, 'big')
|
||
|
||
|
||
if __name__ == "__main__":
    # Allow running this module directly.  pytest.main() returns the exit
    # status of the run; raise it as SystemExit so a failing run produces a
    # non-zero process exit code instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|