glam/tests/identifiers/test_ghcid_uuid.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

287 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Test UUID v5 generation for GHCID persistent identifiers.
This test suite validates that GHCID supports three interoperable identifier formats:
1. UUID v5 (128-bit) - Primary for Europeana, DPLA, IIIF, Wikidata integration
2. Numeric (64-bit) - For database primary keys and CSV exports
3. Human-readable - For citations and references
"""
import uuid
import pytest
from glam_extractor.identifiers.ghcid import (
GHCIDComponents,
GHCIDGenerator,
InstitutionType,
GHCID_NAMESPACE,
)
class TestGHCIDUUIDGeneration:
    """Check the UUID v5 values returned by ``GHCIDComponents.to_uuid()``."""

    def test_uuid_deterministic(self):
        """Components built from identical fields must yield the same UUID."""
        components1 = GHCIDComponents(
            country_code="US",
            region_code="CA",
            city_locode="SAN",
            institution_type="A",
            abbreviation="IA",
        )
        components2 = GHCIDComponents(
            country_code="US",
            region_code="CA",
            city_locode="SAN",
            institution_type="A",
            abbreviation="IA",
        )

        uuid1 = components1.to_uuid()
        uuid2 = components2.to_uuid()

        assert uuid1 == uuid2
        assert isinstance(uuid1, uuid.UUID)
        assert uuid1.version == 5  # RFC 4122 UUID v5

    def test_uuid_unique_per_ghcid(self):
        """Components that differ only in abbreviation get different UUIDs."""
        rijksmuseum = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        stedelijk = GHCIDComponents("NL", "NH", "AMS", "M", "SM")

        uuid_rm = rijksmuseum.to_uuid()
        uuid_sm = stedelijk.to_uuid()

        assert uuid_rm != uuid_sm
        assert str(uuid_rm) != str(uuid_sm)

    def test_uuid_format_rfc4122(self):
        """The string form follows the RFC 4122 layout for a version 5 UUID."""
        components = GHCIDComponents("BR", "RJ", "RIO", "L", "BNB")
        result_uuid = components.to_uuid()

        # UUID v5 format: xxxxxxxx-xxxx-5xxx-yxxx-xxxxxxxxxxxx
        uuid_str = str(result_uuid)
        parts = uuid_str.split("-")

        assert len(parts) == 5
        assert len(parts[0]) == 8  # 8 hex chars
        assert len(parts[1]) == 4  # 4 hex chars
        assert len(parts[2]) == 4  # 4 hex chars (version)
        assert len(parts[3]) == 4  # 4 hex chars (variant)
        assert len(parts[4]) == 12  # 12 hex chars

        # Group lengths alone do not prove the groups are hexadecimal.
        assert all(c in "0123456789abcdef" for c in "".join(parts))

        assert parts[2][0] == "5"  # Version 5
        # The "y" nibble carries the variant; RFC 4122 UUIDs use 8, 9, a or b.
        assert parts[3][0] in "89ab"
        assert result_uuid.variant == uuid.RFC_4122

    def test_uuid_collision_resolver_changes_uuid(self):
        """Supplying ``wikidata_qid`` changes both the GHCID string and UUID."""
        base = GHCIDComponents("NL", "NH", "AMS", "M", "SM")
        with_qid = GHCIDComponents(
            "NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335"
        )

        uuid_base = base.to_uuid()
        uuid_qid = with_qid.to_uuid()

        assert uuid_base != uuid_qid
        assert base.to_string() == "NL-NH-AMS-M-SM"
        assert with_qid.to_string() == "NL-NH-AMS-M-SM-Q924335"

    def test_uuid_namespace_constant(self):
        """``to_uuid()`` equals ``uuid5(GHCID_NAMESPACE, to_string())``."""
        components = GHCIDComponents("JP", "13", "TOK", "M", "TNM")

        # Manually generate UUID v5 with same namespace
        ghcid_str = components.to_string()
        expected_uuid = uuid.uuid5(GHCID_NAMESPACE, ghcid_str)

        assert components.to_uuid() == expected_uuid

    def test_uuid_string_representation(self):
        """The string form is lowercase hexadecimal with four hyphens."""
        components = GHCIDComponents("MX", "CMX", "MEX", "M", "MNA")
        result_uuid = components.to_uuid()

        uuid_str = str(result_uuid)

        # Assert each property on its own. str.islower() is False for a
        # digits-only string and True for lowercase text that is not hex, so
        # it cannot stand in for the character-set check.
        assert uuid_str == uuid_str.lower()
        assert all(c in "0123456789abcdef-" for c in uuid_str)
        assert uuid_str.count("-") == 4
class TestThreeIdentifierFormats:
    """Verify the UUID, numeric and human-readable forms of a single GHCID."""

    def test_all_three_formats_generated(self):
        """One set of components exposes all three representations."""
        ghcid = GHCIDComponents("CL", "RM", "SAN", "M", "MNBA")

        as_uuid = ghcid.to_uuid()
        as_number = ghcid.to_numeric()
        as_text = ghcid.to_string()

        # Type of each representation.
        assert isinstance(as_uuid, uuid.UUID)
        assert isinstance(as_number, int)
        assert isinstance(as_text, str)

        # Canonical UUID text is 32 hex digits plus 4 hyphens.
        assert len(str(as_uuid)) == 36

        # The numeric form must fit in an unsigned 64-bit integer.
        uint64_limit = 1 << 64
        assert 0 <= as_number < uint64_limit

        # The human-readable form is the hyphen-joined component string.
        assert as_text == "CL-RM-SAN-M-MNBA"

    def test_all_formats_deterministic(self):
        """Equal components produce equal values in every format."""
        first = GHCIDComponents("AR", "C", "BUE", "M", "MALBA")
        second = GHCIDComponents("AR", "C", "BUE", "M", "MALBA")

        assert first.to_uuid() == second.to_uuid()
        assert first.to_numeric() == second.to_numeric()
        assert first.to_string() == second.to_string()

    def test_uuid_is_primary_for_interoperability(self):
        """Record the design choice of UUID v5 as the primary identifier.

        The stated rationale is RFC 4122 compliance, a 128-bit value, and
        compatibility with Europeana, DPLA, IIIF and Wikidata.
        """
        ghcid = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        primary_id = ghcid.to_uuid()

        assert isinstance(primary_id, uuid.UUID)
        assert primary_id.version == 5

        # Every alternative rendering must be non-empty / non-zero:
        # string, hex without hyphens, 128-bit integer, 16 raw bytes.
        renderings = (
            str(primary_id),
            primary_id.hex,
            primary_id.int,
            primary_id.bytes,
        )
        assert all(renderings)
class TestUUIDCollisionResistance:
    """Confirm that distinct GHCIDs in the sampled inputs get distinct UUIDs."""

    def test_no_collisions_in_similar_institutions(self):
        """Museums sharing country, region, city and type stay distinguishable."""
        abbreviations = (
            "MMA",   # Met Museum
            "MOMA",  # MoMA
            "AMN",   # American Museum Natural History
            "GM",    # Guggenheim
            "WCM",   # Whitney
        )
        museums = [
            GHCIDComponents("US", "NY", "NEW", "M", abbreviation)
            for abbreviation in abbreviations
        ]

        generated = [museum.to_uuid() for museum in museums]

        # Deduplicating must not shrink the collection.
        assert len(set(generated)) == len(generated)

    def test_collision_probability_negligible(self):
        """A 1000-item sample stands in for the 1M-institution estimate.

        Birthday-bound estimate for n = 1,000,000 identifiers drawn from a
        128-bit space: P(collision) ~= n**2 / (2 * 2**128) ~= 1.5e-27.
        A version 5 UUID fixes 6 of its 128 bits (version and variant), so
        the tighter figure over 122 bits is n**2 / 2**123 ~= 9.4e-26.
        Both are negligible.
        """
        sample_size = 1000

        # A set keeps one entry per distinct UUID, so its size counts them.
        distinct = {
            GHCIDComponents(
                "XX",
                "XX",
                "XXX",
                "M",
                f"INST{i:04d}",
            ).to_uuid()
            for i in range(sample_size)
        }

        assert len(distinct) == 1000
class TestInteroperabilityScenarios:
    """Embed the UUID in externally shaped identifiers and read it back.

    Each test builds the identifier string or record a consumer would
    receive, then parses the UUID out again, so the assertion depends on
    the value returned by ``to_uuid()`` rather than on a literal the test
    wrote itself.
    """

    def test_europeana_compatible(self):
        """The UUID serializes to a ``urn:uuid:`` URN and parses back."""
        components = GHCIDComponents("IT", "RM", "ROM", "M", "VG")  # Vatican Museums
        result_uuid = components.to_uuid()

        assert isinstance(result_uuid, uuid.UUID)
        assert result_uuid.version == 5

        # Can be serialized to RDF/JSON-LD
        europeana_id = f"urn:uuid:{result_uuid}"
        assert europeana_id.startswith("urn:uuid:")
        # The hand-built URN must match the standard library's rendering,
        # and uuid.UUID() accepts the URN form for the reverse direction.
        assert europeana_id == result_uuid.urn
        assert uuid.UUID(europeana_id) == result_uuid

    def test_dpla_compatible(self):
        """The UUID survives being the last path segment of an item URI."""
        components = GHCIDComponents("US", "DC", "WAS", "L", "LOC")  # Library of Congress
        result_uuid = components.to_uuid()

        dpla_uri = f"https://dp.la/api/items/{result_uuid}"
        assert "dp.la" in dpla_uri

        # Recover the identifier from the URI's final segment.
        item_segment = dpla_uri.rsplit("/", 1)[-1]
        assert uuid.UUID(item_segment) == result_uuid

    def test_iiif_manifest_identifier(self):
        """The UUID survives being a middle path segment of a manifest URL."""
        components = GHCIDComponents("FR", "IDF", "PAR", "M", "LOU")  # Louvre
        result_uuid = components.to_uuid()

        iiif_id = f"https://iiif.example.org/manifests/{result_uuid}/manifest.json"
        assert str(result_uuid) in iiif_id

        # The identifier is the segment immediately before "manifest.json".
        manifest_segment = iiif_id.split("/")[-2]
        assert uuid.UUID(manifest_segment) == result_uuid

    def test_wikidata_external_id(self):
        """The UUID survives storage as a string value in a statement dict."""
        components = GHCIDComponents("GB", "ENG", "LON", "M", "BM")  # British Museum
        result_uuid = components.to_uuid()

        wikidata_statement = {
            "property": "P12345",  # Hypothetical GHCID property
            "value": str(result_uuid),
            "type": "external-id",
        }

        assert wikidata_statement["value"] == str(result_uuid)
        # Parsing the stored string must give back the original UUID.
        assert uuid.UUID(wikidata_statement["value"]) == result_uuid
class TestBackwardCompatibility:
    """Check that the numeric and string forms still behave as before."""

    def test_numeric_still_available(self):
        """``to_numeric()`` returns an int within the unsigned 64-bit range."""
        ghcid = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        legacy_id = ghcid.to_numeric()

        assert isinstance(legacy_id, int)
        uint64_limit = 1 << 64
        assert 0 <= legacy_id < uint64_limit

    def test_human_readable_still_available(self):
        """``to_string()`` returns the hyphen-joined component string."""
        ghcid = GHCIDComponents("NL", "NH", "AMS", "M", "RM")

        assert ghcid.to_string() == "NL-NH-AMS-M-RM"

    def test_all_formats_consistent(self):
        """UUID and numeric forms are both derived from the GHCID string."""
        import hashlib

        ghcid = GHCIDComponents("JP", "13", "TOK", "M", "TNM")
        canonical = ghcid.to_string()

        # UUID: name-based v5 over the GHCID string in the GHCID namespace.
        assert ghcid.to_uuid() == uuid.uuid5(GHCID_NAMESPACE, canonical)

        # Numeric: first 8 bytes of the SHA-256 digest of the UTF-8 encoded
        # string, read as an unsigned big-endian integer.
        digest = hashlib.sha256(canonical.encode('utf-8')).digest()
        leading_eight = digest[:8]
        assert ghcid.to_numeric() == int.from_bytes(
            leading_eight, byteorder='big', signed=False
        )
if __name__ == "__main__":
    # pytest.main() returns the exit code of the test run instead of exiting.
    # Forward it so that running this file directly exits non-zero when a
    # test fails; otherwise the interpreter would always exit with 0.
    raise SystemExit(pytest.main([__file__, "-v"]))