""" Test UUID v5 generation for GHCID persistent identifiers. This test suite validates that GHCID supports three interoperable identifier formats: 1. UUID v5 (128-bit) - Primary for Europeana, DPLA, IIIF, Wikidata integration 2. Numeric (64-bit) - For database primary keys and CSV exports 3. Human-readable - For citations and references """ import uuid import pytest from glam_extractor.identifiers.ghcid import ( GHCIDComponents, GHCIDGenerator, InstitutionType, GHCID_NAMESPACE, ) class TestGHCIDUUIDGeneration: """Test UUID v5 generation from GHCID strings.""" def test_uuid_deterministic(self): """UUID v5 should be deterministic - same input produces same UUID.""" components1 = GHCIDComponents( country_code="US", region_code="CA", city_locode="SAN", institution_type="A", abbreviation="IA" ) components2 = GHCIDComponents( country_code="US", region_code="CA", city_locode="SAN", institution_type="A", abbreviation="IA" ) uuid1 = components1.to_uuid() uuid2 = components2.to_uuid() assert uuid1 == uuid2 assert isinstance(uuid1, uuid.UUID) assert uuid1.version == 5 # RFC 4122 UUID v5 def test_uuid_unique_per_ghcid(self): """Different GHCID strings should produce different UUIDs.""" rijksmuseum = GHCIDComponents("NL", "NH", "AMS", "M", "RM") stedelijk = GHCIDComponents("NL", "NH", "AMS", "M", "SM") uuid_rm = rijksmuseum.to_uuid() uuid_sm = stedelijk.to_uuid() assert uuid_rm != uuid_sm assert str(uuid_rm) != str(uuid_sm) def test_uuid_format_rfc4122(self): """UUID should be valid RFC 4122 format.""" components = GHCIDComponents("BR", "RJ", "RIO", "L", "BNB") result_uuid = components.to_uuid() # UUID v5 format: xxxxxxxx-xxxx-5xxx-yxxx-xxxxxxxxxxxx uuid_str = str(result_uuid) parts = uuid_str.split('-') assert len(parts) == 5 assert len(parts[0]) == 8 # 8 hex chars assert len(parts[1]) == 4 # 4 hex chars assert len(parts[2]) == 4 # 4 hex chars (version) assert len(parts[3]) == 4 # 4 hex chars (variant) assert len(parts[4]) == 12 # 12 hex chars assert parts[2][0] == '5' # Version 5 def test_uuid_collision_resolver_changes_uuid(self): """Adding Wikidata QID (collision resolver) should produce different UUID.""" base = GHCIDComponents("NL", "NH", "AMS", "M", "SM") with_qid = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335") uuid_base = base.to_uuid() uuid_qid = with_qid.to_uuid() assert uuid_base != uuid_qid assert base.to_string() == "NL-NH-AMS-M-SM" assert with_qid.to_string() == "NL-NH-AMS-M-SM-Q924335" def test_uuid_namespace_constant(self): """GHCID namespace should be consistent across all UUIDs.""" components = GHCIDComponents("JP", "13", "TOK", "M", "TNM") # Manually generate UUID v5 with same namespace ghcid_str = components.to_string() expected_uuid = uuid.uuid5(GHCID_NAMESPACE, ghcid_str) assert components.to_uuid() == expected_uuid def test_uuid_string_representation(self): """UUID should have standard string representation.""" components = GHCIDComponents("MX", "CMX", "MEX", "M", "MNA") result_uuid = components.to_uuid() # Should be lowercase hex with hyphens uuid_str = str(result_uuid) assert uuid_str.islower() or all(c in '0123456789-abcdef' for c in uuid_str) assert uuid_str.count('-') == 4 class TestThreeIdentifierFormats: """Test that GHCID supports three persistent identifier formats.""" def test_all_three_formats_generated(self): """GHCID should provide UUID, numeric, and human-readable formats.""" components = GHCIDComponents("CL", "RM", "SAN", "M", "MNBA") # All three formats should be available uuid_id = components.to_uuid() numeric_id = components.to_numeric() human_id = components.to_string() assert isinstance(uuid_id, uuid.UUID) assert isinstance(numeric_id, int) assert isinstance(human_id, str) # UUID format assert len(str(uuid_id)) == 36 # xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx # Numeric format (64-bit integer) assert 0 <= numeric_id < 2**64 # Human-readable format assert human_id == "CL-RM-SAN-M-MNBA" def test_all_formats_deterministic(self): """All three formats should be deterministic.""" comp1 = GHCIDComponents("AR", "C", "BUE", "M", "MALBA") comp2 = GHCIDComponents("AR", "C", "BUE", "M", "MALBA") assert comp1.to_uuid() == comp2.to_uuid() assert comp1.to_numeric() == comp2.to_numeric() assert comp1.to_string() == comp2.to_string() def test_uuid_is_primary_for_interoperability(self): """UUID v5 is recommended as primary identifier for interoperability.""" # This test documents the design decision components = GHCIDComponents("NL", "NH", "AMS", "M", "RM") # UUID v5 provides: # - RFC 4122 standard compliance # - 128-bit collision resistance # - Compatibility with Europeana, DPLA, IIIF, Wikidata uuid_id = components.to_uuid() assert uuid_id.version == 5 assert isinstance(uuid_id, uuid.UUID) # Can be converted to various formats assert str(uuid_id) # String format assert uuid_id.hex # Hex format (no hyphens) assert uuid_id.int # Integer format (128-bit) assert uuid_id.bytes # Binary format (16 bytes) class TestUUIDCollisionResistance: """Test collision resistance of UUID v5 identifiers.""" def test_no_collisions_in_similar_institutions(self): """Institutions with similar names should have unique UUIDs.""" institutions = [ GHCIDComponents("US", "NY", "NEW", "M", "MMA"), # Met Museum GHCIDComponents("US", "NY", "NEW", "M", "MOMA"), # MoMA GHCIDComponents("US", "NY", "NEW", "M", "AMN"), # American Museum Natural History GHCIDComponents("US", "NY", "NEW", "M", "GM"), # Guggenheim GHCIDComponents("US", "NY", "NEW", "M", "WCM"), # Whitney ] uuids = [comp.to_uuid() for comp in institutions] # All UUIDs should be unique assert len(uuids) == len(set(uuids)) def test_collision_probability_negligible(self): """For 1M institutions, collision probability should be negligible.""" # UUID v5 uses 128-bit space # For n=1,000,000 institutions: # P(collision) ≈ n²/(2·2^128) ≈ 1.5×10^-29 # Generate UUIDs for 1000 sample institutions (representative) uuids = set() for i in range(1000): comp = GHCIDComponents( "XX", "XX", "XXX", "M", f"INST{i:04d}" ) uuids.add(comp.to_uuid()) # All should be unique assert len(uuids) == 1000 class TestInteroperabilityScenarios: """Test real-world interoperability scenarios.""" def test_europeana_compatible(self): """UUID format compatible with Europeana data model.""" # Europeana uses UUIDs for digital objects components = GHCIDComponents("IT", "RM", "ROM", "M", "VG") # Vatican Museums result_uuid = components.to_uuid() # Europeana expects standard UUID format assert isinstance(result_uuid, uuid.UUID) assert result_uuid.version == 5 # Can be serialized to RDF/JSON-LD europeana_id = f"urn:uuid:{result_uuid}" assert europeana_id.startswith("urn:uuid:") def test_dpla_compatible(self): """UUID format compatible with DPLA aggregation.""" components = GHCIDComponents("US", "DC", "WAS", "L", "LOC") # Library of Congress result_uuid = components.to_uuid() # DPLA uses URIs with UUIDs dpla_uri = f"https://dp.la/api/items/{result_uuid}" assert "dp.la" in dpla_uri def test_iiif_manifest_identifier(self): """UUID can be used in IIIF manifest identifiers.""" components = GHCIDComponents("FR", "IDF", "PAR", "M", "LOU") # Louvre result_uuid = components.to_uuid() # IIIF manifest ID format iiif_id = f"https://iiif.example.org/manifests/{result_uuid}/manifest.json" assert str(result_uuid) in iiif_id def test_wikidata_external_id(self): """UUID can be stored as Wikidata external identifier.""" components = GHCIDComponents("GB", "ENG", "LON", "M", "BM") # British Museum result_uuid = components.to_uuid() # Wikidata can store UUIDs as qualifier values wikidata_statement = { "property": "P12345", # Hypothetical GHCID property "value": str(result_uuid), "type": "external-id" } assert wikidata_statement["value"] == str(result_uuid) class TestBackwardCompatibility: """Test compatibility with existing numeric GHCID implementation.""" def test_numeric_still_available(self): """Numeric format should still be available for legacy systems.""" components = GHCIDComponents("NL", "NH", "AMS", "M", "RM") numeric_id = components.to_numeric() assert isinstance(numeric_id, int) assert 0 <= numeric_id < 2**64 def test_human_readable_still_available(self): """Human-readable format should still be available.""" components = GHCIDComponents("NL", "NH", "AMS", "M", "RM") human_id = components.to_string() assert human_id == "NL-NH-AMS-M-RM" def test_all_formats_consistent(self): """All formats should be generated from same GHCID string.""" components = GHCIDComponents("JP", "13", "TOK", "M", "TNM") ghcid_str = components.to_string() # UUID is generated from GHCID string expected_uuid = uuid.uuid5(GHCID_NAMESPACE, ghcid_str) assert components.to_uuid() == expected_uuid # Numeric is also generated from GHCID string import hashlib hash_bytes = hashlib.sha256(ghcid_str.encode('utf-8')).digest() expected_numeric = int.from_bytes(hash_bytes[:8], byteorder='big', signed=False) assert components.to_numeric() == expected_numeric if __name__ == "__main__": pytest.main([__file__, "-v"])