glam/tests/identifiers/test_ghcid_uuid_v7.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

171 lines
5.9 KiB
Python

"""
Test UUID v7 generation for database primary keys.
UUID v7 is time-ordered and random (NOT deterministic from GHCID).
Use for database performance, not for persistent identifiers.
"""
import uuid
import pytest
import time
from glam_extractor.identifiers.ghcid import GHCIDComponents
class TestUUIDv7Generation:
    """Test UUID v7 generation for database primary keys.

    Every test calls ``GHCIDComponents.generate_uuid_v7()`` and checks one
    property of the returned value: version/variant bits, uniqueness,
    time ordering, textual layout, or the embedded millisecond timestamp.
    """

    def test_uuid_v7_format(self):
        """UUID v7 should have correct version and variant."""
        uuid_v7 = GHCIDComponents.generate_uuid_v7()

        assert isinstance(uuid_v7, uuid.UUID)
        assert uuid_v7.version == 7
        assert uuid_v7.variant == uuid.RFC_4122

    def test_uuid_v7_not_deterministic(self):
        """UUID v7 should be unique on each call (not deterministic)."""
        uuid1 = GHCIDComponents.generate_uuid_v7()
        uuid2 = GHCIDComponents.generate_uuid_v7()

        assert uuid1 != uuid2
        assert str(uuid1) != str(uuid2)

    def test_uuid_v7_time_ordered(self):
        """UUID v7 should be time-ordered (k-sortable)."""
        uuid1 = GHCIDComponents.generate_uuid_v7()
        # Sleep past a millisecond boundary so the two timestamps differ.
        time.sleep(0.002)  # 2ms delay
        uuid2 = GHCIDComponents.generate_uuid_v7()

        # UUIDs should sort by creation time, both as objects and as text.
        assert uuid1 < uuid2
        assert str(uuid1) < str(uuid2)

    def test_uuid_v7_format_rfc_9562(self):
        """UUID v7 should conform to RFC 9562 format."""
        result_uuid = GHCIDComponents.generate_uuid_v7()

        # UUID v7 format: xxxxxxxx-xxxx-7xxx-yxxx-xxxxxxxxxxxx
        parts = str(result_uuid).split('-')

        # Five hyphen-separated groups of 8-4-4-4-12 hex characters.
        assert [len(part) for part in parts] == [8, 4, 4, 4, 12]
        assert parts[2][0] == '7'  # Version 7
        # The "y" nibble carries the variant; its two high bits are 10,
        # which leaves 8, 9, a or b as the only possible hex digits.
        assert parts[3][0] in '89ab'

    def test_uuid_v7_collision_resistance(self):
        """Generate many UUIDs to test collision resistance."""
        count = 1000
        uuids = {GHCIDComponents.generate_uuid_v7() for _ in range(count)}

        # A set drops duplicates, so a full-size set means no collisions.
        assert len(uuids) == count

    def test_uuid_v7_timestamp_extraction(self):
        """UUID v7 should contain valid timestamp."""
        before = int(time.time() * 1000)
        uuid_v7 = GHCIDComponents.generate_uuid_v7()
        after = int(time.time() * 1000)

        # Extract timestamp from UUID (first 48 bits = 6 bytes)
        timestamp_ms = int.from_bytes(uuid_v7.bytes[:6], byteorder='big')

        # Timestamp should be between before and after
        assert before <= timestamp_ms <= after

    def test_uuid_v7_batch_generation(self):
        """Generate a batch of UUIDs back-to-back with no delay between calls."""
        # Generate 100 UUIDs as fast as possible
        uuids = [GHCIDComponents.generate_uuid_v7() for _ in range(100)]

        # All should be unique even if several share a millisecond.
        assert len(set(uuids)) == len(uuids)

        # The millisecond timestamp prefix (first 6 bytes) must never go
        # backwards in generation order. Only the prefix is compared: the
        # bits after it are not assumed to be ordered within a millisecond.
        timestamps = [
            int.from_bytes(generated.bytes[:6], byteorder='big')
            for generated in uuids
        ]
        assert timestamps == sorted(timestamps)
class TestUUIDv7UseCases:
    """Scenario-style tests showing how UUID v7 is used next to the other IDs."""

    def test_database_primary_key_scenario(self):
        """UUID v7 values behave as unique, creation-ordered primary keys."""
        rows = []
        for index in range(10):
            # Mimic inserting one record keyed by a fresh UUID v7.
            primary_key = GHCIDComponents.generate_uuid_v7()
            row = {
                'id': primary_key,
                'name': f'Institution {index}',
                'created_at': time.time(),
            }
            rows.append(row)
            # Pause briefly so consecutive keys get different timestamps.
            time.sleep(0.001)

        keys = [row['id'] for row in rows]

        # No key may repeat.
        assert len(set(keys)) == len(keys)

        # Insertion order must already be sorted order.
        assert sorted(keys) == keys

    def test_uuid_v7_vs_uuid_v5_difference(self):
        """Contrast repeatable ``to_uuid()`` with per-call ``generate_uuid_v7()``."""
        ghcid = GHCIDComponents("US", "CA", "SAN", "A", "IA")

        # to_uuid() on the same components yields the same value each time.
        first_v5 = ghcid.to_uuid()
        second_v5 = ghcid.to_uuid()
        assert first_v5 == second_v5

        # generate_uuid_v7() yields a different value on each call.
        first_v7 = GHCIDComponents.generate_uuid_v7()
        second_v7 = GHCIDComponents.generate_uuid_v7()
        assert first_v7 != second_v7

    def test_four_identifier_strategy(self):
        """Walk through the four identifiers produced for one institution."""
        ghcid = GHCIDComponents("NL", "NH", "AMS", "M", "RM")

        # 1. Database primary key: UUID version 7.
        db_key = GHCIDComponents.generate_uuid_v7()
        assert db_key.version == 7

        # 2. Public PID from to_uuid(): UUID version 5.
        public_pid = ghcid.to_uuid()
        assert public_pid.version == 5

        # 3. SHA-256 based PID from to_uuid_sha256(): UUID version 8.
        sha256_pid = ghcid.to_uuid_sha256()
        assert sha256_pid.version == 8

        # 4. Human-readable GHCID string.
        readable = ghcid.to_string()
        assert readable == "NL-NH-AMS-M-RM"

        # The three UUIDs are pairwise distinct.
        assert db_key != public_pid
        assert db_key != sha256_pid
        assert public_pid != sha256_pid

        # Recomputing the v5 and v8 identifiers reproduces the same values.
        assert ghcid.to_uuid() == public_pid
        assert ghcid.to_uuid_sha256() == sha256_pid

        # A newly generated v7 differs from the earlier one.
        assert GHCIDComponents.generate_uuid_v7() != db_key
if __name__ == "__main__":
    # Allow running this file directly. pytest.main() returns the session's
    # exit code; raising SystemExit with it lets the shell or CI see a
    # failing run instead of the process always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))