- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
171 lines
5.9 KiB
Python
171 lines
5.9 KiB
Python
"""
|
|
Test UUID v7 generation for database primary keys.
|
|
|
|
UUID v7 is time-ordered and random (NOT deterministic from GHCID).
|
|
Use for database performance, not for persistent identifiers.
|
|
"""
|
|
|
|
import uuid
|
|
import pytest
|
|
import time
|
|
from glam_extractor.identifiers.ghcid import GHCIDComponents
|
|
|
|
|
|
class TestUUIDv7Generation:
    """Test UUID v7 generation for database primary keys.

    Covers the version/variant bits, uniqueness across calls, ordering by
    creation time, the textual layout, and the embedded millisecond
    timestamp of values returned by ``GHCIDComponents.generate_uuid_v7``.
    """

    def test_uuid_v7_format(self):
        """UUID v7 should have correct version and variant."""
        uuid_v7 = GHCIDComponents.generate_uuid_v7()

        assert isinstance(uuid_v7, uuid.UUID)
        assert uuid_v7.version == 7
        assert uuid_v7.variant == uuid.RFC_4122

    def test_uuid_v7_not_deterministic(self):
        """UUID v7 should be unique on each call (not deterministic)."""
        uuid1 = GHCIDComponents.generate_uuid_v7()
        uuid2 = GHCIDComponents.generate_uuid_v7()

        assert uuid1 != uuid2
        assert str(uuid1) != str(uuid2)

    def test_uuid_v7_time_ordered(self):
        """UUID v7 should be time-ordered (k-sortable)."""
        uuid1 = GHCIDComponents.generate_uuid_v7()
        time.sleep(0.002)  # 2ms delay so the millisecond timestamp advances
        uuid2 = GHCIDComponents.generate_uuid_v7()

        # UUIDs should sort by creation time, both as values and as text.
        assert uuid1 < uuid2
        assert str(uuid1) < str(uuid2)

    def test_uuid_v7_format_rfc_9562(self):
        """UUID v7 should conform to RFC 9562 format."""
        result_uuid = GHCIDComponents.generate_uuid_v7()

        # UUID v7 format: xxxxxxxx-xxxx-7xxx-yxxx-xxxxxxxxxxxx
        parts = str(result_uuid).split("-")

        # Five hyphen-separated groups of 8-4-4-4-12 hex characters.
        assert [len(part) for part in parts] == [8, 4, 4, 4, 12]
        assert parts[2][0] == "7"  # Version 7
        # The "y" nibble carries the variant; the RFC 4122 variant (binary
        # 10xx) renders as one of 8, 9, a, b in the lowercase string form.
        assert parts[3][0] in "89ab"

    def test_uuid_v7_collision_resistance(self):
        """Generate many UUIDs to test collision resistance."""
        count = 1000
        uuids = {GHCIDComponents.generate_uuid_v7() for _ in range(count)}

        # A set drops duplicates, so a full-size set means no collisions.
        assert len(uuids) == count

    def test_uuid_v7_timestamp_extraction(self):
        """UUID v7 should contain valid timestamp."""
        before = int(time.time() * 1000)
        uuid_v7 = GHCIDComponents.generate_uuid_v7()
        after = int(time.time() * 1000)

        # Extract timestamp from UUID (first 48 bits = 6 bytes).
        timestamp_ms = int.from_bytes(uuid_v7.bytes[:6], byteorder="big")

        # Timestamp should be between before and after.
        assert before <= timestamp_ms <= after

    def test_uuid_v7_batch_generation(self):
        """Generate a batch of UUIDs back-to-back, with no delay between calls."""
        # Generate 100 UUIDs as fast as possible.
        uuids = [GHCIDComponents.generate_uuid_v7() for _ in range(100)]

        # All should be unique even when several share a millisecond.
        assert len(set(uuids)) == len(uuids)

        # The leading 48-bit timestamps must never go backwards in generation
        # order. Comparing the ends of an already sorted list would always
        # pass, so the embedded timestamps are checked instead.
        timestamps = [int.from_bytes(u.bytes[:6], byteorder="big") for u in uuids]
        assert timestamps == sorted(timestamps)
|
|
|
|
|
|
class TestUUIDv7UseCases:
    """Exercise UUID v7 in usage scenarios and next to the other GHCID identifiers."""

    def test_database_primary_key_scenario(self):
        """UUID v7 values act as unique, creation-ordered primary keys."""
        rows = []
        for index in range(10):
            rows.append(
                {
                    "id": GHCIDComponents.generate_uuid_v7(),
                    "name": f"Institution {index}",
                    "created_at": time.time(),
                }
            )
            # Pause briefly so consecutive records get different timestamps.
            time.sleep(0.001)

        primary_keys = [row["id"] for row in rows]

        # No two records may share a key.
        assert len(primary_keys) == len(set(primary_keys))

        # Insertion order must already be the sorted order.
        assert primary_keys == sorted(primary_keys)

    def test_uuid_v7_vs_uuid_v5_difference(self):
        """Contrast the repeatable UUID v5 with the per-call UUID v7."""
        ghcid = GHCIDComponents("US", "CA", "SAN", "A", "IA")

        # Deriving the v5 UUID twice from the same components gives one value.
        first_v5 = ghcid.to_uuid()
        second_v5 = ghcid.to_uuid()
        assert first_v5 == second_v5

        # Generating a v7 UUID twice gives two distinct values.
        first_v7 = GHCIDComponents.generate_uuid_v7()
        second_v7 = GHCIDComponents.generate_uuid_v7()
        assert first_v7 != second_v7

    def test_four_identifier_strategy(self):
        """Walk through the four identifiers kept for one institution."""
        ghcid = GHCIDComponents("NL", "NH", "AMS", "M", "RM")

        # 1. UUID v7: the database primary key (random, time-ordered).
        db_key = GHCIDComponents.generate_uuid_v7()
        assert db_key.version == 7

        # 2. UUID v5: the public PID (deterministic, interoperable).
        v5_pid = ghcid.to_uuid()
        assert v5_pid.version == 5

        # 3. UUID v8: the SHA-256 based PID (deterministic).
        v8_pid = ghcid.to_uuid_sha256()
        assert v8_pid.version == 8

        # 4. GHCID string: the human-readable form.
        readable = ghcid.to_string()
        assert readable == "NL-NH-AMS-M-RM"

        # The three UUIDs are pairwise distinct.
        assert db_key != v5_pid
        assert db_key != v8_pid
        assert v5_pid != v8_pid

        # Re-deriving the v5 and v8 values reproduces them exactly.
        assert ghcid.to_uuid() == v5_pid
        assert ghcid.to_uuid_sha256() == v8_pid

        # A freshly generated v7 value differs from the earlier one.
        assert GHCIDComponents.generate_uuid_v7() != db_key
|
|
|
|
|
|
if __name__ == "__main__":
    # Hand pytest's return code to the shell so that running this file
    # directly exits non-zero when a test fails, instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|