- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
152 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Minimal test script to debug collision resolution.
|
|
|
|
Test if Q-numbers are being assigned to institutions in collision groups.
|
|
"""
|
|
|
|
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from glam_extractor.models import (
|
|
HeritageCustodian, Location, Identifier, Provenance,
|
|
InstitutionType, DataSource, DataTier
|
|
)
|
|
from glam_extractor.identifiers.collision_detector import GHCIDCollisionDetector
|
|
|
|
|
|
def create_test_institution(
    name: str,
    city: str,
    ghcid: str,
    ghcid_numeric: int,
    wikidata_qid: Optional[str] = None
) -> HeritageCustodian:
    """Create a test institution with minimal required fields.

    Args:
        name: Display name; also slugified into the institution's id URI.
        city: City for the single Location (country is fixed to "NL").
        ghcid: Base GHCID string, e.g. "NL-NH-AMS-M-SM".
        ghcid_numeric: Numeric form of the GHCID.
        wikidata_qid: Optional Wikidata Q-number; when provided, a Wikidata
            Identifier (with its canonical URL) is attached.

    Returns:
        A HeritageCustodian with museum-type defaults and Tier-1
        ISIL-registry provenance stamped with the current UTC time.
    """
    identifiers = []
    if wikidata_qid:
        identifiers.append(
            Identifier(
                identifier_scheme="Wikidata",
                identifier_value=wikidata_qid,
                identifier_url=f"https://www.wikidata.org/wiki/{wikidata_qid}"
            )
        )

    return HeritageCustodian(
        # Slugify the name so each test institution gets a stable, readable URI.
        id=f"https://w3id.org/heritage/custodian/test/{name.lower().replace(' ', '-')}",
        name=name,
        institution_type=InstitutionType.MUSEUM,
        locations=[
            Location(city=city, country="NL")
        ],
        # Model convention: None (not an empty list) when there are no identifiers.
        identifiers=identifiers if identifiers else None,
        ghcid=ghcid,
        ghcid_numeric=ghcid_numeric,
        provenance=Provenance(
            data_source=DataSource.ISIL_REGISTRY,
            data_tier=DataTier.TIER_1_AUTHORITATIVE,
            extraction_date=datetime.now(timezone.utc),
            extraction_method="Manual test creation"
        )
    )
|
|
|
|
|
|
def main() -> int:
    """Exercise GHCID collision detection and resolution end-to-end.

    Builds two museums that share the same base GHCID, runs the collision
    detector over them, resolves the collision, and verifies that each
    resolved institution received a Q-number suffix and a GHCID history.

    Returns:
        0 when every check passes, 1 otherwise (suitable for sys.exit),
        so shell/CI callers can detect failure from the exit status.
    """
    print("=" * 80)
    print("Testing GHCID Collision Resolution")
    print("=" * 80)

    # One shared timestamp so both institutions' provenance matches exactly.
    extraction_timestamp = datetime.now(timezone.utc)

    # Single source of truth for the deliberately-colliding base GHCID; also
    # used in the "GHCID before" report so the printout cannot drift.
    base_ghcid = "NL-NH-AMS-M-SM"

    # Create two institutions with the same base GHCID
    print("\n1. Creating test institutions...")
    inst1 = create_test_institution(
        name="Stedelijk Museum Amsterdam",
        city="Amsterdam",
        ghcid=base_ghcid,
        ghcid_numeric=123456789012,
        wikidata_qid="Q621531"
    )
    # Override extraction_date to match
    inst1.provenance.extraction_date = extraction_timestamp

    inst2 = create_test_institution(
        name="Science Museum Amsterdam",
        city="Amsterdam",
        ghcid=base_ghcid,  # Same base GHCID!
        ghcid_numeric=987654321098,
        wikidata_qid="Q98765432"
    )
    # Override extraction_date to match
    inst2.provenance.extraction_date = extraction_timestamp

    for label, inst in (("Institution 1", inst1), ("Institution 2", inst2)):
        print(f"   {label}: {inst.name}")
        print(f"     - GHCID: {inst.ghcid}")
        print(f"     - GHCID numeric: {inst.ghcid_numeric}")
        print(f"     - Wikidata: {inst.identifiers[0].identifier_value if inst.identifiers else 'None'}")

    # Detect collisions
    print("\n2. Detecting collisions...")
    detector = GHCIDCollisionDetector()  # Empty published_dataset
    collisions = detector.detect_collisions([inst1, inst2])

    print(f"   Collisions detected: {len(collisions)}")
    for detected_base, collision_group in collisions.items():
        print(f"   - {detected_base}: {len(collision_group.institutions)} institutions")
        print(f"     Collision type: {collision_group.collision_type}")

    # Resolve collisions
    print("\n3. Resolving collisions...")
    resolved = detector.resolve_collisions([inst1, inst2])

    print(f"   Resolved institutions: {len(resolved)}")
    for inst in resolved:
        print(f"   - {inst.name}")
        print(f"     GHCID before: {base_ghcid}")
        print(f"     GHCID after: {inst.ghcid}")
        print(f"     Q-number added: {'-Q' in inst.ghcid}")
        if inst.ghcid_history:
            print(f"     GHCID history entries: {len(inst.ghcid_history)}")
            for entry in inst.ghcid_history:
                print(f"       - {entry.ghcid} (valid {entry.valid_from} to {entry.valid_to})")
        else:
            print("     GHCID history: None")

    # Check results: every resolved institution must carry a Q-number suffix
    # and at least two GHCID history entries (original + resolved).
    print("\n4. Verification...")
    success = True

    for inst in resolved:
        if "-Q" not in inst.ghcid:
            print(f"   ❌ FAILED: {inst.name} missing Q-number in GHCID")
            success = False
        else:
            print(f"   ✅ PASSED: {inst.name} has Q-number: {inst.ghcid}")

        if not inst.ghcid_history or len(inst.ghcid_history) < 2:
            print(f"   ❌ FAILED: {inst.name} missing GHCID history")
            success = False
        else:
            print(f"   ✅ PASSED: {inst.name} has GHCID history with {len(inst.ghcid_history)} entries")

    print("\n" + "=" * 80)
    print("✅ ALL TESTS PASSED" if success else "❌ SOME TESTS FAILED")
    print("=" * 80)

    return 0 if success else 1


if __name__ == "__main__":
    # Propagate the result as the process exit status so failures are
    # detectable by CI and shell callers (previously always exited 0).
    sys.exit(main())
|