# glam/scripts/demo_enrichment_history.py
#
# NOTE(review): the lines below are repository web-UI metadata (commit header
# and file statistics) captured by copy-paste; kept as comments so the module
# remains valid Python.
#
# kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
# - Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
# - Added tests for extracted entities and result handling to validate the extraction process.
# - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
# - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
# - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
# 2025-11-19 23:20:47 +01:00
# (220 lines, 8.3 KiB, Python, executable file)
#!/usr/bin/env python3
"""
Demonstration of enrichment_history schema enhancement (v0.2.2)
Shows the improvement from unstructured provenance.notes to structured
EnrichmentHistoryEntry objects for tracking data quality activities.
"""
from datetime import datetime, timezone
from typing import Dict, Any
def old_approach() -> Dict[str, Any]:
    """OLD (v0.2.1): Unstructured notes string - hard to parse and query"""
    # Everything about the enrichment activities is crammed into one
    # free-text string — nothing here is machine-queryable.
    notes_text = (
        "Wikidata enriched 2025-11-10 (Q3330723, match: 100%). "
        "Geocoded to (36.806495, 10.181532) via Nominatim. "
        "False Wikidata match Q16665606 removed 2025-11-10."
    )
    provenance = {
        "data_source": "CONVERSATION_NLP",
        "data_tier": "TIER_4_INFERRED",
        "extraction_date": "2025-11-09T10:00:00Z",
        "extraction_method": "AI agent NER extraction",
        "confidence_score": 0.85,
        # ❌ Problem: Unstructured text, no machine-readable metadata
        "notes": notes_text,
    }
    return {"provenance": provenance}
def new_approach() -> Dict[str, Any]:
    """NEW (v0.2.2): Structured enrichment_history with queryable metadata"""

    def _entry(date, method, etype, score, verified, source, notes):
        # Local constructor keeps each EnrichmentHistoryEntry uniform.
        return {
            "enrichment_date": date,
            "enrichment_method": method,
            "enrichment_type": etype,
            "match_score": score,
            "verified": verified,
            "enrichment_source": source,
            "enrichment_notes": notes,
        }

    # One entry per enrichment activity, in chronological order.
    history = [
        _entry(
            "2025-11-10T14:30:00Z",
            "Wikidata SPARQL fuzzy matching with city verification",
            "WIKIDATA_IDENTIFIER",
            1.0,
            True,
            "https://www.wikidata.org",
            "Perfect name match, city verified: Tunis",
        ),
        _entry(
            "2025-11-10T14:35:00Z",
            "Nominatim geocoding API",
            "GEOCODING",
            0.95,
            False,
            "https://nominatim.openstreetmap.org",
            "Geocoded from city name and country",
        ),
        _entry(
            "2025-11-10T15:00:00Z",
            "Manual verification after city mismatch detection",
            "FALSE_POSITIVE_REMOVAL",
            None,  # Manual operation — no automated match score
            True,
            None,
            "Removed Q16665606 (city mismatch: Paris vs Tunis)",
        ),
    ]
    return {
        "provenance": {
            "data_source": "CONVERSATION_NLP",
            "data_tier": "TIER_4_INFERRED",
            "extraction_date": "2025-11-09T10:00:00Z",
            "extraction_method": "AI agent NER extraction",
            "confidence_score": 0.85,
            # ✅ Solution: Structured, queryable enrichment log
            "enrichment_history": history,
        }
    }
def query_examples():
    """Demonstrate advantages of structured enrichment_history"""
    history = new_approach()["provenance"]["enrichment_history"]
    banner = "=" * 70

    print(banner)
    print("STRUCTURED ENRICHMENT HISTORY: QUERY EXAMPLES")
    print(banner)
    print()

    # Query 1: Find all Wikidata enrichments
    print("1. Wikidata enrichments:")
    for entry in history:
        if entry["enrichment_type"] != "WIKIDATA_IDENTIFIER":
            continue
        print(f" - Date: {entry['enrichment_date']}")
        print(f" Match score: {entry['match_score']}")
        print(f" Verified: {entry['verified']}")
    print()

    # Query 2: Find unverified enrichments (need manual review)
    pending = [entry for entry in history if not entry["verified"]]
    print(f"2. Unverified enrichments needing review: {len(pending)}")
    for entry in pending:
        print(f" - {entry['enrichment_type']}: {entry['enrichment_notes']}")
    print()

    # Query 3: Find low-confidence enrichments (< 0.9)
    weak = [
        entry
        for entry in history
        if entry["match_score"] is not None and entry["match_score"] < 0.9
    ]
    print(f"3. Low-confidence enrichments (< 0.9): {len(weak)}")
    for entry in weak:
        print(f" - {entry['enrichment_type']}: score={entry['match_score']:.2f}")
    print()

    # Query 4: Timeline of enrichment activities
    print("4. Enrichment timeline:")
    for entry in sorted(history, key=lambda item: item["enrichment_date"]):
        print(f" {entry['enrichment_date']} - {entry['enrichment_type']}")
    print()

    # Query 5: Count enrichments by type
    from collections import Counter

    counts = Counter(entry["enrichment_type"] for entry in history)
    print("5. Enrichments by type:")
    for kind, total in counts.items():
        print(f" - {kind}: {total}")
    print()
def migration_note():
    """Show what needs to be migrated from old notes to new structure"""
    # Static report: emit each line of the migration checklist in order.
    report = (
        "=" * 70,
        "MIGRATION REQUIRED",
        "=" * 70,
        "",
        "Existing instances with provenance.notes need migration:",
        " - Parse notes strings for enrichment patterns",
        " - Extract: date, method, type, match score",
        " - Convert to EnrichmentHistoryEntry objects",
        "",
        "Example patterns to parse:",
        ' - "Wikidata enriched YYYY-MM-DD (Qnumber, match: XX%)"',
        ' - "Geocoded to (lat, lon) via Service"',
        ' - "False Wikidata match Qnumber removed YYYY-MM-DD"',
        "",
        "Migration script: scripts/migrate_enrichment_notes_to_history.py",
        " (TO BE CREATED)",
        "",
    )
    for line in report:
        print(line)
def ontology_mappings():
    """Show RDF/ontology mappings for semantic interoperability"""
    rule = "=" * 70
    print(rule)
    print("ONTOLOGY MAPPINGS (for RDF export)")
    print(rule)
    print()
    print("EnrichmentHistoryEntry fields map to standard ontologies:")
    print()
    # Field → ontology-term table, printed one row at a time.
    field_rows = (
        " enrichment_date → prov:atTime (PROV-O timestamp)",
        " enrichment_method → prov:hadPlan (PROV-O activity plan)",
        " enrichment_type → rdf:type (RDF type classification)",
        " match_score → adms:confidence (ADMS confidence score)",
        " verified → adms:status (ADMS verification status)",
        " enrichment_source → dcterms:source (Dublin Core source)",
        " enrichment_notes → dcterms:description (DC description)",
        " verified_by → foaf:Agent (FOAF agent/person)",
    )
    for row in field_rows:
        print(row)
    print()
    print("Enables semantic web integration with:")
    print(" - W3C PROV-O (provenance tracking)")
    print(" - ADMS (data quality metadata)")
    print(" - Dublin Core Terms (descriptions)")
    print(" - FOAF (agent identification)")
    print()
if __name__ == "__main__":
print("\n" + "=" * 70)
print("SCHEMA v0.2.2 ENHANCEMENT: Structured Enrichment History")
print("=" * 70)
print()
print("BEFORE (v0.2.1): Unstructured notes")
print("-" * 70)
import json
old = old_approach()
print(json.dumps(old["provenance"], indent=2))
print()
print("❌ Problems:")
print(" - Hard to parse programmatically")
print(" - No queryability (can't filter by type, date, confidence)")
print(" - No ontology alignment (not semantic web compatible)")
print(" - Mixed concerns (multiple activities in one string)")
print()
print("=" * 70)
print("AFTER (v0.2.2): Structured enrichment_history")
print("-" * 70)
new = new_approach()
print(json.dumps(new["provenance"], indent=2))
print()
print("✅ Benefits:")
print(" - Machine-readable structured data")
print(" - Queryable (filter by type, confidence, verification status)")
print(" - Ontology-aligned (PROV-O, ADMS, DCTerms, FOAF)")
print(" - Separation of concerns (one entry per activity)")
print(" - Chronological audit log")
print()
query_examples()
migration_note()
ontology_mappings()
print("=" * 70)
print("NEXT STEPS")
print("=" * 70)
print()
print("1. ✅ Schema enhancement complete (v0.2.2)")
print("2. ⏳ Create migration script for existing instances")
print("3. ⏳ Test with Phase 3 Chile enrichment workflow")
print("4. ⏳ Update data quality reports to query enrichment_history")
print()