# Changelog note (from repository listing):
# - Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor,
#   covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring
#   proper classification of institutions (museum, library, archive). Added tests for
#   extracted entities and result handling to validate the extraction process.
# - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of
#   extracting partnerships from a conversation and exporting them to RDF format.
#   Implemented tests for temporal properties in partnerships and ensured compliance
#   with W3C Organization Ontology patterns. Verified that extracted partnerships are
#   correctly linked with PROV-O provenance metadata.
#!/usr/bin/env python3
"""
Demonstration of enrichment_history schema enhancement (v0.2.2)

Shows the improvement from unstructured provenance.notes to structured
EnrichmentHistoryEntry objects for tracking data quality activities.
"""

from datetime import datetime, timezone
from typing import Dict, Any
def old_approach() -> Dict[str, Any]:
    """OLD (v0.2.1): Unstructured notes string - hard to parse and query.

    Returns:
        A dict with a single ``provenance`` mapping whose enrichment record
        is a free-text ``notes`` string (the pre-v0.2.2 format).
    """
    return {
        "provenance": {
            "data_source": "CONVERSATION_NLP",
            "data_tier": "TIER_4_INFERRED",
            "extraction_date": "2025-11-09T10:00:00Z",
            "extraction_method": "AI agent NER extraction",
            "confidence_score": 0.85,
            # ❌ Problem: Unstructured text, no machine-readable metadata
            "notes": (
                "Wikidata enriched 2025-11-10 (Q3330723, match: 100%). "
                "Geocoded to (36.806495, 10.181532) via Nominatim. "
                "False Wikidata match Q16665606 removed 2025-11-10."
            )
        }
    }
def new_approach() -> Dict[str, Any]:
    """NEW (v0.2.2): Structured enrichment_history with queryable metadata.

    Returns:
        A dict with a single ``provenance`` mapping whose enrichment record
        is an ``enrichment_history`` list of structured entries, one per
        enrichment activity (Wikidata match, geocoding, false-positive removal).
    """
    return {
        "provenance": {
            "data_source": "CONVERSATION_NLP",
            "data_tier": "TIER_4_INFERRED",
            "extraction_date": "2025-11-09T10:00:00Z",
            "extraction_method": "AI agent NER extraction",
            "confidence_score": 0.85,
            # ✅ Solution: Structured, queryable enrichment log
            "enrichment_history": [
                {
                    "enrichment_date": "2025-11-10T14:30:00Z",
                    "enrichment_method": "Wikidata SPARQL fuzzy matching with city verification",
                    "enrichment_type": "WIKIDATA_IDENTIFIER",
                    "match_score": 1.0,
                    "verified": True,
                    "enrichment_source": "https://www.wikidata.org",
                    "enrichment_notes": "Perfect name match, city verified: Tunis"
                },
                {
                    "enrichment_date": "2025-11-10T14:35:00Z",
                    "enrichment_method": "Nominatim geocoding API",
                    "enrichment_type": "GEOCODING",
                    "match_score": 0.95,
                    "verified": False,
                    "enrichment_source": "https://nominatim.openstreetmap.org",
                    "enrichment_notes": "Geocoded from city name and country"
                },
                {
                    "enrichment_date": "2025-11-10T15:00:00Z",
                    "enrichment_method": "Manual verification after city mismatch detection",
                    "enrichment_type": "FALSE_POSITIVE_REMOVAL",
                    "match_score": None,  # Manual operation
                    "verified": True,
                    "enrichment_source": None,
                    "enrichment_notes": "Removed Q16665606 (city mismatch: Paris vs Tunis)"
                }
            ]
        }
    }
def query_examples() -> None:
    """Demonstrate advantages of structured enrichment_history.

    Prints five example queries over the structured log produced by
    new_approach(): filter by type, verification status, confidence,
    chronological timeline, and counts per type.
    """
    data = new_approach()
    history = data["provenance"]["enrichment_history"]

    print("=" * 70)
    print("STRUCTURED ENRICHMENT HISTORY: QUERY EXAMPLES")
    print("=" * 70)
    print()

    # Query 1: Find all Wikidata enrichments
    wikidata_enrichments = [
        e for e in history
        if e["enrichment_type"] == "WIKIDATA_IDENTIFIER"
    ]
    print("1. Wikidata enrichments:")
    for e in wikidata_enrichments:
        print(f" - Date: {e['enrichment_date']}")
        print(f" Match score: {e['match_score']}")
        print(f" Verified: {e['verified']}")
    print()

    # Query 2: Find unverified enrichments (need manual review)
    unverified = [e for e in history if not e["verified"]]
    print(f"2. Unverified enrichments needing review: {len(unverified)}")
    for e in unverified:
        print(f" - {e['enrichment_type']}: {e['enrichment_notes']}")
    print()

    # Query 3: Find low-confidence enrichments (< 0.9)
    low_confidence = [
        e for e in history
        if e["match_score"] is not None and e["match_score"] < 0.9
    ]
    print(f"3. Low-confidence enrichments (< 0.9): {len(low_confidence)}")
    for e in low_confidence:
        print(f" - {e['enrichment_type']}: score={e['match_score']:.2f}")
    print()

    # Query 4: Timeline of enrichment activities
    print("4. Enrichment timeline:")
    for e in sorted(history, key=lambda x: x["enrichment_date"]):
        print(f" {e['enrichment_date']} - {e['enrichment_type']}")
    print()

    # Query 5: Count enrichments by type
    from collections import Counter
    type_counts = Counter(e["enrichment_type"] for e in history)
    print("5. Enrichments by type:")
    for enrichment_type, count in type_counts.items():
        print(f" - {enrichment_type}: {count}")
    print()
def migration_note() -> None:
    """Show what needs to be migrated from old notes to new structure.

    Prints the migration plan for converting legacy ``provenance.notes``
    strings into EnrichmentHistoryEntry objects, including the free-text
    patterns a migration script would have to parse.
    """
    print("=" * 70)
    print("MIGRATION REQUIRED")
    print("=" * 70)
    print()
    print("Existing instances with provenance.notes need migration:")
    print(" - Parse notes strings for enrichment patterns")
    print(" - Extract: date, method, type, match score")
    print(" - Convert to EnrichmentHistoryEntry objects")
    print()
    print("Example patterns to parse:")
    print(' - "Wikidata enriched YYYY-MM-DD (Qnumber, match: XX%)"')
    print(' - "Geocoded to (lat, lon) via Service"')
    print(' - "False Wikidata match Qnumber removed YYYY-MM-DD"')
    print()
    print("Migration script: scripts/migrate_enrichment_notes_to_history.py")
    print(" (TO BE CREATED)")
    print()
def ontology_mappings() -> None:
    """Show RDF/ontology mappings for semantic interoperability.

    Prints the field-by-field mapping from EnrichmentHistoryEntry to
    standard vocabularies (PROV-O, ADMS, Dublin Core Terms, FOAF) used
    for RDF export.
    """
    print("=" * 70)
    print("ONTOLOGY MAPPINGS (for RDF export)")
    print("=" * 70)
    print()
    print("EnrichmentHistoryEntry fields map to standard ontologies:")
    print()
    print(" enrichment_date → prov:atTime (PROV-O timestamp)")
    print(" enrichment_method → prov:hadPlan (PROV-O activity plan)")
    print(" enrichment_type → rdf:type (RDF type classification)")
    print(" match_score → adms:confidence (ADMS confidence score)")
    print(" verified → adms:status (ADMS verification status)")
    print(" enrichment_source → dcterms:source (Dublin Core source)")
    print(" enrichment_notes → dcterms:description (DC description)")
    print(" verified_by → foaf:Agent (FOAF agent/person)")
    print()
    print("Enables semantic web integration with:")
    print(" - W3C PROV-O (provenance tracking)")
    print(" - ADMS (data quality metadata)")
    print(" - Dublin Core Terms (descriptions)")
    print(" - FOAF (agent identification)")
    print()
if __name__ == "__main__":
|
|
print("\n" + "=" * 70)
|
|
print("SCHEMA v0.2.2 ENHANCEMENT: Structured Enrichment History")
|
|
print("=" * 70)
|
|
print()
|
|
|
|
print("BEFORE (v0.2.1): Unstructured notes")
|
|
print("-" * 70)
|
|
import json
|
|
old = old_approach()
|
|
print(json.dumps(old["provenance"], indent=2))
|
|
print()
|
|
print("❌ Problems:")
|
|
print(" - Hard to parse programmatically")
|
|
print(" - No queryability (can't filter by type, date, confidence)")
|
|
print(" - No ontology alignment (not semantic web compatible)")
|
|
print(" - Mixed concerns (multiple activities in one string)")
|
|
print()
|
|
|
|
print("=" * 70)
|
|
print("AFTER (v0.2.2): Structured enrichment_history")
|
|
print("-" * 70)
|
|
new = new_approach()
|
|
print(json.dumps(new["provenance"], indent=2))
|
|
print()
|
|
print("✅ Benefits:")
|
|
print(" - Machine-readable structured data")
|
|
print(" - Queryable (filter by type, confidence, verification status)")
|
|
print(" - Ontology-aligned (PROV-O, ADMS, DCTerms, FOAF)")
|
|
print(" - Separation of concerns (one entry per activity)")
|
|
print(" - Chronological audit log")
|
|
print()
|
|
|
|
query_examples()
|
|
migration_note()
|
|
ontology_mappings()
|
|
|
|
print("=" * 70)
|
|
print("NEXT STEPS")
|
|
print("=" * 70)
|
|
print()
|
|
print("1. ✅ Schema enhancement complete (v0.2.2)")
|
|
print("2. ⏳ Create migration script for existing instances")
|
|
print("3. ⏳ Test with Phase 3 Chile enrichment workflow")
|
|
print("4. ⏳ Update data quality reports to query enrichment_history")
|
|
print()
|