# glam/scripts/demo_enrichment_history.py
#
# NOTE(review): the lines below are repository web-UI metadata (commit header
# and file statistics) captured by copy-paste; kept as comments so the module
# remains valid Python.
#
# kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
# - Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
# - Added tests for extracted entities and result handling to validate the extraction process.
# - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
# - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
# - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
# 2025-11-19 23:20:47 +01:00
# (220 lines, 8.3 KiB, Python, executable file)
#!/usr/bin/env python3
"""
Demonstration of enrichment_history schema enhancement (v0.2.2)
Shows the improvement from unstructured provenance.notes to structured
EnrichmentHistoryEntry objects for tracking data quality activities.
"""
from datetime import datetime, timezone
from typing import Dict, Any
def old_approach() -> Dict[str, Any]:
    """OLD (v0.2.1): Unstructured notes string - hard to parse and query"""
    # Everything about the enrichment activities is crammed into one
    # free-text string — nothing here is machine-queryable.
    notes_text = (
        "Wikidata enriched 2025-11-10 (Q3330723, match: 100%). "
        "Geocoded to (36.806495, 10.181532) via Nominatim. "
        "False Wikidata match Q16665606 removed 2025-11-10."
    )
    provenance = {
        "data_source": "CONVERSATION_NLP",
        "data_tier": "TIER_4_INFERRED",
        "extraction_date": "2025-11-09T10:00:00Z",
        "extraction_method": "AI agent NER extraction",
        "confidence_score": 0.85,
        # ❌ Problem: Unstructured text, no machine-readable metadata
        "notes": notes_text,
    }
    return {"provenance": provenance}
def new_approach() -> Dict[str, Any]:
    """NEW (v0.2.2): Structured enrichment_history with queryable metadata"""

    def _entry(date, method, etype, score, verified, source, notes):
        # Local constructor keeps each EnrichmentHistoryEntry uniform.
        return {
            "enrichment_date": date,
            "enrichment_method": method,
            "enrichment_type": etype,
            "match_score": score,
            "verified": verified,
            "enrichment_source": source,
            "enrichment_notes": notes,
        }

    # One entry per enrichment activity, in chronological order.
    history = [
        _entry(
            "2025-11-10T14:30:00Z",
            "Wikidata SPARQL fuzzy matching with city verification",
            "WIKIDATA_IDENTIFIER",
            1.0,
            True,
            "https://www.wikidata.org",
            "Perfect name match, city verified: Tunis",
        ),
        _entry(
            "2025-11-10T14:35:00Z",
            "Nominatim geocoding API",
            "GEOCODING",
            0.95,
            False,
            "https://nominatim.openstreetmap.org",
            "Geocoded from city name and country",
        ),
        _entry(
            "2025-11-10T15:00:00Z",
            "Manual verification after city mismatch detection",
            "FALSE_POSITIVE_REMOVAL",
            None,  # Manual operation — no automated match score
            True,
            None,
            "Removed Q16665606 (city mismatch: Paris vs Tunis)",
        ),
    ]
    return {
        "provenance": {
            "data_source": "CONVERSATION_NLP",
            "data_tier": "TIER_4_INFERRED",
            "extraction_date": "2025-11-09T10:00:00Z",
            "extraction_method": "AI agent NER extraction",
            "confidence_score": 0.85,
            # ✅ Solution: Structured, queryable enrichment log
            "enrichment_history": history,
        }
    }
def query_examples():
    """Demonstrate advantages of structured enrichment_history"""
    history = new_approach()["provenance"]["enrichment_history"]
    banner = "=" * 70

    print(banner)
    print("STRUCTURED ENRICHMENT HISTORY: QUERY EXAMPLES")
    print(banner)
    print()

    # Query 1: Find all Wikidata enrichments
    print("1. Wikidata enrichments:")
    for entry in history:
        if entry["enrichment_type"] != "WIKIDATA_IDENTIFIER":
            continue
        print(f" - Date: {entry['enrichment_date']}")
        print(f" Match score: {entry['match_score']}")
        print(f" Verified: {entry['verified']}")
    print()

    # Query 2: Find unverified enrichments (need manual review)
    pending = [entry for entry in history if not entry["verified"]]
    print(f"2. Unverified enrichments needing review: {len(pending)}")
    for entry in pending:
        print(f" - {entry['enrichment_type']}: {entry['enrichment_notes']}")
    print()

    # Query 3: Find low-confidence enrichments (< 0.9)
    weak = [
        entry
        for entry in history
        if entry["match_score"] is not None and entry["match_score"] < 0.9
    ]
    print(f"3. Low-confidence enrichments (< 0.9): {len(weak)}")
    for entry in weak:
        print(f" - {entry['enrichment_type']}: score={entry['match_score']:.2f}")
    print()

    # Query 4: Timeline of enrichment activities
    print("4. Enrichment timeline:")
    for entry in sorted(history, key=lambda item: item["enrichment_date"]):
        print(f" {entry['enrichment_date']} - {entry['enrichment_type']}")
    print()

    # Query 5: Count enrichments by type
    from collections import Counter

    counts = Counter(entry["enrichment_type"] for entry in history)
    print("5. Enrichments by type:")
    for kind, total in counts.items():
        print(f" - {kind}: {total}")
    print()
def migration_note():
    """Show what needs to be migrated from old notes to new structure"""
    # Static report: emit each line of the migration checklist in order.
    report = (
        "=" * 70,
        "MIGRATION REQUIRED",
        "=" * 70,
        "",
        "Existing instances with provenance.notes need migration:",
        " - Parse notes strings for enrichment patterns",
        " - Extract: date, method, type, match score",
        " - Convert to EnrichmentHistoryEntry objects",
        "",
        "Example patterns to parse:",
        ' - "Wikidata enriched YYYY-MM-DD (Qnumber, match: XX%)"',
        ' - "Geocoded to (lat, lon) via Service"',
        ' - "False Wikidata match Qnumber removed YYYY-MM-DD"',
        "",
        "Migration script: scripts/migrate_enrichment_notes_to_history.py",
        " (TO BE CREATED)",
        "",
    )
    for line in report:
        print(line)
def ontology_mappings():
    """Show RDF/ontology mappings for semantic interoperability"""
    rule = "=" * 70
    print(rule)
    print("ONTOLOGY MAPPINGS (for RDF export)")
    print(rule)
    print()
    print("EnrichmentHistoryEntry fields map to standard ontologies:")
    print()
    # Field → ontology-term table, printed one row at a time.
    field_rows = (
        " enrichment_date → prov:atTime (PROV-O timestamp)",
        " enrichment_method → prov:hadPlan (PROV-O activity plan)",
        " enrichment_type → rdf:type (RDF type classification)",
        " match_score → adms:confidence (ADMS confidence score)",
        " verified → adms:status (ADMS verification status)",
        " enrichment_source → dcterms:source (Dublin Core source)",
        " enrichment_notes → dcterms:description (DC description)",
        " verified_by → foaf:Agent (FOAF agent/person)",
    )
    for row in field_rows:
        print(row)
    print()
    print("Enables semantic web integration with:")
    print(" - W3C PROV-O (provenance tracking)")
    print(" - ADMS (data quality metadata)")
    print(" - Dublin Core Terms (descriptions)")
    print(" - FOAF (agent identification)")
    print()
if __name__ == "__main__":
print("\n" + "=" * 70)
print("SCHEMA v0.2.2 ENHANCEMENT: Structured Enrichment History")
print("=" * 70)
print()
print("BEFORE (v0.2.1): Unstructured notes")
print("-" * 70)
import json
old = old_approach()
print(json.dumps(old["provenance"], indent=2))
print()
print("❌ Problems:")
print(" - Hard to parse programmatically")
print(" - No queryability (can't filter by type, date, confidence)")
print(" - No ontology alignment (not semantic web compatible)")
print(" - Mixed concerns (multiple activities in one string)")
print()
print("=" * 70)
print("AFTER (v0.2.2): Structured enrichment_history")
print("-" * 70)
new = new_approach()
print(json.dumps(new["provenance"], indent=2))
print()
print("✅ Benefits:")
print(" - Machine-readable structured data")
print(" - Queryable (filter by type, confidence, verification status)")
print(" - Ontology-aligned (PROV-O, ADMS, DCTerms, FOAF)")
print(" - Separation of concerns (one entry per activity)")
print(" - Chronological audit log")
print()
query_examples()
migration_note()
ontology_mappings()
print("=" * 70)
print("NEXT STEPS")
print("=" * 70)
print()
print("1. ✅ Schema enhancement complete (v0.2.2)")
print("2. ⏳ Create migration script for existing instances")
print("3. ⏳ Test with Phase 3 Chile enrichment workflow")
print("4. ⏳ Update data quality reports to query enrichment_history")
print()