glam/scripts/enrich_chilean_batch20_v0.2.2_test.py.bak
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

413 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 20 Wikidata Enrichment
FIRST PRODUCTION USE OF SCHEMA v0.2.2 enrichment_history
This script demonstrates the NEW structured enrichment tracking:
- Uses Provenance.enrichment_history (list of EnrichmentHistoryEntry)
- Replaces unstructured provenance.notes with queryable metadata
- Tracks enrichment_type, match_score, verified status
- Aligns with PROV-O, ADMS, and Dublin Core ontologies
Target: 19 unenriched institutions → 79% coverage goal
Schema: v0.2.2 (enrichment_history structure)
"""
import yaml
import time
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
from SPARQLWrapper import SPARQLWrapper, JSON
# =============================================================================
# CONFIGURATION
# =============================================================================
INPUT_FILE = Path('data/instances/chile/chilean_institutions_batch19_enriched.yaml')
OUTPUT_FILE = Path('data/instances/chile/chilean_institutions_batch20_enriched.yaml')

WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"

# Fuzzy match threshold (0.0-1.0): Wikidata candidates scoring below this
# against the local institution name are skipped.
MIN_MATCH_SCORE = 0.75

# TEST MODE: Set to a number to limit enrichment (e.g., 3 for testing)
# Set to None to process all unenriched institutions
# NOTE: this and QUERY_DELAY were previously defined twice (copy-paste
# duplication); a single definition is kept.
TEST_MODE_LIMIT = 3  # TEST MODE - Only process first 3 institutions

# Rate limiting (seconds between queries)
QUERY_DELAY = 2.0  # Respect Wikidata's rate limits (1 req/sec + buffer)
# =============================================================================
# WIKIDATA SPARQL QUERIES
# =============================================================================
def query_wikidata_by_name(institution_name: str, country: str = "Chile") -> Optional[Dict]:
    """
    Query Wikidata for heritage institutions matching a name.

    Args:
        institution_name: Name to match (case-insensitive substring match
            against the Wikidata label).
        country: Informational only -- the SPARQL query is hard-coded to
            Chile (wd:Q298) regardless of this value. Kept for interface
            compatibility; TODO: wire it into the query if ever needed.

    Returns:
        Dict with 'q_number', 'wikidata_label', 'description', 'viaf',
        'isil', 'founded' for the first match, or None when no match is
        found or the query fails.
    """
    # Escape backslashes and double quotes so the interpolated name cannot
    # break out of the SPARQL string literal (names like 'Bar "El" Museo').
    safe_name = institution_name.lower().replace('\\', '\\\\').replace('"', '\\"')

    # SPARQL query for Chilean heritage institutions
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?founded WHERE {{
      # Heritage institution types
      VALUES ?type {{
        wd:Q33506    # Museum
        wd:Q7075     # Library
        wd:Q166118   # Archive
        wd:Q207694   # Art gallery
        wd:Q2668072  # Cultural institution
      }}
      ?item wdt:P31/wdt:P279* ?type .  # Instance of (with subclasses)
      ?item wdt:P17 wd:Q298 .          # Country: Chile
      # Optional identifiers
      OPTIONAL {{ ?item wdt:P214 ?viaf }}
      OPTIONAL {{ ?item wdt:P791 ?isil }}
      OPTIONAL {{ ?item wdt:P571 ?founded }}
      # Filter by name (case-insensitive)
      FILTER(CONTAINS(LCASE(?itemLabel), "{safe_name}"))
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 5
    """
    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.2 (https://github.com/sct/glam)")
    try:
        results = sparql.query().convert()
        # Rate limiting - wait ONCE after each query. (The sleep was
        # previously duplicated, doubling the delay to 2x QUERY_DELAY.)
        time.sleep(QUERY_DELAY)
        if results['results']['bindings']:
            # Return first match; q_number is the URI tail, e.g. "Q12345"
            result = results['results']['bindings'][0]
            q_number = result['item']['value'].split('/')[-1]
            label = result['itemLabel']['value']
            description = result.get('itemDescription', {}).get('value', '')
            viaf = result.get('viaf', {}).get('value', None)
            isil = result.get('isil', {}).get('value', None)
            founded = result.get('founded', {}).get('value', None)
            return {
                'q_number': q_number,
                'wikidata_label': label,
                'description': description,
                'viaf': viaf,
                'isil': isil,
                'founded': founded
            }
    except Exception as e:
        print(f" ⚠️ SPARQL query failed: {e}")
        time.sleep(QUERY_DELAY)  # Wait even on error (previously slept twice)
    return None
# =============================================================================
# FUZZY MATCHING
# =============================================================================
def calculate_match_score(name1: str, name2: str) -> float:
    """
    Compute a fuzzy similarity score between two institution names.

    Both names are lower-cased and stripped before comparison. Identical
    normalized names short-circuit to 1.0; otherwise difflib's
    SequenceMatcher ratio is used (could be upgraded to rapidfuzz later).

    Returns:
        Similarity score in the range 0.0-1.0.
    """
    from difflib import SequenceMatcher

    # Normalize both sides before comparing
    left = name1.lower().strip()
    right = name2.lower().strip()
    if left == right:
        # Exact match after normalization
        return 1.0
    # Fall back to character-sequence similarity
    return SequenceMatcher(None, left, right).ratio()
# =============================================================================
# SCHEMA v0.2.2 ENRICHMENT FUNCTIONS
# =============================================================================
def create_enrichment_entry(
    enrichment_type: str,
    enrichment_method: str,
    match_score: Optional[float] = None,
    verified: bool = False,
    enrichment_source: Optional[str] = None,
    enrichment_notes: Optional[str] = None
) -> Dict:
    """
    Build an EnrichmentHistoryEntry dict conforming to schema v0.2.2.

    Args:
        enrichment_type: EnrichmentTypeEnum value (e.g., "WIKIDATA_IDENTIFIER")
        enrichment_method: Description of the enrichment method
        match_score: Fuzzy match confidence (0.0-1.0); omitted when None
            (e.g., for manual or derived enrichments)
        verified: Whether manually verified (default False)
        enrichment_source: Source URL (e.g., https://www.wikidata.org)
        enrichment_notes: Free-text notes about the enrichment

    Returns:
        Dict conforming to the EnrichmentHistoryEntry class; optional
        fields are only present when provided.
    """
    # Mandatory fields; timestamp is timezone-aware UTC in ISO-8601 form.
    record: Dict = {
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': enrichment_method,
        'enrichment_type': enrichment_type,
        'verified': verified,
    }
    # Optional fields: match_score is kept even at 0.0 (only None omits it);
    # source/notes are dropped when empty or None.
    candidates = {
        'match_score': round(match_score, 3) if match_score is not None else None,
        'enrichment_source': enrichment_source if enrichment_source else None,
        'enrichment_notes': enrichment_notes if enrichment_notes else None,
    }
    record.update({key: value for key, value in candidates.items() if value is not None})
    return record
def add_wikidata_identifier(inst: Dict, wikidata_data: Dict, match_score: float) -> bool:
    """
    Add a Wikidata identifier (and VIAF when available) plus schema v0.2.2
    enrichment_history entries to an institution record, mutating it in place.

    Args:
        inst: Institution dict; 'identifiers' and
            'provenance.enrichment_history' lists are created as needed.
        wikidata_data: Result dict from query_wikidata_by_name(); its
            'q_number' already includes the 'Q' prefix (e.g. "Q12345").
        match_score: Fuzzy-match confidence recorded in the history entry.

    Returns:
        True if enrichment was added, False if the record already has a
        Wikidata identifier.
    """
    q_number = wikidata_data['q_number']

    # Check if already has Wikidata
    existing_ids = inst.get('identifiers', [])
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )
    if has_wikidata:
        print(f" ⚠️ Already has Wikidata identifier")
        return False

    # Add Wikidata identifier
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}"
    }
    if 'identifiers' not in inst:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)

    # Add VIAF if available
    if wikidata_data.get('viaf'):
        viaf_id = {
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_data['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_data['viaf']}"
        }
        inst['identifiers'].append(viaf_id)
        print(f" 📚 Added VIAF: {wikidata_data['viaf']}")

    # Create enrichment_history entry (SCHEMA v0.2.2)
    if 'provenance' not in inst:
        inst['provenance'] = {}
    if 'enrichment_history' not in inst['provenance']:
        inst['provenance']['enrichment_history'] = []

    # Wikidata enrichment entry.
    # BUG FIX: q_number already contains the 'Q' prefix; the previous
    # notes template prepended another 'Q', producing e.g. "(QQ12345)".
    wikidata_entry = create_enrichment_entry(
        enrichment_type='WIKIDATA_IDENTIFIER',
        enrichment_method='Wikidata SPARQL query with fuzzy name matching',
        match_score=match_score,
        verified=False,  # Automated enrichment
        enrichment_source='https://www.wikidata.org',
        enrichment_notes=f"Matched to '{wikidata_data['wikidata_label']}' ({q_number})"
    )
    inst['provenance']['enrichment_history'].append(wikidata_entry)

    # If VIAF was added, create separate enrichment entry
    if wikidata_data.get('viaf'):
        viaf_entry = create_enrichment_entry(
            enrichment_type='VIAF_IDENTIFIER',
            enrichment_method='VIAF identifier extracted from Wikidata entity',
            match_score=None,  # Derived data, not fuzzy matched
            verified=False,
            enrichment_source='https://viaf.org',
            enrichment_notes=f"Extracted from Wikidata {q_number}"  # was "Q{q_number}" -> "QQ..."
        )
        inst['provenance']['enrichment_history'].append(viaf_entry)

    print(f" ✅ Added Wikidata: {q_number} ({wikidata_data['wikidata_label']})")
    print(f" 📊 Match score: {match_score:.3f}")
    return True
# =============================================================================
# FILE I/O
# =============================================================================
def load_yaml(file_path: Path) -> List[Dict]:
    """Read a UTF-8 YAML file and return its deserialized content."""
    with file_path.open('r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_yaml(data: List[Dict], file_path: Path) -> None:
    """Serialize data to a UTF-8 YAML file, preserving key order."""
    # Block style, unicode kept as-is, insertion order preserved.
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
        'width': 120,
        'indent': 2,
    }
    with file_path.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
# =============================================================================
# MAIN ENRICHMENT WORKFLOW
# =============================================================================
def main():
    """
    Run the batch-20 Wikidata enrichment workflow end to end.

    Loads the batch-19 YAML, finds institutions without a Wikidata
    identifier, queries Wikidata for each (rate-limited), gates matches on
    MIN_MATCH_SCORE, writes the enriched YAML to OUTPUT_FILE, and prints a
    summary. A backup of the input data is written before any mutation.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 20 ENRICHMENT (SCHEMA v0.2.2)")
    print("=" * 80)
    print()
    print("🆕 FIRST PRODUCTION USE OF enrichment_history STRUCTURE")
    print(" Schema: v0.2.2 (structured provenance tracking)")
    print(" Target: 19 unenriched institutions")
    print()

    # Load data
    print(f"📖 Loading: {INPUT_FILE}")
    institutions = load_yaml(INPUT_FILE)
    print(f" Loaded {len(institutions)} institutions")
    print()

    # Find institutions that do not yet carry a Wikidata identifier
    unenriched = [
        inst for inst in institutions
        if not any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    ]
    print(f"🔍 Found {len(unenriched)} institutions without Wikidata")

    # TEST MODE: Limit number of institutions to enrich
    if TEST_MODE_LIMIT is not None:
        print(f"⚠️ TEST MODE: Limiting to first {TEST_MODE_LIMIT} institutions")
        unenriched = unenriched[:TEST_MODE_LIMIT]
    print()  # BUG FIX: blank line was printed twice (copy-paste duplication)

    if not unenriched:
        print("✅ All institutions already enriched!")
        return

    # Create backup before mutating anything
    backup_file = INPUT_FILE.with_suffix('.yaml.batch20_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()

    # Enrich institutions
    print(f"🔧 Starting Wikidata enrichment...")
    print()
    enriched_count = 0
    skipped_count = 0
    for i, inst in enumerate(unenriched, 1):
        inst_name = inst['name']
        inst_city = inst.get('locations', [{}])[0].get('city', 'Unknown')
        print(f"{i}/{len(unenriched)}. {inst_name} ({inst_city})")

        # Query Wikidata (rate-limited inside the helper)
        wikidata_data = query_wikidata_by_name(inst_name)
        if not wikidata_data:
            print(f" ❌ No Wikidata match found")
            skipped_count += 1
            print()
            continue

        # Gate on fuzzy-match confidence
        match_score = calculate_match_score(inst_name, wikidata_data['wikidata_label'])
        if match_score < MIN_MATCH_SCORE:
            print(f" ⚠️ Match score too low: {match_score:.3f} < {MIN_MATCH_SCORE}")
            print(f" 📝 Wikidata label: {wikidata_data['wikidata_label']}")
            skipped_count += 1
            print()
            continue

        # Add enrichment (mutates inst in place)
        if add_wikidata_identifier(inst, wikidata_data, match_score):
            enriched_count += 1
        print()

    # Save enriched data
    print("=" * 80)
    print(f"💾 Saving enriched data to: {OUTPUT_FILE}")
    save_yaml(institutions, OUTPUT_FILE)
    print()

    # Statistics
    total_with_wikidata = sum(
        1 for inst in institutions
        if any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    )
    # BUG FIX: guard against an empty input file (ZeroDivisionError)
    coverage = (total_with_wikidata / len(institutions) * 100) if institutions else 0.0
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"✅ Newly enriched: {enriched_count}")
    print(f"⏭️ Skipped (no match or low confidence): {skipped_count}")
    print(f"📊 Total with Wikidata: {total_with_wikidata}/{len(institutions)} ({coverage:.1f}%)")
    print()
    print("🎯 Schema v0.2.2 Features Used:")
    print(" - enrichment_history (list of EnrichmentHistoryEntry)")
    print(" - enrichment_type: WIKIDATA_IDENTIFIER, VIAF_IDENTIFIER")
    print(" - match_score: Fuzzy matching confidence (0.0-1.0)")
    print(" - verified: false (automated enrichment)")
    print(" - enrichment_source: Wikidata and VIAF URLs")
    print()
    print("✅ DONE")
# Script entry point: run the enrichment workflow only when executed directly.
if __name__ == '__main__':
    main()