glam/scripts/enrich_chilean_batch3.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

230 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 3 Wikidata Enrichment
Target: 5 more university departments
- Universidad del Bío-Bío (Chillán) → Q2661431
- Universidad de Talca (Talca) → Q3244354
- Universidad de la Frontera (Temuco) → Q3244350
- Universidad de Magallanes (Punta Arenas) → Q3244396
- Universidad de Playa Ancha (Valparaíso) → Q3244389
Strategy: Direct Q-number mapping with exact matching (100% accuracy in Batch 2)
Expected result: 6 → 11 institutions (12.2% coverage)
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
import shutil
# File paths
# Input: the dataset produced by the Batch 2 enrichment run.
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch2_enriched.yaml")
# Output: written as a new file; the input is never modified in place.
OUTPUT_FILE = Path("data/instances/chile/chilean_institutions_batch3_enriched.yaml")
# Suffix appended to INPUT_FILE's path for the pre-run backup copy.
BACKUP_SUFFIX = ".batch3_backup"
# Batch 3 enrichment mappings (hardcoded Q-numbers)
# Keyed by canonical university name; exact_match() tests each key as a
# (case-insensitive) substring of an institution's name. Per-entry fields:
#   q_number:     Wikidata item ID attached to the record on a match
#   city:         expected city, matched against the record's city/region
#   region:       informational only - not read by the matching code
#   verification: human-readable note backing the manual Q-number lookup
BATCH3_MAPPINGS = {
"Universidad del Bío-Bío": {
"q_number": "Q2661431",
"city": "Chillán",
"region": "Diguillín",
"verification": "Chile's state university in Chillán, founded 1988"
},
"Universidad de Talca": {
"q_number": "Q3244354",
"city": "Talca",
"region": "Talca",
"verification": "State university in Talca, founded 1981"
},
"Universidad de la Frontera": {
"q_number": "Q3244350",
"city": "Temuco",
"region": "Cautín",
"verification": "State university in Temuco, founded 1981"
},
"Universidad de Magallanes": {
"q_number": "Q3244396",
"city": "Punta Arenas",
"region": "Magallanes",
"verification": "State university in Punta Arenas, founded 1961"
},
"Universidad de Playa Ancha": {
"q_number": "Q3244389",
"city": "Valparaíso",
"region": "Valparaíso",
"verification": "State university in Valparaíso, founded 1948"
}
}
def exact_match(institution_name: str, target_name: str, city: str, target_city: str) -> bool:
    """
    Exact matching strategy (zero false positives).

    A candidate matches when BOTH hold (case-insensitive substring tests):
      1. The target university name appears within the institution name.
      2. The target city appears within the institution's city/region value.

    NOTE: restricting matches to EDUCATION_PROVIDER institutions is the
    caller's responsibility - this function does not inspect institution
    type (the old docstring incorrectly listed it as a criterion here).

    Args:
        institution_name: Name of the institution record being checked.
        target_name: Canonical university name from BATCH3_MAPPINGS.
        city: City or region recorded for the institution (may be empty or None).
        target_city: Expected city from the mapping.

    Returns:
        True only when both the name and the city criteria are satisfied.
    """
    name_lower = institution_name.lower()
    # Guard: an empty/missing city can never satisfy the city criterion.
    city_lower = city.lower() if city else ""
    # Substring containment lets department-level records such as
    # "Biblioteca, Universidad de Talca" match their parent university.
    name_match = target_name.lower() in name_lower
    city_match = target_city.lower() in city_lower
    return name_match and city_match
def _has_wikidata(institution: dict) -> bool:
    """Return True if the record already carries a Wikidata identifier."""
    # `or []` also tolerates an explicit `identifiers: null` in the YAML,
    # which would otherwise raise TypeError when iterated.
    return any(i.get('identifier_scheme') == 'Wikidata'
               for i in institution.get('identifiers') or [])


def _wikidata_coverage(institutions: list) -> int:
    """Count institution records that carry a Wikidata identifier."""
    return sum(1 for inst in institutions if _has_wikidata(inst))


def enrich_institutions():
    """Run the Batch 3 enrichment end to end.

    Steps: load INPUT_FILE, back it up, match EDUCATION_PROVIDER records
    against BATCH3_MAPPINGS via exact_match(), attach Wikidata identifiers
    and provenance notes, save the result to OUTPUT_FILE and print a
    summary report.

    Side effects:
        - Reads INPUT_FILE (YAML list of institution dicts).
        - Writes a backup copy of INPUT_FILE (BACKUP_SUFFIX appended).
        - Writes the enriched dataset to OUTPUT_FILE.
        - Prints progress and statistics to stdout.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 3 WIKIDATA ENRICHMENT")
    print("=" * 80)
    print()
    # Load institutions
    print(f"📖 Loading institutions from: {INPUT_FILE}")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # safe_load returns None for an empty file; normalize so len() works
    # and the coverage percentages below cannot divide by zero.
    institutions = institutions or []
    total_institutions = len(institutions)
    print(f" Loaded {total_institutions} institutions")
    print()
    if not institutions:
        print("⚠️ Input file is empty - nothing to enrich.")
        return
    # Count current Wikidata coverage
    enriched_before = _wikidata_coverage(institutions)
    print(f"📊 Current Wikidata coverage: {enriched_before}/{total_institutions} ({enriched_before/total_institutions*100:.1f}%)")
    print()
    # Create backup
    backup_file = str(INPUT_FILE) + BACKUP_SUFFIX
    print(f"💾 Creating backup: {backup_file}")
    shutil.copy2(INPUT_FILE, backup_file)
    print()
    # Enrich institutions
    print("🔍 Starting Batch 3 enrichment...")
    print()
    enriched_count = 0
    skipped_count = 0
    for institution in institutions:
        name = institution.get('name', '')
        # Only process EDUCATION_PROVIDER institutions
        if institution.get('institution_type', '') != 'EDUCATION_PROVIDER':
            continue
        # Need at least one location to match on; silently skip otherwise.
        locations = institution.get('locations', [])
        if not locations:
            continue
        location = locations[0]
        # Prefer the city, fall back to the region for matching.
        city_or_region = location.get('city', '') or location.get('region', '')
        # Skip records already enriched in earlier batches.
        if _has_wikidata(institution):
            continue
        # Try to match with Batch 3 mappings
        matched = False
        for target_name, mapping in BATCH3_MAPPINGS.items():
            if exact_match(name, target_name, city_or_region, mapping['city']):
                q_number = mapping['q_number']
                print(f"✅ MATCH: {name}")
                print(f" Location: {city_or_region}")
                print(f" Q-number: {q_number}")
                print(f" Verification: {mapping['verification']}")
                # Add Wikidata identifier
                wikidata_id = {
                    'identifier_scheme': 'Wikidata',
                    'identifier_value': q_number,
                    'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
                }
                # Handles a missing key AND an explicit `identifiers: null`.
                institution['identifiers'] = institution.get('identifiers') or []
                institution['identifiers'].append(wikidata_id)
                # Update extraction method in provenance
                if 'provenance' in institution:
                    current_method = institution['provenance'].get('extraction_method', '')
                    institution['provenance']['extraction_method'] = (
                        f"{current_method} + Wikidata enrichment "
                        f"(Batch 3, parent: {target_name}, exact match)"
                    )
                enriched_count += 1
                matched = True
                print()
                break
        if not matched:
            # Only un-enriched EDUCATION_PROVIDER records with a location
            # reach this point, so no extra type check is needed here.
            skipped_count += 1
    print("=" * 80)
    print(f"📊 Batch 3 Enrichment Summary")
    print("=" * 80)
    print(f"✅ Enriched: {enriched_count} institutions")
    print(f"⏭️ Skipped: {skipped_count} institutions (no match)")
    print()
    # Count final Wikidata coverage
    enriched_after = _wikidata_coverage(institutions)
    print(f"📈 New Wikidata coverage: {enriched_after}/{total_institutions} ({enriched_after/total_institutions*100:.1f}%)")
    print(f" Improvement: +{enriched_after - enriched_before} institutions")
    print()
    # Save enriched dataset
    print(f"💾 Saving enriched dataset to: {OUTPUT_FILE}")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()
    print("✅ Batch 3 enrichment complete!")
    print()
    print("📁 Files:")
    print(f" Input: {INPUT_FILE}")
    print(f" Output: {OUTPUT_FILE}")
    print(f" Backup: {backup_file}")
    print()
    # Next steps
    print("🎯 Next Steps:")
    if enriched_after < 20:
        remaining = 20 - enriched_after
        print(f" - Need {remaining} more institutions to reach 22.2% coverage goal (20 institutions)")
        print(f" - Consider Batch 4: Major Santiago museums or regional universities")
    else:
        print(f" - 🎉 GOAL ACHIEVED! 22.2% coverage reached ({enriched_after} institutions)")
    print()
# Script entry point: run the full Batch 3 enrichment pipeline.
if __name__ == "__main__":
    enrich_institutions()