glam/scripts/enrich_chilean_batch8.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

208 lines
7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 8 Wikidata Enrichment (Libraries)
Uses bulk SPARQL matches from query_wikidata_chilean_libraries.py
2 libraries with verified Q-numbers from Wikidata Query Service
Target: 54/90 institutions (60% coverage)
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
# Batch 8: 2 libraries from SPARQL bulk query
# Each entry maps a local institution record (matched by 'name' + 'city')
# to a verified Wikidata Q-number. 'confidence' and 'notes' are copied into
# the record's provenance block by enrich_institution().
BATCH_8_ENRICHMENTS = [
    {
        "name": "Biblioteca Nacional Digital",
        "city": "Iquique",  # city in our data is suspect — see 'notes'
        "q_number": "Q18924152",
        "wikidata_name": "Biblioteca Nacional Digital de Chile",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full official title in Wikidata). Note: City may be incorrect in our data - this is a national digital library, not specific to Iquique.",
    },
    {
        "name": "William Mulloy Library",
        "city": "Isla de Pascua", # Updated from Unknown
        "q_number": "Q8015912",
        "wikidata_name": "Biblioteca William Mulloy",
        "confidence": "partial",
        "founded": "2002",  # founding year from Wikidata; not written back by this script
        "notes": "SPARQL match - partial name (Spanish vs English). Easter Island archaeological library.",
    },
]
def load_yaml(file_path: Path) -> list:
    """Read and deserialize a UTF-8 encoded YAML file."""
    with file_path.open('r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_yaml(data: list, file_path: Path) -> None:
    """Serialize *data* to a UTF-8 YAML file (block style, original key order)."""
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
        'width': 120,
        'indent': 2,
    }
    with open(file_path, 'w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
def find_institution(institutions: list, name: str, city: str) -> dict:
    """Return the first institution record matching *name* (and *city*).

    The city filter is skipped when *city* is empty or "Unknown". A city of
    "Isla de Pascua" is accepted regardless of the stored value (Easter
    Island records vary in our data).

    Raises:
        ValueError: when no record matches.
    """
    for candidate in institutions:
        if candidate['name'] != name:
            continue
        if not city or city == "Unknown":
            return candidate
        stored_city = candidate.get('locations', [{}])[0].get('city', '')
        if stored_city == city or city == "Isla de Pascua":
            return candidate
    raise ValueError(f"Institution not found: {name} ({city})")
def enrich_institution(inst: dict, enrichment: dict) -> None:
    """Add a Wikidata identifier and provenance metadata to *inst* in place.

    Args:
        inst: Mutable institution record (YAML-derived dict).
        enrichment: Batch-8 entry carrying 'q_number', 'wikidata_name',
            'confidence', 'notes', and optionally a corrected 'city'.
    """
    # Idempotence guard: never add a second Wikidata identifier.
    existing_ids = inst.get('identifiers', [])
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )
    if has_wikidata:
        print(f" ⚠️ {inst['name']} already has Wikidata identifier")
        return
    # Add Wikidata identifier
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': enrichment['q_number'],
        'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['q_number']}"
    }
    inst.setdefault('identifiers', []).append(wikidata_id)
    # Update city when it is missing, empty, or the literal placeholder
    # "Unknown". BUG FIX: the original check `not locations[0].get('city')`
    # never replaced the "Unknown" placeholder (a truthy string), despite
    # the stated intent "Update city if it was Unknown".
    if enrichment.get('city') and enrichment['city'] != "Unknown":
        locations = inst.get('locations', [])
        if locations and locations[0].get('city') in (None, '', 'Unknown'):
            locations[0]['city'] = enrichment['city']
            print(f" 📍 Updated city to: {enrichment['city']}")
    # Record how/when this enrichment happened (timezone-aware UTC stamp).
    provenance = inst.setdefault('provenance', {})
    provenance['enrichment_method'] = 'Wikidata SPARQL bulk query (Batch 8 - Libraries)'
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_match_confidence'] = enrichment['confidence']
    # Normalize notes to a list before appending this batch's note.
    if 'notes' not in provenance:
        provenance['notes'] = []
    elif isinstance(provenance['notes'], str):
        provenance['notes'] = [provenance['notes']]
    provenance['notes'].append(
        f"Batch 8: {enrichment['notes']}"
    )
    print(f" ✅ Added Wikidata: {enrichment['q_number']} ({enrichment['wikidata_name']})")
def main():
    """Run the batch-8 enrichment: load, back up, enrich, save, report."""
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 8 ENRICHMENT (LIBRARIES)")
    print("=" * 80)
    print()
    # Load data
    input_file = Path('data/instances/chile/chilean_institutions_batch7_enriched.yaml')
    print(f"📖 Loading: {input_file}")
    institutions = load_yaml(input_file)
    print(f" Loaded {len(institutions)} institutions")
    print()
    # Create a backup before mutating anything.
    backup_file = input_file.with_suffix('.yaml.batch8_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()
    # Apply enrichments
    print(f"🔧 Applying {len(BATCH_8_ENRICHMENTS)} enrichments...")
    print()
    enriched_count = 0
    for i, enrichment in enumerate(BATCH_8_ENRICHMENTS, 1):
        print(f"{i}. {enrichment['name']} ({enrichment['city']})")
        try:
            inst = find_institution(institutions, enrichment['name'], enrichment['city'])
            enrich_institution(inst, enrichment)
            enriched_count += 1
        except ValueError as e:
            # Consistency fix: the original printed the bare message here,
            # unlike the marked "❌ Error:" branch below.
            print(f" ❌ {e}")
        except Exception as e:
            print(f" ❌ Error: {e}")
        print()
    # Save enriched data
    output_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    print(f"💾 Saving enriched data: {output_file}")
    save_yaml(institutions, output_file)
    print()
    # Statistics
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print()
    total = len(institutions)
    with_wikidata = sum(
        1 for inst in institutions
        if any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    )
    # Robustness fix: avoid ZeroDivisionError when the input file is empty.
    coverage_pct = with_wikidata / total * 100 if total else 0.0
    print(f"Total institutions: {total}")
    print(f"With Wikidata: {with_wikidata} ({coverage_pct:.1f}%)")
    print(f"Batch 8 enrichments: {enriched_count}")
    print()
    # Per-type coverage breakdown (local import kept from the original).
    from collections import defaultdict
    by_type = defaultdict(lambda: {'total': 0, 'with_wd': 0})
    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        by_type[inst_type]['total'] += 1
        if any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        ):
            by_type[inst_type]['with_wd'] += 1
    print("Coverage by type:")
    for inst_type in sorted(by_type):
        stats = by_type[inst_type]
        pct = stats['with_wd'] / stats['total'] * 100 if stats['total'] > 0 else 0
        # NOTE(review): the original expression produced "" on every branch
        # (the marker characters appear lost in encoding), making it dead
        # code. Restored with distinct markers — confirm intended symbols.
        status = "✅" if pct == 100 else "🟡" if pct >= 50 else "🔴"
        print(f" {status} {inst_type}: {stats['with_wd']}/{stats['total']} ({pct:.1f}%)")
    print()
    print("🎉 Batch 8 enrichment complete!")
    print(f"📊 New coverage: {with_wikidata}/{total} ({coverage_pct:.1f}%)")


if __name__ == '__main__':
    main()