# glam/scripts/enrich_chilean_batch10.py

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 10 Wikidata Enrichment
Single manual enrichment: Servicio Nacional del Patrimonio Cultural

Target: 55/90 institutions (61.1% coverage)

Note: This organization was reformed from "Consejo de Monumentos Nacionales"
in 2017 under Ley 21.045. We're using Q5784049 (the predecessor's Wikidata entry)
and documenting the organizational change.
"""

import yaml
from pathlib import Path
from datetime import datetime, timezone

# Batch 10 payload: a single official institution whose Wikidata link is
# recorded together with a NAME_CHANGE event.
BATCH_10_ENRICHMENT = {
    # Keys used to locate the record in the dataset.
    "name": "Servicio Nacional del Patrimonio Cultural",
    "city": "Santiago",
    # Wikidata linkage details.
    "q_number": "Q5784049",
    "wikidata_name": "National Monuments Council / Consejo de Monumentos Nacionales",
    "confidence": "high",
    "notes": (
        "Wikidata Q5784049 refers to Consejo de Monumentos Nacionales (1925-2017). "
        "Organization was reformed and renamed in 2017 under Ley 21.045 "
        "but maintains institutional continuity."
    ),
    # Organizational change copied into the record's change history.
    "change_event": {
        "event_id": "https://w3id.org/heritage/custodian/event/cl-snpc-reform-2017",
        "change_type": "NAME_CHANGE",
        "event_date": "2017-11-03",
        # Three-line description, joined with newlines.
        "event_description": "\n".join([
            "Reformed from Consejo de Monumentos Nacionales to Servicio Nacional del Patrimonio Cultural under Ley 21.045 (November 3, 2017).",
            "Created as part of the new Ministerio de las Culturas, las Artes y el Patrimonio.",
            "The organization maintains institutional continuity from 1925 founding, but with expanded mandate and modern governance structure.",
        ]),
        "source_documentation": "https://www.leychile.cl/N?i=1110097",
    },
}

def load_yaml(file_path: Path) -> list:
    """Read *file_path* as UTF-8 text and return the parsed YAML document.

    Parsing goes through ``yaml.safe_load``; whatever that call produces is
    returned unchanged.
    """
    with open(file_path, mode='r', encoding='utf-8') as stream:
        parsed = yaml.safe_load(stream)
    return parsed

def save_yaml(data: list, file_path: Path) -> None:
    """Serialize *data* as YAML into *file_path* (UTF-8, overwriting).

    The dump options are fixed: no default flow style, unicode allowed,
    keys left unsorted, line width 120 and an indent of 2.
    """
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
        'width': 120,
        'indent': 2,
    }
    with open(file_path, mode='w', encoding='utf-8') as stream:
        yaml.dump(data, stream, **dump_options)

def find_institution(institutions: list, name: str, city: str) -> dict:
    """Return the first institution record matching *name* and *city*.

    A record matches when its ``name`` equals *name* and the ``city`` of its
    first ``locations`` entry equals *city*. When *city* is ``"Santiago"``
    the city comparison is waived, so any record with the right name matches.

    Records without a ``name`` are skipped. A missing, null or empty
    ``locations`` list (or a null first entry) is treated as "no city"
    instead of raising.

    Args:
        institutions: Institution records (dicts) to search, in order.
        name: Exact institution name to look for.
        city: City expected on the record's first location.

    Returns:
        The matching record itself (not a copy).

    Raises:
        ValueError: If no record matches.
    """
    for inst in institutions:
        if inst.get('name') != name:
            continue

        # `or` covers both a missing key and an explicit null / empty list,
        # which would otherwise break the `[0]` lookup.
        locations = inst.get('locations') or [{}]
        first_location = locations[0] or {}
        inst_city = first_location.get('city', '')

        # "Santiago" acts as a wildcard: the name alone decides the match.
        if inst_city == city or city == "Santiago":
            return inst

    raise ValueError(f"Institution not found: {name} ({city})")

def enrich_institution(inst: dict, enrichment: dict) -> None:
    """Attach the Wikidata identifier and change event from *enrichment* to *inst*.

    The record is modified in place. If it already carries an identifier whose
    ``identifier_scheme`` is ``'Wikidata'``, a warning is printed and nothing
    is changed. Otherwise the function:

    * appends a Wikidata entry to ``inst['identifiers']``,
    * appends the change event to ``inst['change_history']``,
    * records method, timestamp (UTC, ISO 8601) and confidence under
      ``inst['provenance']``, and appends a note to ``provenance['notes']``
      (a pre-existing string note is first wrapped into a list).

    Keys that exist but hold ``None`` (e.g. an empty ``identifiers:`` entry in
    the YAML source) are handled like missing keys instead of raising.

    Args:
        inst: Institution record to update; must contain ``name``.
        enrichment: Mapping providing ``q_number``, ``wikidata_name``,
            ``confidence``, ``notes`` and a ``change_event`` mapping with
            ``event_id``, ``change_type``, ``event_date``,
            ``event_description`` and ``source_documentation``.
    """
    # Skip records that are already linked; `or []` tolerates a null value.
    existing_ids = inst.get('identifiers') or []
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )

    if has_wikidata:
        print(f"  ⚠️  {inst['name']} already has Wikidata identifier")
        return

    # Add Wikidata identifier
    q_number = enrichment['q_number']
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}"
    }

    if inst.get('identifiers') is None:
        inst['identifiers'] = []

    inst['identifiers'].append(wikidata_id)
    print(f"  ✅ Added Wikidata: {q_number} ({enrichment['wikidata_name']})")

    # Add organizational change event
    if inst.get('change_history') is None:
        inst['change_history'] = []

    change_event = enrichment['change_event']
    inst['change_history'].append({
        'event_id': change_event['event_id'],
        'change_type': change_event['change_type'],
        'event_date': change_event['event_date'],
        'event_description': change_event['event_description'],
        'source_documentation': change_event['source_documentation']
    })
    print(f"  📝 Added change event: {change_event['change_type']} ({change_event['event_date']})")

    # Update provenance
    if inst.get('provenance') is None:
        inst['provenance'] = {}
    provenance = inst['provenance']

    provenance['enrichment_method'] = 'Manual Wikidata linkage (Batch 10 - Official Institution)'
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_match_confidence'] = enrichment['confidence']

    # Add notes: normalise a missing/null value to a list and wrap a lone
    # string so the new note can always be appended.
    existing_notes = provenance.get('notes')
    if existing_notes is None:
        provenance['notes'] = []
    elif isinstance(existing_notes, str):
        provenance['notes'] = [existing_notes]

    provenance['notes'].append(
        f"Batch 10: {enrichment['notes']}"
    )

    print("  💡 Note: Organization reformed from Consejo de Monumentos Nacionales (1925) to current name (2017)")

def _has_wikidata(inst: dict) -> bool:
    """Return True if *inst* lists an identifier with the 'Wikidata' scheme."""
    # `or []` tolerates both a missing key and an explicit null value.
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers') or []
    )

def main():
    """Run the Batch 10 enrichment end to end and print a coverage report.

    Steps: load the Batch 8 dataset, write a backup copy next to it, apply
    ``BATCH_10_ENRICHMENT`` to the matching institution, save the result to
    the Batch 10 output file, then print overall and per-type Wikidata
    coverage. Failures while locating or enriching the institution are
    reported on stdout and do not prevent the output file from being written.
    """
    from collections import defaultdict

    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 10 ENRICHMENT")
    print("Official Institution with Organizational Change Event")
    print("=" * 80)
    print()

    # Load data
    input_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    print(f"📖 Loading: {input_file}")
    institutions = load_yaml(input_file)
    print(f"   Loaded {len(institutions)} institutions")
    print()

    # Create backup
    backup_file = input_file.with_suffix('.yaml.batch10_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()

    # Apply enrichment
    print("🔧 Applying enrichment...")
    print()

    enrichment = BATCH_10_ENRICHMENT
    print(f"1. {enrichment['name']} ({enrichment['city']})")

    enriched_count = 0
    try:
        inst = find_institution(institutions, enrichment['name'], enrichment['city'])
        # enrich_institution() silently skips records that are already
        # linked, so remember the prior state to avoid counting a no-op.
        already_linked = _has_wikidata(inst)
        enrich_institution(inst, enrichment)
        if not already_linked:
            enriched_count = 1
    except ValueError as e:
        print(f"  ❌ {e}")
    except Exception as e:
        # Top-level boundary of the script: report and continue to the summary.
        print(f"  ❌ Error: {e}")

    print()

    # Save enriched data
    output_file = Path('data/instances/chile/chilean_institutions_batch10_enriched.yaml')
    print(f"💾 Saving enriched data: {output_file}")
    save_yaml(institutions, output_file)
    print()

    # Statistics
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print()

    total = len(institutions)
    with_wikidata = sum(1 for inst in institutions if _has_wikidata(inst))
    # Guard against an empty dataset, which would otherwise divide by zero.
    coverage_pct = with_wikidata / total * 100 if total else 0.0

    print(f"Total institutions: {total}")
    print(f"With Wikidata: {with_wikidata} ({coverage_pct:.1f}%)")
    print(f"Batch 10 enrichments: {enriched_count}")
    print()

    # By type
    by_type = defaultdict(lambda: {'total': 0, 'with_wd': 0})

    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        by_type[inst_type]['total'] += 1
        if _has_wikidata(inst):
            by_type[inst_type]['with_wd'] += 1

    print("Coverage by type:")
    for inst_type in sorted(by_type.keys()):
        stats = by_type[inst_type]
        pct = stats['with_wd']/stats['total']*100 if stats['total'] > 0 else 0
        status = "✅" if pct == 100 else "⭐" if pct >= 50 else ""
        print(f"  {status} {inst_type}: {stats['with_wd']}/{stats['total']} ({pct:.1f}%)")
    print()

    print("🎉 Batch 10 enrichment complete!")
    print(f"📊 New coverage: {with_wikidata}/{total} ({coverage_pct:.1f}%)")
    print()
    print("📝 Key findings:")
    print("  - Other Batch 10 targets (foundations, cultural centers) not in Wikidata")
    print("  - Recommendation: Focus Batch 11 on remaining museums (13 institutions)")
    print("  - Potential to reach 70%+ coverage with museum-focused enrichment")

# Run the enrichment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()