glam/scripts/enrich_mexico_batch02.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

185 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
Mexican Wikidata Enrichment - Batch 2
Add Wikidata identifiers to 4 institutions with perfect name matches (100% score).
Institutions:
1. Archivo General de la Nación (Q2860534, VIAF: 159570855)
2. Museo Frida Kahlo (Q2663377, VIAF: 144233695)
3. Museo Soumaya (Q2097646, VIAF: 135048064)
4. Museo de Antropología de Xalapa (Q1841655, VIAF: 138582541)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Define enrichment data - Only 100% perfect matches from Wikidata SPARQL query
# Batch 2 enrichment data. Every entry scored a perfect (100%) name match in
# the Wikidata SPARQL query and was cross-checked against VIAF, so all rows
# share the same confidence and note; only (name, Q-number, VIAF id) vary.
BATCH_2_ENRICHMENTS = {
    name: {
        "wikidata": qid,
        "viaf": viaf_id,
        "confidence": 0.98,
        "notes": "Perfect name match (100% score) via SPARQL query and VIAF cross-reference",
    }
    for name, qid, viaf_id in [
        ("Archivo General de la Nación", "Q2860534", "159570855"),
        ("Museo Frida Kahlo", "Q2663377", "144233695"),
        ("Museo Soumaya", "Q2097646", "135048064"),
        ("Museo de Antropología de Xalapa", "Q1841655", "138582541"),
    ]
}
def _append_identifier(institution: dict, scheme: str, value: str, url: str) -> None:
    """Append one identifier record (scheme/value/url triple) to the institution."""
    institution['identifiers'].append({
        'identifier_scheme': scheme,
        'identifier_value': value,
        'identifier_url': url,
    })


def add_enrichment_identifiers(institution: dict, enrichment: dict) -> dict:
    """Add Wikidata and VIAF identifiers to an institution record.

    Mutates *institution* in place and returns it for convenience.
    An identifier is only appended when its scheme ('Wikidata' / 'VIAF')
    is not already present, and a provenance entry is recorded under
    ``provenance.enrichment_history`` only when at least one identifier
    was actually added — so untouched records stay byte-identical when
    the YAML is rewritten.

    Args:
        institution: Institution record (dict loaded from YAML); may lack
            'identifiers' or have it set to None.
        enrichment: Enrichment data with optional 'wikidata'/'viaf' ids
            plus required 'confidence' and 'notes' fields.

    Returns:
        The same (mutated) institution dict.
    """
    # Ensure the identifiers list exists; the key may be absent or an
    # explicit null in the source YAML.
    if 'identifiers' not in institution or institution['identifiers'] is None:
        institution['identifiers'] = []

    # Snapshot existing schemes once so we never append a duplicate scheme.
    existing_schemes = {i.get('identifier_scheme') for i in institution['identifiers']}

    # Collect 'Scheme:value' tags as we add, so the provenance record and
    # the appended identifiers can never disagree.
    added = []

    if 'wikidata' in enrichment and 'Wikidata' not in existing_schemes:
        q_number = enrichment['wikidata']
        _append_identifier(institution, 'Wikidata', q_number,
                           f'https://www.wikidata.org/wiki/{q_number}')
        added.append(f'Wikidata:{q_number}')

    if 'viaf' in enrichment and 'VIAF' not in existing_schemes:
        viaf_id = enrichment['viaf']
        _append_identifier(institution, 'VIAF', viaf_id,
                           f'https://viaf.org/viaf/{viaf_id}')
        added.append(f'VIAF:{viaf_id}')

    # Record provenance only when something new was actually added; the
    # original code created empty provenance structures unconditionally,
    # dirtying records that were not enriched.
    if added:
        history = institution.setdefault('provenance', {}).setdefault('enrichment_history', [])
        history.append({
            'enrichment_date': datetime.now(timezone.utc).isoformat(),
            'enrichment_method': 'Wikidata SPARQL query + VIAF cross-reference (Batch 2)',
            'identifiers_added': added,
            'confidence_score': enrichment['confidence'],
            'notes': enrichment['notes'],
        })
    return institution
def main(yaml_path: Path = Path('/Users/kempersc/apps/glam/data/instances/mexico/mexican_institutions_geocoded.yaml')) -> None:
    """Main enrichment workflow.

    Loads the institutions YAML, applies the Batch 2 enrichments to the
    matching institutions, rewrites the file in place, and prints a
    summary plus Wikidata coverage statistics.

    Args:
        yaml_path: Path to the geocoded institutions YAML file. Defaults
            to the original hard-coded location for backward compatibility.
    """
    print("Loading Mexican institutions YAML file...")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # safe_load returns None for an empty document; normalize so the
    # counting and coverage math below cannot blow up.
    institutions = institutions or []
    print(f"Loaded {len(institutions)} institutions")

    enriched_count = 0
    skipped_count = 0
    seen_names = set()  # every institution name encountered, for the not-found report

    for institution in institutions:
        institution_name = institution.get('name', '')
        seen_names.add(institution_name)
        enrichment = BATCH_2_ENRICHMENTS.get(institution_name)
        if enrichment is None:
            continue
        # 'identifiers' may be an explicit null in YAML; treat it as empty.
        existing_identifiers = institution.get('identifiers') or []
        has_wikidata = any(i.get('identifier_scheme') == 'Wikidata' for i in existing_identifiers)
        has_viaf = any(i.get('identifier_scheme') == 'VIAF' for i in existing_identifiers)
        if has_wikidata and has_viaf:
            print(f"\n⏭️ Skipping (already enriched): {institution_name}")
            skipped_count += 1
            continue
        print(f"\n✅ Enriching: {institution_name}")
        print(f" - Wikidata: {enrichment.get('wikidata', 'N/A')}")
        print(f" - VIAF: {enrichment.get('viaf', 'N/A')}")
        # Mutates the record in place; no rebinding needed.
        add_enrichment_identifiers(institution, enrichment)
        enriched_count += 1

    # Target institutions that never appeared in the dataset.
    not_found = [name for name in BATCH_2_ENRICHMENTS if name not in seen_names]
    if not_found:
        print("\n⚠️ Warning: Could not find these institutions in YAML:")
        for name in not_found:
            print(f" - {name}")

    print(f"\n📝 Summary:")
    print(f" - Newly enriched: {enriched_count}")
    print(f" - Already enriched (skipped): {skipped_count}")
    print(f" - Not found in dataset: {len(not_found)}")

    print(f"\nWriting to {yaml_path}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f,
                  default_flow_style=False,
                  allow_unicode=True,
                  sort_keys=False,
                  width=1000)
    print("✅ Batch 2 enrichment complete!")

    # Coverage statistics; guard the division so an empty dataset does not
    # raise ZeroDivisionError.
    total_wikidata = sum(
        1 for inst in institutions
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in (inst.get('identifiers') or []))
    )
    coverage = (total_wikidata / len(institutions)) * 100 if institutions else 0.0
    print(f"\n📊 Current Wikidata Coverage:")
    print(f" {total_wikidata}/{len(institutions)} institutions ({coverage:.1f}%)")


if __name__ == '__main__':
    main()