glam/archive/scripts/brazil/enrich_brazilian_batch3.py
2025-11-19 23:25:22 +01:00

156 lines
5.8 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Enrich Brazilian institutions - Batch 3
Adds 5 more major institutions with verified Wikidata QIDs.
"""
import json
import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path
# Verified Wikidata QIDs for Batch 3 Brazilian institutions.
# Keys are name fragments that are substring-matched case-insensitively
# against institution names; values are the Wikidata Q-numbers to assign.
BRAZILIAN_BATCH3_QID_MAP: dict[str, str] = {
    "MAM-BA": "Q10333768",  # Museu de Arte Moderna da Bahia, Salvador
    "Centro Dragão do Mar": "Q5305525",  # Dragão do Mar Center, Fortaleza
    "CCBB Brasília": "Q56693296",  # Centro Cultural Banco do Brasil, Brasília
    "Parque Memorial Quilombo": "Q10345196",  # Quilombo dos Palmares Memorial Park
    "Museu Zoroastro Artiaga": "Q10333459",  # Museu Zoroastro Artiaga, Goiânia
}
def has_wikidata_id(institution: dict) -> bool:
    """Return True if *institution* already carries a real Wikidata Q-number.

    A "real" identifier is one whose scheme is ``Wikidata`` and whose value is
    ``Q`` followed by a number below 100,000,000 (larger values are treated as
    placeholders rather than genuine Q-numbers).
    """
    for entry in institution.get("identifiers", []):
        if entry.get("identifier_scheme") != "Wikidata":
            continue
        raw = entry.get("identifier_value", "")
        if not raw.startswith("Q") or not raw[1:].isdigit():
            continue
        try:
            numeric = int(raw[1:])
        except ValueError:
            # isdigit() accepts non-ASCII digit characters that int() rejects;
            # treat those as not a valid Q-number and keep scanning.
            continue
        if numeric < 100000000:
            return True
    return False
def add_wikidata_id(institution: dict, qid: str, matched_name: str) -> bool:
    """Attach a Wikidata identifier and a provenance entry to *institution*.

    Returns False (leaving the record untouched) when a real Wikidata ID is
    already present; otherwise appends the identifier, logs the enrichment in
    the provenance history, and returns True.
    """
    institution.setdefault("identifiers", [])

    # Never stack a second Wikidata ID on top of an existing real one.
    if has_wikidata_id(institution):
        return False

    institution["identifiers"].append({
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
    })

    # Record where this identifier came from and when it was added.
    history = institution.setdefault("provenance", {}).setdefault(
        "enrichment_history", []
    )
    history.append({
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": f"Manual Wikidata QID assignment - Batch 3 (matched: {matched_name})",
        "data_source": "Wikidata",
        "confidence_score": 1.0,
    })
    return True
def main() -> None:
    """Load the merged dataset, enrich Brazilian institutions, and write it back.

    Exits with status 1 when the input JSON is missing. After enrichment the
    JSON is backed up and rewritten in place, and the YAML mirror is
    regenerated if it exists.
    """
    input_file = Path("data/instances/global/global_heritage_institutions_merged.json")
    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)

    print("=" * 80)
    print("🇧🇷 BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT - BATCH 3")
    print("=" * 80)
    print(f"\n📖 Loading dataset from JSON: {input_file.name}")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f"✅ Loaded {len(institutions):,} institutions\n")

    # Restrict enrichment to institutions with at least one BR location.
    brazilian = [inst for inst in institutions if any(
        loc.get('country') == 'BR' for loc in inst.get('locations', [])
    )]
    print(f"🔍 Found {len(brazilian)} Brazilian institutions")

    # Guard: the coverage percentages below divide by len(brazilian); the
    # original crashed with ZeroDivisionError on a dataset with no BR entries.
    if not brazilian:
        print("⚠️ No Brazilian institutions found; nothing to enrich.")
        return

    # Count existing Wikidata IDs before this batch runs.
    existing_wd = sum(1 for inst in brazilian if has_wikidata_id(inst))
    print(f"📊 Currently enriched: {existing_wd}/{len(brazilian)} ({existing_wd/len(brazilian)*100:.1f}%)\n")

    enriched_count = 0
    matched_institutions = []
    for institution in brazilian:
        name = institution.get("name", "")
        # Case-insensitive substring match against the known Batch 3 names;
        # the first hit wins and the remaining patterns are skipped.
        for known_name, qid in BRAZILIAN_BATCH3_QID_MAP.items():
            if known_name.lower() in name.lower():
                if add_wikidata_id(institution, qid, known_name):
                    enriched_count += 1
                    # Filter above guarantees at least one location exists.
                    city = institution.get('locations', [{}])[0].get('city', 'Unknown')
                    matched_institutions.append({
                        "name": name,
                        "city": city,
                        "qid": qid,
                        "matched_pattern": known_name
                    })
                    print(f"✅ Enriched: {name} ({city})")
                    print(f" → Wikidata: {qid} (matched '{known_name}')")
                    print()
                break

    if enriched_count == 0:
        print(" No new institutions enriched (all already have Wikidata IDs)")
        return

    # Back up the JSON before overwriting it in place.
    backup_file = input_file.with_suffix('.json.batch3_backup')
    print(f"\n💾 Creating backup: {backup_file.name}")
    shutil.copy2(input_file, backup_file)

    print(f"💾 Writing updated JSON dataset...")
    with open(input_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    # Keep the YAML mirror in sync when present. yaml is a third-party
    # dependency only needed on this path, so it is imported lazily.
    yaml_file = Path("data/instances/global/global_heritage_institutions_merged.yaml")
    if yaml_file.exists():
        print(f"💾 Updating YAML file (this may take a minute)...")
        import yaml
        with open(yaml_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print(f"✅ YAML file updated")

    print("\n" + "=" * 80)
    print(f"✨ ENRICHMENT COMPLETE - BATCH 3")
    print("=" * 80)
    print(f"Institutions enriched in this batch: {enriched_count}")
    new_total = existing_wd + enriched_count
    print(f"\nBrazilian coverage: {new_total}/{len(brazilian)} = {(new_total / len(brazilian) * 100):.1f}%")
    print(f"\nMatched institutions:")
    for match in matched_institutions:
        print(f"{match['name']} ({match['city']}) → {match['qid']}")
# Run the enrichment only when executed as a script, not on import.
if __name__ == "__main__":
    main()