glam/scripts/enrich_custodian_emic_names.py
kempersc 6a6557bbe8 feat(enrichment): add emic name enrichment and update CustodianName schema
- Add emic_name, name_language, standardized_name to CustodianName
- Add scripts for enriching custodian emic names from Wikidata
- Add YouTube and Google Maps enrichment scripts
- Update DuckLake loader for new schema fields
2025-12-08 14:58:50 +01:00

557 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Enrich UNESCO MoW custodian files with proper CustodianName data.
This script:
1. Loads multilingual labels from Wikidata cache
2. Determines the appropriate emic (local language) name for each custodian
3. Updates custodian YAML files with:
- custodian_name.emic_name (local language name)
- custodian_name.name_language (ISO 639-1 code)
- custodian_name.standardized_name (same as emic_name for now)
4. Regenerates abbreviations from local language names if different
5. Updates GHCIDs and maintains history for changed abbreviations
Per AGENTS.md: The abbreviation and optional snake_case name suffix should be
derived from the emic name in the institution's official local/national language.
"""
import json
import yaml
import unicodedata
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, List, Tuple
# Country to primary official language(s) mapping
# Format: country_name -> (primary_lang, fallback_langs)
# For multilingual countries, we'll check if the institution has a label in any official language
# NOTE: codes are ISO 639-1 where one exists; a few entries use longer
# ISO 639-2/3 codes ('tet', 'tpi', 'gil', 'pau', 'pap') because no 639-1
# code exists for those languages — get_emic_name just treats them as
# Wikidata label keys, so the mixed lengths are intentional.
# Keys must match the 'country' field of the cached custodian records exactly
# (hence duplicate spellings such as 'Czech Republic'/'Czechia',
# 'East Timor'/'Timor-Leste', "Ivory Coast"/"Côte d'Ivoire").
COUNTRY_LANGUAGE_MAP = {
    # Europe - Western
    'Germany': ('de', []),
    'Austria': ('de', []),
    'France': ('fr', []),
    'Netherlands': ('nl', []),
    'Belgium': ('nl', ['fr', 'de']),  # Check all three official languages
    'Luxembourg': ('lb', ['fr', 'de']),
    'Switzerland': ('de', ['fr', 'it', 'rm']),  # Check all four national languages
    'United Kingdom': ('en', []),
    'Ireland': ('en', ['ga']),
    # Europe - Northern
    'Norway': ('nb', ['nn', 'no']),  # Bokmål preferred, Nynorsk fallback
    'Sweden': ('sv', []),
    'Denmark': ('da', []),
    'Finland': ('fi', ['sv']),  # Swedish is also official
    'Iceland': ('is', []),
    # Europe - Southern
    'Spain': ('es', ['ca', 'eu', 'gl']),  # Regional languages
    'Portugal': ('pt', []),
    'Italy': ('it', []),
    'Greece': ('el', []),
    'Malta': ('mt', ['en']),
    'Cyprus': ('el', ['tr']),
    # Europe - Central/Eastern
    'Poland': ('pl', []),
    'Czech Republic': ('cs', []),
    'Czechia': ('cs', []),
    'Slovakia': ('sk', []),
    'Hungary': ('hu', []),
    'Slovenia': ('sl', []),
    'Croatia': ('hr', []),
    'Serbia': ('sr', []),
    'Bosnia and Herzegovina': ('bs', ['hr', 'sr']),
    'North Macedonia': ('mk', []),
    'Albania': ('sq', []),
    'Bulgaria': ('bg', []),
    'Romania': ('ro', []),
    'Moldova': ('ro', []),
    'Ukraine': ('uk', []),
    'Belarus': ('be', ['ru']),
    'Russia': ('ru', []),
    'Estonia': ('et', []),
    'Latvia': ('lv', []),
    'Lithuania': ('lt', []),
    # Americas
    'United States': ('en', []),
    'Canada': ('en', ['fr']),
    'Mexico': ('es', []),
    'Brazil': ('pt', []),
    'Argentina': ('es', []),
    'Chile': ('es', []),
    'Colombia': ('es', []),
    'Peru': ('es', []),
    'Venezuela': ('es', []),
    'Ecuador': ('es', []),
    'Bolivia': ('es', []),
    'Paraguay': ('es', ['gn']),
    'Uruguay': ('es', []),
    'Cuba': ('es', []),
    'Dominican Republic': ('es', []),
    'Puerto Rico': ('es', ['en']),
    'Costa Rica': ('es', []),
    'Panama': ('es', []),
    'Guatemala': ('es', []),
    'Honduras': ('es', []),
    'El Salvador': ('es', []),
    'Nicaragua': ('es', []),
    'Jamaica': ('en', []),
    'Trinidad and Tobago': ('en', []),
    'Barbados': ('en', []),
    'Suriname': ('nl', []),
    'Guyana': ('en', []),
    # Asia - East
    'Japan': ('ja', []),
    "People's Republic of China": ('zh', []),
    'China': ('zh', []),
    'Taiwan': ('zh', []),
    'South Korea': ('ko', []),
    'North Korea': ('ko', []),
    'Mongolia': ('mn', []),
    # Asia - Southeast
    'Vietnam': ('vi', []),
    'Thailand': ('th', []),
    'Cambodia': ('km', []),
    'Laos': ('lo', []),
    'Myanmar': ('my', []),
    'Malaysia': ('ms', []),
    'Singapore': ('en', ['zh', 'ms', 'ta']),
    'Indonesia': ('id', []),
    'Philippines': ('tl', ['en']),
    'Brunei': ('ms', []),
    'East Timor': ('pt', ['tet']),
    'Timor-Leste': ('pt', ['tet']),
    # Asia - South
    'India': ('hi', ['en', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'or']),
    'Pakistan': ('ur', ['en']),
    'Bangladesh': ('bn', []),
    'Sri Lanka': ('si', ['ta']),
    'Nepal': ('ne', []),
    'Bhutan': ('dz', []),
    'Maldives': ('dv', []),
    # Asia - Central
    'Kazakhstan': ('kk', ['ru']),
    'Uzbekistan': ('uz', []),
    'Turkmenistan': ('tk', []),
    'Kyrgyzstan': ('ky', ['ru']),
    'Tajikistan': ('tg', []),
    'Afghanistan': ('ps', ['fa']),
    # Asia - West / Middle East
    'Turkey': ('tr', []),
    'Iran': ('fa', []),
    'Iraq': ('ar', ['ku']),
    'Syria': ('ar', []),
    'Lebanon': ('ar', []),
    'Jordan': ('ar', []),
    'Israel': ('he', ['ar']),
    'Palestine': ('ar', []),
    'Saudi Arabia': ('ar', []),
    'United Arab Emirates': ('ar', []),
    'Kuwait': ('ar', []),
    'Qatar': ('ar', []),
    'Bahrain': ('ar', []),
    'Oman': ('ar', []),
    'Yemen': ('ar', []),
    'Georgia': ('ka', []),
    'Armenia': ('hy', []),
    'Azerbaijan': ('az', []),
    # Africa - North
    'Egypt': ('ar', []),
    'Libya': ('ar', []),
    'Tunisia': ('ar', ['fr']),
    'Algeria': ('ar', ['fr']),
    'Morocco': ('ar', ['fr']),
    # Africa - West
    'Nigeria': ('en', []),
    'Ghana': ('en', []),
    'Senegal': ('fr', []),
    'Ivory Coast': ('fr', []),
    "Côte d'Ivoire": ('fr', []),
    'Mali': ('fr', []),
    'Burkina Faso': ('fr', []),
    'Niger': ('fr', []),
    'Benin': ('fr', []),
    'Togo': ('fr', []),
    'Guinea': ('fr', []),
    'Sierra Leone': ('en', []),
    'Liberia': ('en', []),
    'Mauritania': ('ar', ['fr']),
    'Cape Verde': ('pt', []),
    'Gambia': ('en', []),
    # Africa - East
    'Kenya': ('sw', ['en']),
    'Tanzania': ('sw', ['en']),
    'Uganda': ('en', ['sw']),
    'Rwanda': ('rw', ['fr', 'en']),
    'Burundi': ('rn', ['fr']),
    'Ethiopia': ('am', []),
    'Eritrea': ('ti', ['ar']),
    'Somalia': ('so', ['ar']),
    'Djibouti': ('fr', ['ar']),
    'Madagascar': ('mg', ['fr']),
    'Mauritius': ('en', ['fr']),
    'Seychelles': ('en', ['fr']),
    # Africa - Central
    'Democratic Republic of the Congo': ('fr', []),
    'Republic of the Congo': ('fr', []),
    'Central African Republic': ('fr', []),
    'Chad': ('fr', ['ar']),
    'Cameroon': ('fr', ['en']),
    'Gabon': ('fr', []),
    'Equatorial Guinea': ('es', ['fr', 'pt']),
    # Africa - Southern
    'South Africa': ('en', ['af', 'zu', 'xh']),
    'Namibia': ('en', ['de', 'af']),
    'Botswana': ('en', ['tn']),
    'Zimbabwe': ('en', ['sn', 'nd']),
    'Zambia': ('en', []),
    'Malawi': ('en', []),
    'Mozambique': ('pt', []),
    'Angola': ('pt', []),
    'Lesotho': ('en', ['st']),
    'Eswatini': ('en', ['ss']),
    # Oceania
    'Australia': ('en', []),
    'New Zealand': ('en', ['mi']),
    'Papua New Guinea': ('en', ['tpi', 'ho']),
    'Fiji': ('en', ['fj', 'hi']),
    'Vanuatu': ('en', ['fr', 'bi']),
    'Samoa': ('sm', ['en']),
    'Tonga': ('to', ['en']),
    'Solomon Islands': ('en', []),
    'Kiribati': ('en', ['gil']),
    'Micronesia': ('en', []),
    'Palau': ('en', ['pau']),
    # Caribbean
    'Haiti': ('ht', ['fr']),
    'Bahamas': ('en', []),
    'Curaçao': ('nl', ['pap']),
    'Aruba': ('nl', ['pap']),
    # Default fallback
    'Unknown': ('en', []),
}
def normalize_diacritics(text: str) -> str:
    """Strip combining diacritical marks from *text*.

    Decomposes the string with NFD so accented characters split into a base
    character plus combining marks (Unicode category 'Mn'), then drops the
    marks. Characters with no decomposition (e.g. 'ø') pass through unchanged.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
# Articles, prepositions and conjunctions to ignore when abbreviating names.
# Hoisted to module level (frozenset) so it is built once, not on every call.
_SKIP_WORDS = frozenset({
    # Dutch
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', "'s",
    'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder', 'door', 'en', 'of',
    # English
    'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as', 'under',
    'and', 'or', 'but',
    # French
    'le', 'la', 'les', 'un', 'une', 'des', 'du', 'à', 'au', 'aux', 'dans', 'sur', 'sous',
    'pour', 'par', 'avec', "l'", "d'", 'et', 'ou',
    # German
    'der', 'die', 'das', 'dem', 'den', 'ein', 'eine', 'einer', 'einem', 'einen',
    'von', 'zu', 'für', 'bei', 'nach', 'aus', 'vor', 'über', 'unter', 'durch', 'und', 'oder',
    # Spanish
    'el', 'los', 'las', 'unos', 'unas', 'del', 'al', 'con', 'por', 'para', 'sobre', 'bajo',
    'y', 'o', 'e', 'u',
    # Portuguese
    'o', 'os', 'as', 'um', 'uma', 'uns', 'umas', 'do', 'da', 'dos', 'das', 'em', 'no', 'na',
    'nos', 'nas', 'com', 'sob',
    # Italian
    'il', 'lo', 'gli', 'uno', 'di', 'dello', 'della', 'dei', 'degli', 'delle',
    'allo', 'alla', 'ai', 'agli', 'alle', 'dal', 'dallo', 'dalla', 'dai', 'dagli', 'dalle',
    'nel', 'nello', 'nella', 'nei', 'negli', 'nelle', 'sul', 'sullo', 'sulla', 'sui', 'sugli',
    'sulle', 'per', 'tra', 'fra', 'ed', 'od',
    # Russian (transliterated)
    'i', 'v', 'na', 'pri',
})

# Punctuation stripped from a word before the skip-word / number checks.
# Compiled once at import time instead of on every word.
_WORD_PUNCT_RE = re.compile(r"[''`\",.:;!?()[\]{}]")
# Matches pure numeric ranges such as "1914-1918", which are skipped.
_NUMBER_RANGE_RE = re.compile(r'^\d+-\d+$')


def get_significant_words(text: str) -> List[str]:
    """Extract significant words from a name, skipping articles/prepositions.

    A word is significant when its lowercased, punctuation-stripped form is
    not in the multilingual skip list and is not a number or number range.
    The ORIGINAL word (with its casing and punctuation) is returned, since
    callers take initials from it.
    """
    significant = []
    for word in text.split():
        clean_word = _WORD_PUNCT_RE.sub('', word.lower())
        if clean_word and clean_word not in _SKIP_WORDS:
            # Skip pure numbers and numeric ranges (e.g. years like "1914-1918")
            if not clean_word.isdigit() and not _NUMBER_RANGE_RE.match(clean_word):
                significant.append(word)
    return significant
def generate_abbreviation(name: str, max_length: int = 10) -> str:
    """Build an abbreviation for *name* from the initials of its significant words.

    Diacritics are normalized away so the result is plain uppercase ASCII
    letters. If fewer than two initials survive, falls back to the first
    letters of the ASCII-normalized name. The result is capped at
    *max_length* characters.
    """
    words = get_significant_words(name)
    if not words:
        # No significant words at all: fall back to the first few raw words.
        words = name.split()[:3]

    initials = []
    for word in words:
        # Drop punctuation/symbols so the first character is meaningful.
        stripped = re.sub(r"[''`\",.:;!?()[\]{}&/\\+@#$%*|=<>~^_-]", '', word)
        if not stripped:
            continue
        letter = normalize_diacritics(stripped[0]).upper()
        if letter.isalpha():
            initials.append(letter)
    abbrev = ''.join(initials)

    # Too short to be a usable abbreviation: take leading letters of the name.
    if len(abbrev) < 2:
        letters_only = re.sub(r'[^A-Za-z]', '', normalize_diacritics(name))
        abbrev = letters_only[:3].upper()

    return abbrev[:max_length]
def get_emic_name(custodian: Dict, country: str, labels: Dict[str, str]) -> Tuple[str, str]:
    """
    Determine the appropriate emic (local language) name for a custodian.

    Tries the country's primary official language first, then its fallback
    languages, then English, then any available label; as a last resort the
    custodian's cached English name is used.
    Returns: (emic_name, language_code)
    """
    entry = COUNTRY_LANGUAGE_MAP.get(country)
    if entry is None:
        # Country not mapped - only English is considered.
        candidate_langs = ['en']
    else:
        primary, fallbacks = entry
        candidate_langs = [primary, *fallbacks]

    # First label available in an official language wins.
    for lang in candidate_langs:
        if lang in labels:
            return labels[lang], lang

    # English label as a general fallback.
    if 'en' in labels:
        return labels['en'], 'en'

    # Any label at all, in insertion order.
    if labels:
        lang = next(iter(labels))
        return labels[lang], lang

    # No labels whatsoever - fall back to the cached English name.
    return custodian.get('name_en', 'Unknown'), 'en'
def load_custodian_file(filepath: Path) -> Optional[Dict]:
    """Parse a custodian YAML file; return None (after printing) on any failure."""
    try:
        with filepath.open('r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        # Best-effort loader: report and let the caller decide how to proceed.
        print(f"Error loading {filepath}: {e}")
        return None
def save_custodian_file(filepath: Path, data: Dict):
    """Write *data* back to a custodian YAML file (unicode kept, key order preserved)."""
    with filepath.open('w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
def _index_custodians_by_wikidata_id(custodian_dir: Path) -> Dict[str, Path]:
    """Scan every custodian YAML file once and map Wikidata QID -> file path.

    The QID is read from original_entry.wikidata_id, falling back to
    wikidata_enrichment.wikidata_entity_id. The first file seen for a QID
    wins, mirroring the previous first-match-in-glob-order behaviour.
    """
    index: Dict[str, Path] = {}
    for filepath in custodian_dir.glob('*.yaml'):
        data = load_custodian_file(filepath)
        if not data:
            continue
        wikidata_id = data.get('original_entry', {}).get('wikidata_id') or \
                      data.get('wikidata_enrichment', {}).get('wikidata_entity_id')
        if wikidata_id and wikidata_id not in index:
            index[wikidata_id] = filepath
    return index


def main():
    """Enrich custodian YAML files with emic names from cached Wikidata labels.

    Reads the multilingual-label and custodian caches, writes emic_name /
    name_language / standardized_name into each matching custodian file,
    logs any abbreviation changes (without touching GHCIDs), and saves a
    JSON report of all changes.
    """
    # Paths
    project_root = Path(__file__).parent.parent
    custodian_dir = project_root / 'data' / 'custodian'
    cache_dir = project_root / 'data' / 'cache'

    # Load cached Wikidata labels keyed by QID.
    print("Loading multilingual labels...")
    with open(cache_dir / 'unesco_mow_multilingual_labels.json', 'r') as f:
        labels_data = json.load(f)
    wikidata_labels = labels_data['custodians']

    # Load the original custodian export (provides country + English name).
    print("Loading original custodian info...")
    with open(cache_dir / 'unesco_mow_custodians.json', 'r') as f:
        custodians_data = json.load(f)
    original_custodians = {c['wikidata_id']: c for c in custodians_data['custodians']}

    print(f"Processing {len(original_custodians)} UNESCO MoW custodians...\n")

    # Statistics. NOTE: 'ghcid_changed' is reserved for when GHCID updates are
    # actually applied; it is written to the log but never incremented here.
    stats = {
        'total': 0,
        'updated': 0,
        'abbreviation_changed': 0,
        'ghcid_changed': 0,
        'not_found': 0,
        'errors': 0,
        'already_enriched': 0,
    }

    # Track changes for reporting
    changes = []
    timestamp = datetime.now(timezone.utc).isoformat()

    # Build the QID -> file index ONCE. Previously each custodian re-globbed
    # the directory and re-parsed YAML files until a match was found, i.e.
    # O(custodians * files) YAML loads; this makes the lookup O(1) per QID.
    qid_to_file = _index_custodians_by_wikidata_id(custodian_dir)

    for qid, custodian in original_custodians.items():
        stats['total'] += 1

        custodian_file = qid_to_file.get(qid)
        if not custodian_file:
            stats['not_found'] += 1
            continue

        # Load full custodian data (fresh parse before mutating/saving).
        data = load_custodian_file(custodian_file)
        if not data:
            stats['errors'] += 1
            continue

        # Get labels for this custodian
        labels_info = wikidata_labels.get(qid, {})
        labels = labels_info.get('labels', {})
        if not labels:
            print(f" No labels found for {qid}")
            continue

        # Determine emic name
        country = custodian.get('country', 'Unknown')
        emic_name, lang_code = get_emic_name(custodian, country, labels)

        # Current values
        current_name = data.get('custodian_name', {}).get('claim_value', '')
        current_emic = data.get('custodian_name', {}).get('emic_name', '')

        # Check if already enriched with this exact emic_name
        if current_emic and current_emic == emic_name:
            stats['already_enriched'] += 1
            continue

        # Generate abbreviation from emic name
        new_abbrev = generate_abbreviation(emic_name)

        # Current abbreviation is the last dash-separated component of the GHCID.
        current_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
        current_abbrev = current_ghcid.split('-')[-1] if current_ghcid else ''

        # Update custodian_name
        if 'custodian_name' not in data:
            data['custodian_name'] = {}
        data['custodian_name']['emic_name'] = emic_name
        data['custodian_name']['name_language'] = lang_code
        data['custodian_name']['standardized_name'] = emic_name

        # Keep original English name as an alternative name if it differs.
        if current_name and current_name != emic_name:
            if 'alternative_names' not in data['custodian_name']:
                data['custodian_name']['alternative_names'] = []
            existing_names = [n.get('name') if isinstance(n, dict) else n
                              for n in data['custodian_name']['alternative_names']]
            if current_name not in existing_names:
                data['custodian_name']['alternative_names'].append({
                    'name': current_name,
                    'language': 'en',
                    'source': 'wikidata'
                })

        # Track change
        change_info = {
            'wikidata_id': qid,
            'file': custodian_file.name,
            'country': country,
            'old_name': current_name,
            'new_emic_name': emic_name,
            'language': lang_code,
            'old_abbrev': current_abbrev,
            'new_abbrev': new_abbrev,
        }

        # Check if abbreviation changed
        if new_abbrev != current_abbrev and current_abbrev:
            stats['abbreviation_changed'] += 1
            change_info['abbrev_changed'] = True
            # TODO: For now, we don't update GHCID - that requires more careful handling
            # with collision detection. Just log the change.
            print(f" ABBREV CHANGE: {custodian_file.name}")
            print(f" {country}: {current_name}")
            print(f" Emic ({lang_code}): {emic_name}")
            print(f" Abbrev: {current_abbrev} → {new_abbrev}")

        changes.append(change_info)

        # Save updated file
        save_custodian_file(custodian_file, data)
        stats['updated'] += 1

    # Print summary
    print("\n" + "=" * 60)
    print("ENRICHMENT SUMMARY")
    print("=" * 60)
    print(f"Total custodians processed: {stats['total']}")
    print(f"Files updated: {stats['updated']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Abbreviation changes detected: {stats['abbreviation_changed']}")
    print(f"Files not found: {stats['not_found']}")
    print(f"Errors: {stats['errors']}")

    # Save changes log
    changes_log = {
        'timestamp': timestamp,
        'stats': stats,
        'changes': changes
    }
    log_file = cache_dir / 'emic_name_enrichment_log.json'
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump(changes_log, f, indent=2, ensure_ascii=False)
    print(f"\nChanges log saved to: {log_file}")

    # Show sample of abbreviation changes
    abbrev_changes = [c for c in changes if c.get('abbrev_changed')]
    if abbrev_changes:  # was `if abbrev_changes[:10]:` — equivalent, clearer
        print("\n" + "-" * 60)
        print("Sample abbreviation changes (not yet applied to GHCID):")
        print("-" * 60)
        for c in abbrev_changes[:10]:
            print(f" {c['country']}: {c['old_abbrev']}{c['new_abbrev']}")
            print(f" EN: {c['old_name']}")
            print(f" {c['language'].upper()}: {c['new_emic_name']}")
            print()


if __name__ == '__main__':
    main()