286 lines
9.7 KiB
Python
286 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix INST abbreviations by properly transliterating emic names.
|
|
|
|
This script:
|
|
1. Finds all files with INST as abbreviation
|
|
2. Extracts emic_name and name_language
|
|
3. Transliterates using transliterate_emic_names.py
|
|
4. Generates proper abbreviation
|
|
5. Updates GHCID and renames file
|
|
|
|
Usage:
|
|
python scripts/fix_inst_abbreviations.py --dry-run
|
|
python scripts/fix_inst_abbreviations.py
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import unicodedata
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, List, Tuple
|
|
|
|
import yaml
|
|
|
|
# Import transliteration function
|
|
from scripts.transliterate_emic_names import transliterate_for_abbreviation
|
|
|
|
# Words ignored when deriving an abbreviation: articles, prepositions and
# conjunctions for every language appearing in the data set.  Stored as one
# flat set, so words shared between languages collapse harmlessly.
SKIP_WORDS = {
    # English
    'a', 'an', 'the', 'of', 'in', 'at', 'on', 'to', 'for', 'with', 'from',
    'by', 'as', 'under', 'and', 'or', 'but',
    # Dutch
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der',
    'des', "'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over',
    'onder', 'door', 'en', 'of',
    # French
    'le', 'la', 'les', 'un', 'une', 'des', 'de', 'd', 'du', 'à', 'au',
    'aux', 'en', 'dans', 'sur', 'sous', 'pour', 'par', 'avec', 'l', 'et',
    'ou',
    # German
    'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer',
    'einem', 'einen', 'von', 'zu', 'für', 'mit', 'bei', 'nach', 'aus',
    'vor', 'über', 'unter', 'durch', 'und', 'oder',
    # Spanish
    'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'de', 'del',
    'a', 'al', 'en', 'con', 'por', 'para', 'sobre', 'bajo', 'y', 'o', 'e',
    'u',
    # Portuguese
    'o', 'a', 'os', 'as', 'um', 'uma', 'uns', 'umas', 'de', 'do', 'da',
    'dos', 'das', 'em', 'no', 'na', 'nos', 'nas', 'para', 'por', 'com',
    'sobre', 'sob', 'e', 'ou',
    # Italian
    'il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'uno', 'una', 'di', 'del',
    'dello', 'della', 'dei', 'degli', 'delle', 'a', 'al', 'allo', 'alla',
    'ai', 'agli', 'alle', 'da', 'dal', 'dallo', 'dalla', 'dai', 'dagli',
    'dalle', 'in', 'nel', 'nello', 'nella', 'nei', 'negli', 'nelle', 'su',
    'sul', 'sullo', 'sulla', 'sui', 'sugli', 'sulle', 'con', 'per', 'tra',
    'fra', 'e', 'ed', 'o', 'od',
    # Arabic transliteration common words
    'al', 'el', 'wa', 'bi', 'li', 'fi', 'min',
    # Hebrew transliteration common words
    'ha', 've', 'be', 'le', 'me',
    # Romanized CJK particles (Japanese)
    'no', 'wo', 'ga', 'ni', 'de', 'to', 'wa', 'e',
}
|
|
|
|
|
|
def extract_abbreviation(name: str) -> str:
    """Derive an uppercase ASCII abbreviation from a transliterated name.

    Takes the first letter of each significant word — skip-words, pure
    digits, and words not starting with a letter are ignored — capped at
    ten letters.  Diacritics are stripped via NFD decomposition.  Returns
    "UNK" when nothing usable remains.
    """
    if not name:
        return "UNK"

    # Strip diacritics: decompose, then drop combining marks (category Mn).
    decomposed = unicodedata.normalize('NFD', name)
    stripped = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Drop common punctuation, then split on whitespace.
    words = re.sub(r"[''`\",.:;!?()[\]{}]", '', stripped).split()

    # Keep words that are not skip-words, not digits, and start with a letter.
    significant = [
        w for w in words
        if w.lower() not in SKIP_WORDS and not w.isdigit() and w[0].isalpha()
    ]

    if not significant:
        # Fallback: ignore the skip-list and use the first three words.
        significant = [w for w in words[:3] if w and w[0].isalpha()]

    # First letter of each significant word, at most ten letters.
    initials = (w[0].upper() for w in significant[:10])

    # Final safety net: drop any non-ASCII character that slipped through.
    abbrev = ''.join(c for c in initials if ord(c) < 128 and c.isalpha())

    return abbrev or "UNK"
|
|
|
|
|
|
def fix_file(filepath: Path, dry_run: bool = False) -> Dict:
    """Fix a single file's INST abbreviation.

    Reads the YAML record, transliterates its emic name, derives a new
    abbreviation, rewrites the GHCID (appending a history entry) and
    renames the file to match.  With ``dry_run=True`` nothing is written
    or renamed.

    Returns a dict whose 'status' key is one of 'updated', 'would_update',
    'collision', 'skip' or 'error'; the remaining keys describe the change
    or the failure reason.
    """
    filename = filepath.name

    # Read and parse the record; a malformed file is reported as an error
    # result instead of crashing the whole batch run.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        return {'status': 'error', 'reason': f'invalid YAML: {e}'}

    if not data:
        return {'status': 'error', 'reason': 'empty file'}

    # Get emic name and language ('or {}' guards an explicitly-null value,
    # which .get(..., {}) would not catch).
    custodian_name = data.get('custodian_name') or {}
    emic_name = custodian_name.get('emic_name')
    lang = custodian_name.get('name_language')

    if not emic_name:
        return {'status': 'skip', 'reason': 'no emic_name'}

    if not lang:
        return {'status': 'skip', 'reason': 'no name_language'}

    # Transliterate the emic name into Latin script.
    try:
        transliterated = transliterate_for_abbreviation(emic_name, lang)
    except Exception as e:
        return {'status': 'error', 'reason': f'transliteration failed: {e}'}

    # Extract abbreviation from the transliterated name.
    new_abbrev = extract_abbreviation(transliterated)

    if new_abbrev == "UNK" or not new_abbrev:
        return {'status': 'error', 'reason': f'could not extract abbreviation from "{transliterated}"'}

    # Parse the current GHCID: COUNTRY-REGION-CITY-TYPE-ABBREV.
    ghcid = data.get('ghcid') or {}
    current_ghcid = ghcid.get('ghcid_current', '')
    match = re.match(r'^([A-Z]{2})-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)$', current_ghcid)
    if not match:
        return {'status': 'error', 'reason': f'could not parse GHCID: {current_ghcid}'}

    country, region, city, inst_type, old_abbrev = match.groups()

    if old_abbrev != 'INST':
        return {'status': 'skip', 'reason': f'not INST abbreviation: {old_abbrev}'}

    # Build the replacement GHCID and target filename.
    new_ghcid = f"{country}-{region}-{city}-{inst_type}-{new_abbrev}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    # Refuse to clobber a different existing record.
    if new_filepath.exists() and new_filepath != filepath:
        return {
            'status': 'collision',
            'old_file': filename,
            'new_file': new_filename,
            'reason': 'target file exists'
        }

    if dry_run:
        return {
            'status': 'would_update',
            'old_file': filename,
            'new_file': new_filename,
            'old_abbrev': old_abbrev,
            'new_abbrev': new_abbrev,
            'emic_name': emic_name,
            'transliterated': transliterated,
            'lang': lang
        }

    # Update GHCID and append an audit-trail entry.
    data['ghcid']['ghcid_current'] = new_ghcid
    timestamp = datetime.now(timezone.utc).isoformat()
    history_entry = {
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'Abbreviation fixed via transliteration: "{emic_name}" ({lang}) → "{transliterated}" → {new_abbrev}'
    }
    data['ghcid'].setdefault('ghcid_history', []).append(history_entry)

    # Write the updated record in place, then rename to the new GHCID.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if new_filepath != filepath:
        filepath.rename(new_filepath)

    return {
        'status': 'updated',
        'old_file': filename,
        'new_file': new_filename,
        'old_abbrev': old_abbrev,
        'new_abbrev': new_abbrev,
        'emic_name': emic_name,
        'transliterated': transliterated,
        'lang': lang
    }
|
|
|
|
|
|
def main():
    """CLI entry point: locate *-INST.yaml records and fix their abbreviations."""
    parser = argparse.ArgumentParser(description='Fix INST abbreviations using transliteration')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process')
    args = parser.parse_args()

    custodian_dir = Path('data/custodian')

    # Collect every record still carrying the INST placeholder abbreviation.
    inst_files = list(custodian_dir.glob('*-INST.yaml'))
    if args.limit > 0:
        inst_files = inst_files[:args.limit]

    banner = "=" * 60
    print(banner)
    print(f"FIX INST ABBREVIATIONS {'(DRY RUN)' if args.dry_run else ''}")
    print(banner)
    print(f"\nFound {len(inst_files)} files with INST abbreviation\n")

    # Bucket outcomes by status for the summary.
    results = {key: [] for key in ('updated', 'would_update', 'collision', 'skip', 'error')}

    for filepath in sorted(inst_files):
        outcome = fix_file(filepath, dry_run=args.dry_run)
        status = outcome['status']
        results[status].append(outcome)

        if status in ('updated', 'would_update'):
            print(f"✓ {outcome['old_file']}")
            print(f" → {outcome['new_file']}")
            print(f" Emic: {outcome['emic_name']} ({outcome['lang']})")
            print(f" Trans: {outcome['transliterated']}")
            print(f" Abbrev: {outcome['old_abbrev']} → {outcome['new_abbrev']}")
            print()
        elif status == 'collision':
            print(f"⚠ COLLISION: {outcome['old_file']} → {outcome['new_file']}")
        elif status == 'error':
            print(f"✗ ERROR: {filepath.name}: {outcome['reason']}")

    # Summary
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    if args.dry_run:
        print(f"Would update: {len(results['would_update'])}")
    else:
        print(f"Updated: {len(results['updated'])}")
    print(f"Collisions: {len(results['collision'])}")
    print(f"Skipped: {len(results['skip'])}")
    print(f"Errors: {len(results['error'])}")

    if results['collision']:
        print("\nCollisions:")
        for entry in results['collision']:
            print(f" {entry['old_file']} → {entry['new_file']}")

    if results['error']:
        print("\nErrors:")
        for entry in results['error']:
            print(f" {entry.get('old_file', 'unknown')}: {entry['reason']}")
|
|
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|