glam/scripts/fix_non_ascii_ghcids.py
2025-12-17 10:11:56 +01:00

373 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Fix non-ASCII characters in GHCID abbreviations.
This script:
1. Finds all custodian files with non-ASCII characters in the GHCID
2. Transliterates Cyrillic, normalizes fullwidth Latin, removes Japanese katakana, etc.
3. Updates the GHCID in the YAML file
4. Renames the file to match the new GHCID
Per AGENTS.md Rule 32 and .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md:
- ONLY ASCII uppercase letters (A-Z) are permitted in GHCID abbreviations
- Diacritics MUST be normalized to ASCII equivalents
- Non-Latin scripts MUST be transliterated to Latin characters
"""
import unicodedata
import re
import shutil
from pathlib import Path
from datetime import datetime, timezone

import yaml

# Use ruamel.yaml when available to preserve YAML formatting on round-trips.
# NOTE(review): the rest of this script currently loads/dumps with plain
# PyYAML, so this handler is prepared but not yet wired in — confirm intent.
try:
    from ruamel.yaml import YAML

    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping of long scalars
    USE_RUAMEL = True
except ImportError:
    USE_RUAMEL = False
# Cyrillic to Latin transliteration mapping (ISO 9:1995-inspired, ASCII-only).
# The hard sign 'Ъ'/'ъ' maps to 'A'/'a' (the Bulgarian hard sign is
# pronounced like 'a'); the soft sign 'Ь'/'ь' has no Latin equivalent and is
# dropped. (The original dict listed 'Ъ'/'ъ' twice; only the last entry ever
# took effect, so the duplicates are collapsed here.)
CYRILLIC_TO_LATIN = {
    # Uppercase
    'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E',
    'Ё': 'E', 'Ж': 'ZH', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K',
    'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R',
    'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'KH', 'Ц': 'TS',
    'Ч': 'CH', 'Ш': 'SH', 'Щ': 'SHCH', 'Ъ': 'A', 'Ы': 'Y', 'Ь': '',
    'Э': 'E', 'Ю': 'YU', 'Я': 'YA',
    # Lowercase
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e',
    'ё': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k',
    'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r',
    'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
    'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': 'a', 'ы': 'y', 'ь': '',
    'э': 'e', 'ю': 'yu', 'я': 'ya',
}
# Fullwidth Latin (U+FF21..U+FF3A, U+FF41..U+FF5A) to ASCII letters.
# Built programmatically instead of listing all 52 literal characters, which
# also avoids the ambiguous-looking fullwidth glyphs in source code.
FULLWIDTH_TO_ASCII = {
    chr(0xFF21 + i): chr(ord('A') + i) for i in range(26)  # Ａ..Ｚ -> A..Z
}
FULLWIDTH_TO_ASCII.update(
    {chr(0xFF41 + i): chr(ord('a') + i) for i in range(26)}  # ａ..ｚ -> a..z
)
# Ligatures and eszett expand to multi-letter ASCII sequences.
LIGATURE_MAP = {
    'Œ': 'OE',
    'œ': 'oe',
    'Æ': 'AE',
    'æ': 'ae',
    'ß': 'SS',
}
# Japanese katakana get no Latin transliteration here; characters in this
# set are removed from GHCID abbreviations entirely.
JAPANESE_KATAKANA = set(
    'アイウエオカキクケコサシスセソタチツテトナニヌネノ'
    'ハヒフヘホマミムメモヤユヨラリルレロワヲン'
    'ァィゥェォッャュョヴーゝゞ'
)
# Rough Arabic-to-Latin transliteration for characters that may appear in
# GHCID abbreviations. Hamza alone ('ء') has no Latin carrier and is dropped.
ARABIC_TO_LATIN = {
    'ا': 'A', 'أ': 'A', 'إ': 'I', 'آ': 'A', 'ب': 'B', 'ت': 'T',
    'ث': 'TH', 'ج': 'J', 'ح': 'H', 'خ': 'KH', 'د': 'D', 'ذ': 'DH',
    'ر': 'R', 'ز': 'Z', 'س': 'S', 'ش': 'SH', 'ص': 'S', 'ض': 'D',
    'ط': 'T', 'ظ': 'Z', 'ع': 'A', 'غ': 'GH', 'ف': 'F', 'ق': 'Q',
    'ك': 'K', 'ل': 'L', 'م': 'M', 'ن': 'N', 'ه': 'H', 'و': 'W',
    'ي': 'Y', 'ى': 'A', 'ة': 'A', 'ء': '',
}
def normalize_diacritics(text: str) -> str:
    """Strip combining diacritical marks, leaving the base letters.

    NFD decomposition splits each accented character into a base character
    followed by combining marks (Unicode category 'Mn'), which are dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
def transliterate_char(char: str) -> str:
    """Transliterate a single character to an ASCII string (possibly empty).

    Lookup order: fullwidth Latin, Cyrillic, ligatures, Arabic. Katakana is
    dropped, ASCII passes through unchanged, and anything else is
    diacritic-normalized or, failing that, discarded.
    """
    lookup_tables = (
        FULLWIDTH_TO_ASCII,
        CYRILLIC_TO_LATIN,
        LIGATURE_MAP,
        ARABIC_TO_LATIN,
    )
    for table in lookup_tables:
        if char in table:
            return table[char]
    if char in JAPANESE_KATAKANA:
        return ''  # Katakana has no mapping here; drop it
    if char.isascii():
        return char
    stripped = normalize_diacritics(char)
    if stripped.isascii():
        return stripped
    return ''  # Unknown non-ASCII character: discard
def transliterate_ghcid_abbreviation(abbrev: str) -> str:
    """Transliterate a GHCID abbreviation to uppercase ASCII letters.

    A trailing numeric suffix such as '-1909' or '-55' (used for collision
    resolution in GHCIDs) is detached first and re-attached verbatim at the
    end, so its digits survive the letters-only filter.
    """
    match = re.search(r'(-\d+)$', abbrev)
    if match:
        suffix = match.group(1)  # keep the hyphen and digits
        base = abbrev[:match.start()]
    else:
        suffix = ''
        base = abbrev
    # Transliterate character by character, then force uppercase.
    transliterated = ''.join(transliterate_char(ch) for ch in base).upper()
    # Keep only ASCII letters in the base part; the suffix is exempt.
    letters_only = ''.join(
        ch for ch in transliterated if ch.isascii() and ch.isalpha()
    )
    return letters_only + suffix
def extract_ghcid_parts(ghcid: str) -> "dict | None":
    """Split a GHCID string into its named components.

    GHCID format: {country}-{region}-{city}-{type}-{abbrev}[-{suffix}].
    Returns a dict with keys 'country', 'region', 'city', 'type', 'abbrev',
    or None when the string has fewer than five '-'-separated parts.
    (Annotation fixed: the original declared ``-> dict`` but returns None
    on parse failure.)
    """
    parts = ghcid.split('-')
    if len(parts) < 5:
        return None
    return {
        'country': parts[0],
        'region': parts[1],
        'city': parts[2],
        'type': parts[3],
        # The abbreviation may itself contain hyphens (e.g. '-1909' suffix).
        'abbrev': '-'.join(parts[4:]),
    }
def reconstruct_ghcid(parts: dict, new_abbrev: str) -> str:
    """Rebuild a GHCID string from its parts with a replacement abbreviation."""
    prefix = '-'.join([parts['country'], parts['region'], parts['city'], parts['type']])
    return f"{prefix}-{new_abbrev}"
def find_non_ascii_ghcid_files(custodian_dir: Path) -> list:
    """Return a sorted list of *.yaml files whose filename is not pure ASCII."""
    return sorted(
        path for path in custodian_dir.glob('*.yaml')
        if not path.name.isascii()
    )
def fix_ghcid_in_file(file_path: Path, dry_run: bool = True) -> dict:
    """Fix a non-ASCII GHCID in a single custodian YAML file.

    Returns a dict with:
    - old_ghcid: original GHCID (when found)
    - new_ghcid / new_filename: the fixed GHCID and target filename
    - old_filename: original filename
    - changes: list of human-readable change descriptions
    - error: error message, or None on success

    Fix vs. original: the "target file already exists" check now runs
    BEFORE the YAML is rewritten. Previously the file content was updated
    to the new GHCID and only then the rename was refused, leaving a file
    whose name and content disagreed.
    """
    result = {
        'old_filename': file_path.name,
        'changes': [],
        'error': None,
    }
    try:
        # Read the YAML file
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # NOTE(review): plain PyYAML is used here, so comments and formatting
        # in the source file are NOT preserved on rewrite, despite the
        # ruamel.yaml handler prepared at module level — confirm intent.
        data = yaml.safe_load(content)
        if not data or 'ghcid' not in data:
            result['error'] = 'No ghcid section found'
            return result

        old_ghcid = data['ghcid'].get('ghcid_current', '')
        result['old_ghcid'] = old_ghcid
        if not old_ghcid:
            result['error'] = 'No ghcid_current found'
            return result

        parts = extract_ghcid_parts(old_ghcid)
        if not parts:
            result['error'] = f'Could not parse GHCID: {old_ghcid}'
            return result

        # Transliterate the abbreviation; a no-op means it is already ASCII.
        old_abbrev = parts['abbrev']
        new_abbrev = transliterate_ghcid_abbreviation(old_abbrev)
        if old_abbrev == new_abbrev:
            result['error'] = 'Abbreviation is already ASCII'
            return result

        new_ghcid = reconstruct_ghcid(parts, new_abbrev)
        result['new_ghcid'] = new_ghcid
        result['new_filename'] = f"{new_ghcid}.yaml"
        result['changes'].append(f'Abbreviation: {old_abbrev} -> {new_abbrev}')
        result['changes'].append(f'GHCID: {old_ghcid} -> {new_ghcid}')
        if dry_run:
            return result

        # Bail out BEFORE touching the file if the rename target exists,
        # so we never leave a file whose content and name disagree.
        new_path = file_path.parent / result['new_filename']
        if new_path.exists():
            result['error'] = f'Target file already exists: {new_path}'
            return result

        now = datetime.now(timezone.utc).isoformat()

        # Update ghcid_current and prepend a history entry recording the fix.
        data['ghcid']['ghcid_current'] = new_ghcid
        if 'ghcid_history' not in data['ghcid']:
            data['ghcid']['ghcid_history'] = []
        data['ghcid']['ghcid_history'].insert(0, {
            'ghcid': new_ghcid,
            'valid_from': now,
            'valid_to': None,
            'reason': f'Corrected abbreviation from non-ASCII ({old_abbrev}) to ASCII ({new_abbrev}) per ABBREV-CHAR-FILTER rule'
        })
        # Close out the superseded history entry, if one is still open.
        if len(data['ghcid']['ghcid_history']) > 1:
            for entry in data['ghcid']['ghcid_history'][1:]:
                if entry.get('ghcid') == old_ghcid and entry.get('valid_to') is None:
                    entry['valid_to'] = now
                    break

        # Keep the GHCID entry in the identifiers list in sync.
        if 'identifiers' in data:
            for ident in data['identifiers']:
                if ident.get('identifier_scheme') == 'GHCID':
                    ident['identifier_value'] = new_ghcid

        # Write updated YAML, then rename the file to match the new GHCID.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        shutil.move(file_path, new_path)
        result['changes'].append(f'File renamed: {file_path.name} -> {new_path.name}')
        return result
    except Exception as e:
        result['error'] = str(e)
        return result
def main():
    """CLI entry point: scan the custodian directory and fix non-ASCII GHCIDs.

    Runs in dry-run mode by default; pass --apply to modify files.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix non-ASCII characters in GHCID abbreviations')
    # --dry-run is accepted for backward compatibility; dry-run is already
    # the default and only --apply switches it off (see `dry_run` below).
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Only show what would be changed (default: True)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the changes')
    # TODO(review): default is a user-specific absolute path; consider a
    # repo-relative default so the script is portable.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Path to custodian directory')
    args = parser.parse_args()
    dry_run = not args.apply

    print(f"{'DRY RUN - ' if dry_run else ''}Fixing non-ASCII GHCID abbreviations")
    print(f"Custodian directory: {args.custodian_dir}")
    print()

    # Find candidate files by filename; the per-file check re-validates.
    files = find_non_ascii_ghcid_files(args.custodian_dir)
    print(f"Found {len(files)} files with non-ASCII characters in filename")
    print()

    # Process each file, tallying outcomes for the summary.
    success_count = 0
    error_count = 0
    skip_count = 0
    for file_path in files:
        result = fix_ghcid_in_file(file_path, dry_run=dry_run)
        print(f"File: {result['old_filename']}")
        if result.get('error'):
            # 'already ASCII' is an expected no-op, not a failure.
            if 'already ASCII' in result['error']:
                print(f"  SKIP: {result['error']}")
                skip_count += 1
            else:
                print(f"  ERROR: {result['error']}")
                error_count += 1
        else:
            for change in result.get('changes', []):
                print(f"  {change}")
            success_count += 1
        print()

    # Summary
    print("=" * 60)
    print("Summary:")  # was an f-string with no placeholder (lint F541)
    print(f"  Total files: {len(files)}")
    print(f"  {'Would fix' if dry_run else 'Fixed'}: {success_count}")
    print(f"  Skipped: {skip_count}")
    print(f"  Errors: {error_count}")
    if dry_run:
        print()
        print("To apply changes, run with --apply flag")
if __name__ == '__main__':
    main()