# glam/scripts/fix_ghcid_special_chars.py

#!/usr/bin/env python3
"""
Fix GHCID files with special characters in abbreviations.

This script:
1. Finds all custodian YAML files with special characters in filenames
2. Generates corrected GHCIDs by removing special characters
3. Updates the YAML content with new GHCID values
4. Renames files to match the new GHCID

Special characters that are removed:
- Ampersand: &
- Parentheses: ( )
- Quotes: " '
- Diacritics in abbreviations: Ö Å É Á Ż İ etc. (transliterated to ASCII, e.g. Ö -> OE, É -> E)
- Other symbols: + @ # % $ * | / \ : ; etc.

See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md
See: AGENTS.md section on "Special Characters MUST Be Excluded from Abbreviations"
"""

import os
import re
import shutil
import sys
import unicodedata
from pathlib import Path
from datetime import datetime, timezone


# Explicit transliterations applied before NFD accent stripping.
#
# Two kinds of entries live here:
#   * letters with no canonical decomposition, which NFD cannot reduce to a
#     base letter (Æ, Ø, ß, Ł, Þ, Ð, Đ, ı, Ħ, Œ);
#   * letters whose desired ASCII form is not simply the base letter
#     (e.g. Ä -> AE, Å -> AA).
# A few entries (e.g. Ñ, Ř) give the same result NFD stripping would; they
# are listed explicitly anyway.
CHAR_MAP = {
    # Danish / Norwegian / Faroese
    "Æ": "AE",
    "æ": "ae",
    "Ø": "OE",
    "ø": "oe",
    "Å": "AA",
    "å": "aa",
    # German
    "ß": "SS",
    "Ä": "AE",
    "ä": "ae",
    "Ö": "OE",
    "ö": "oe",
    "Ü": "UE",
    "ü": "ue",
    # Polish
    "Ł": "L",
    "ł": "l",
    # Icelandic
    "Þ": "TH",
    "þ": "th",
    "Ð": "DH",
    "ð": "dh",
    # Czech / Slovak
    "Ř": "R",
    "ř": "r",
    # Croatian / Serbian
    "Đ": "DJ",
    "đ": "dj",
    # Turkish
    "İ": "I",
    "ı": "i",
    "Ş": "S",
    "ş": "s",
    "Ğ": "G",
    "ğ": "g",
    # Maltese
    "Ż": "Z",
    "ż": "z",
    "Ħ": "H",
    "ħ": "h",
    # Spanish
    "Ñ": "N",
    "ñ": "n",
    # French ligatures
    "Œ": "OE",
    "œ": "oe",
}


def normalize_abbreviation(abbrev: str) -> str:
    """
    Normalize abbreviation to ASCII A-Z only.

    Pipeline: compose to NFC, transliterate via CHAR_MAP, strip remaining
    combining marks, drop every non-letter, uppercase, truncate to 10 chars.

    Args:
        abbrev: Original abbreviation (may contain special chars, in either
            precomposed or decomposed Unicode form)

    Returns:
        Normalized abbreviation with only A-Z characters (at most 10).
        May be an empty string if the input contains no usable letters.
    """
    # Step 0: Compose first. Without this, decomposed input such as
    # 'O' + U+0308 (a form some filesystems hand back for 'Ö') would miss
    # the CHAR_MAP lookup and come out as 'O' instead of 'OE'.
    result = unicodedata.normalize('NFC', abbrev)

    # Step 1: Apply character mappings for special chars
    result = ''.join(CHAR_MAP.get(c, c) for c in result)

    # Step 2: NFD decomposition, then drop combining marks (category Mn)
    # to remove any accents CHAR_MAP did not cover
    result = unicodedata.normalize('NFD', result)
    result = ''.join(c for c in result if unicodedata.category(c) != 'Mn')

    # Step 3: Remove all non-alphabetic characters
    result = re.sub(r'[^A-Za-z]', '', result)

    # Step 4: Uppercase
    result = result.upper()

    # Step 5: Limit to 10 chars (slicing is a no-op for shorter strings)
    return result[:10]


def extract_ghcid_parts(filename: str) -> "dict | None":
    """
    Extract GHCID parts from filename.

    Filename format: {CC}-{REG}-{CITY}-{TYPE}-{ABBREV}[-{SUFFIX}].yaml

    Args:
        filename: Bare filename (no directory component).

    Returns:
        Dict with keys: country, region, city, type, abbreviation, suffix
        (suffix is None when absent), or None if the name has fewer than
        five hyphen-separated parts.
    """
    # Strip only the trailing extension; a '.yaml' occurring elsewhere in
    # the name is part of the identifier and must be preserved.
    base = filename.removesuffix('.yaml')

    # The first five hyphen-separated fields are fixed; anything after them
    # (which may itself contain hyphens) is the suffix.
    parts = base.split('-')
    if len(parts) < 5:
        return None

    country, region, city, inst_type, abbreviation, *rest = parts

    return {
        'country': country,
        'region': region,
        'city': city,
        'type': inst_type,
        'abbreviation': abbreviation,
        'suffix': '-'.join(rest) if rest else None,
    }


def build_filename(parts: dict) -> str:
    """
    Assemble a ``.yaml`` filename from a GHCID parts dict.

    The five fixed fields are joined with hyphens; a truthy ``suffix`` is
    appended as a sixth hyphen-separated segment.
    """
    fixed_fields = ('country', 'region', 'city', 'type', 'abbreviation')
    stem = '-'.join(format(parts[key]) for key in fixed_fields)

    suffix = parts['suffix']
    if suffix:
        stem = f"{stem}-{suffix}"

    return f"{stem}.yaml"


def has_special_chars(filename: str) -> bool:
    """
    Report whether a filename contains any disallowed character.

    Allowed characters are ASCII letters, digits, period, underscore and
    hyphen. The whole filename is inspected, not only the abbreviation part.
    """
    offending = re.search(r'[^A-Za-z0-9._-]', filename)
    return offending is not None


def fix_ghcid_file(filepath: Path, dry_run: bool = True) -> "tuple[str, str | None]":
    """
    Fix a single GHCID file with special characters.

    The abbreviation part of the filename is normalized; if that changes it,
    every occurrence of the old GHCID inside the file is replaced, a
    migration comment is prepended, and the file is renamed to match.

    Args:
        filepath: Path to the YAML file
        dry_run: If True, only report what would happen; nothing is written

    Returns:
        Tuple of (old_filename, new_filename), or (old_filename, None) if
        the file was skipped (unparseable name, abbreviation already clean,
        or abbreviation with no A-Z letters left after normalization).
    """
    old_name = filepath.name

    # Extract GHCID parts
    parts = extract_ghcid_parts(old_name)
    if not parts:
        print(f"  WARNING: Could not parse filename: {old_name}")
        return (old_name, None)

    # Normalize the abbreviation
    old_abbrev = parts['abbreviation']
    new_abbrev = normalize_abbreviation(old_abbrev)

    if old_abbrev == new_abbrev:
        # No change needed (unlikely given our filter, but just in case)
        return (old_name, None)

    if not new_abbrev:
        # Nothing but special characters: renaming would yield an invalid
        # GHCID with an empty abbreviation ("...-TYPE-.yaml"), so leave the
        # file for manual review instead.
        print(f"  WARNING: Abbreviation {old_abbrev!r} is empty after normalization, skipping: {old_name}")
        return (old_name, None)

    # Build new filename
    parts['abbreviation'] = new_abbrev
    new_name = build_filename(parts)
    new_path = filepath.parent / new_name

    print(f"  {old_name}")
    print(f"    -> {new_name}")
    print(f"    Abbreviation: {old_abbrev} -> {new_abbrev}")

    # Report collisions in dry-run mode too, so they are visible before
    # anything is overwritten.
    target_exists = new_path.exists()
    if target_exists:
        print(f"    WARNING: Target file already exists: {new_name}")

    if dry_run:
        return (old_name, new_name)

    # Full GHCID strings (filename minus the trailing extension only)
    old_ghcid = old_name.removesuffix('.yaml')
    new_ghcid = new_name.removesuffix('.yaml')

    # Base GHCID (without suffix), which may appear on its own in the content
    prefix = f"{parts['country']}-{parts['region']}-{parts['city']}-{parts['type']}"
    old_base = f"{prefix}-{old_abbrev}"
    new_base = f"{prefix}-{new_abbrev}"

    content = filepath.read_text(encoding='utf-8')

    # Replace the longer (suffixed) string first, then any bare base GHCID
    content = content.replace(old_ghcid, new_ghcid)
    if old_base != old_ghcid:  # Only if there's a suffix
        content = content.replace(old_base, new_base)

    # Prepend a migration comment when the file carries a GHCID field
    if 'ghcid:' in content or 'ghcid_current:' in content:
        content = f"# Migration note: GHCID abbreviation corrected {old_abbrev} -> {new_abbrev}\n{content}"

    # Back up a colliding target before touching anything, so a failure
    # here leaves both files in their original state.
    if target_exists:
        backup_path = new_path.with_suffix('.yaml.bak')
        shutil.copy2(new_path, backup_path)
        print(f"    Backed up existing file to: {backup_path.name}")

    # Write updated content
    filepath.write_text(content, encoding='utf-8')

    # Path.replace overwrites an existing target on every platform, whereas
    # Path.rename raises on Windows when the target exists.
    filepath.replace(new_path)
    print("    RENAMED")

    return (old_name, new_name)


def main():
    """
    Command-line entry point: scan a custodian directory and fix GHCID files.

    Options:
        --dry-run        Preview only (the default when no mode flag is given)
        --apply          Actually rewrite and rename files
        --custodian-dir  Directory containing the custodian ``*.yaml`` files

    ``--dry-run`` and ``--apply`` are mutually exclusive, so passing both is
    rejected by argparse instead of silently applying changes.

    Exits with status 1 if the custodian directory is missing or is not a
    directory.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID files with special characters')
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument('--dry-run', action='store_true', default=True,
                            help='Show what would be done without making changes (default)')
    mode_group.add_argument('--apply', action='store_true',
                            help='Actually make the changes')
    # NOTE(review): the default is a machine-specific absolute path; other
    # users must pass --custodian-dir explicitly.
    parser.add_argument('--custodian-dir', type=str,
                        default='/Users/kempersc/apps/glam/data/custodian',
                        help='Path to custodian directory')

    args = parser.parse_args()

    # Dry run is the default; only an explicit --apply turns it off.
    dry_run = not args.apply
    custodian_dir = Path(args.custodian_dir)

    # is_dir() also rejects a regular file, which exists() would accept and
    # which would otherwise be reported as "all GHCIDs are valid".
    if not custodian_dir.is_dir():
        print(f"ERROR: Directory not found: {custodian_dir}")
        sys.exit(1)

    print(f"Scanning for GHCID files with special characters in: {custodian_dir}")
    print(f"Mode: {'DRY RUN (no changes)' if dry_run else 'APPLY CHANGES'}")
    print()

    # Find all files with special characters (sorted for stable output)
    affected_files = [
        filepath
        for filepath in sorted(custodian_dir.glob('*.yaml'))
        if has_special_chars(filepath.name)
    ]

    if not affected_files:
        print("No files with special characters found. All GHCIDs are valid.")
        return

    print(f"Found {len(affected_files)} files with special characters:")
    print()

    def has_non_ascii_letter(name: str) -> bool:
        # Any non-ASCII letter (category L*) or combining mark (category M*).
        # Covers precomposed and decomposed accents alike, rather than a
        # hand-picked list of characters.
        return any(
            ord(ch) > 127 and unicodedata.category(ch)[0] in 'LM'
            for ch in name
        )

    # Group by type of issue (a file may fall into several groups)
    files_with_ampersand = [f for f in affected_files if '&' in f.name]
    files_with_parens = [f for f in affected_files if '(' in f.name or ')' in f.name]
    files_with_quotes = [f for f in affected_files if '"' in f.name or "'" in f.name]
    files_with_diacritics = [f for f in affected_files if has_non_ascii_letter(f.name)]

    print(f"  - {len(files_with_ampersand)} files with & (ampersand)")
    print(f"  - {len(files_with_parens)} files with () (parentheses)")
    print(f"  - {len(files_with_quotes)} files with quotes")
    print(f"  - {len(files_with_diacritics)} files with diacritics in abbreviation")
    print()

    # Process each file; only files that actually get a new name are counted
    changes = []
    for filepath in affected_files:
        old, new = fix_ghcid_file(filepath, dry_run=dry_run)
        if new:
            changes.append((old, new))
        print()

    # Summary
    print("=" * 60)
    if dry_run:
        print(f"SUMMARY: {len(changes)} files would be renamed")
    else:
        print(f"SUMMARY: {len(changes)} files renamed")

    if dry_run and changes:
        print()
        print("To apply these changes, run:")
        print(f"  python {__file__} --apply")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()