#!/usr/bin/env python3
"""Fix non-ASCII characters in GHCID abbreviations.

This script:
1. Finds all custodian files with non-ASCII characters in the GHCID
2. Transliterates Cyrillic, normalizes fullwidth Latin, removes Japanese
   katakana, etc.
3. Updates the GHCID in the YAML file
4. Renames the file to match the new GHCID

Per AGENTS.md Rule 32 and .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md:
- ONLY ASCII uppercase letters (A-Z) are permitted in GHCID abbreviations
- Diacritics MUST be normalized to ASCII equivalents
- Non-Latin scripts MUST be transliterated to Latin characters
"""

import re
import shutil
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Use ruamel.yaml to preserve formatting.
# NOTE(review): USE_RUAMEL / yaml_handler are currently never consulted below;
# files are rewritten with plain PyYAML, so original formatting is NOT yet
# preserved. Wiring ruamel in would require loading with it too -- TODO.
try:
    from ruamel.yaml import YAML
    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping
    USE_RUAMEL = True
except ImportError:
    USE_RUAMEL = False

# Cyrillic to Latin transliteration mapping (ISO 9:1995 base).
# NOTE: the hard sign is mapped to 'A'/'a' (Bulgarian pronunciation) rather
# than the ISO 9 omission; the soft sign is dropped. This preserves the
# effective behavior of the previous table, which listed the hard-sign keys
# twice and let the later (Bulgarian) entries win.
CYRILLIC_TO_LATIN = {
    # Uppercase
    'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E', 'Ё': 'E',
    'Ж': 'ZH', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K', 'Л': 'L', 'М': 'M',
    'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R', 'С': 'S', 'Т': 'T', 'У': 'U',
    'Ф': 'F', 'Х': 'KH', 'Ц': 'TS', 'Ч': 'CH', 'Ш': 'SH', 'Щ': 'SHCH',
    'Ъ': 'A', 'Ы': 'Y', 'Ь': '', 'Э': 'E', 'Ю': 'YU', 'Я': 'YA',
    # Lowercase
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'e',
    'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm',
    'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u',
    'ф': 'f', 'х': 'kh', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh', 'щ': 'shch',
    'ъ': 'a', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu', 'я': 'ya',
}

# Fullwidth Latin (U+FF21-FF3A, U+FF41-FF5A) to ASCII mapping, generated
# instead of hand-enumerated -- identical contents, no transcription risk.
FULLWIDTH_TO_ASCII = {chr(0xFF21 + i): chr(ord('A') + i) for i in range(26)}
FULLWIDTH_TO_ASCII.update(
    {chr(0xFF41 + i): chr(ord('a') + i) for i in range(26)}
)

# Special ligatures that NFD decomposition does not split.
LIGATURE_MAP = {
    'Œ': 'OE', 'œ': 'oe',
    'Æ': 'AE', 'æ': 'ae',
    'ß': 'SS',
}

# Japanese katakana - these should be removed from GHCIDs entirely.
JAPANESE_KATAKANA = set(
    'アイウエオカキクケコサシスセソタチツテトナニヌネノ'
    'ハヒフヘホマミムメモヤユヨラリルレロワヲン'
    'ァィゥェォッャュョヴーゝゞ'
)

# Arabic characters that might appear in GHCIDs.
ARABIC_TO_LATIN = {
    'ا': 'A', 'أ': 'A', 'إ': 'I', 'آ': 'A', 'ب': 'B', 'ت': 'T', 'ث': 'TH',
    'ج': 'J', 'ح': 'H', 'خ': 'KH', 'د': 'D', 'ذ': 'DH', 'ر': 'R', 'ز': 'Z',
    'س': 'S', 'ش': 'SH', 'ص': 'S', 'ض': 'D', 'ط': 'T', 'ظ': 'Z', 'ع': 'A',
    'غ': 'GH', 'ف': 'F', 'ق': 'Q', 'ك': 'K', 'ل': 'L', 'م': 'M', 'ن': 'N',
    'ه': 'H', 'و': 'W', 'ي': 'Y', 'ى': 'A', 'ة': 'A', 'ء': '',
}


def normalize_diacritics(text: str) -> str:
    """Strip diacritics, returning the base characters only.

    Uses NFD decomposition to separate base characters from combining marks,
    then drops the marks (Unicode category 'Mn' = Mark, Nonspacing).
    """
    normalized = unicodedata.normalize('NFD', text)
    return ''.join(
        c for c in normalized if unicodedata.category(c) != 'Mn'
    )


def transliterate_char(char: str) -> str:
    """Transliterate a single character to an ASCII string.

    Lookup order: fullwidth Latin, Cyrillic, ligatures, Arabic, katakana
    (removed), plain ASCII (kept), then diacritic stripping. Characters that
    survive none of these are dropped ('' returned).
    """
    if char in FULLWIDTH_TO_ASCII:
        return FULLWIDTH_TO_ASCII[char]
    if char in CYRILLIC_TO_LATIN:
        return CYRILLIC_TO_LATIN[char]
    if char in LIGATURE_MAP:
        return LIGATURE_MAP[char]
    if char in ARABIC_TO_LATIN:
        return ARABIC_TO_LATIN[char]
    # Skip Japanese katakana entirely
    if char in JAPANESE_KATAKANA:
        return ''
    # If it's ASCII, keep it
    if char.isascii():
        return char
    # Try normalizing diacritics
    normalized = normalize_diacritics(char)
    if normalized.isascii():
        return normalized
    # Unknown character - skip it
    return ''


def transliterate_ghcid_abbreviation(abbrev: str) -> str:
    """Transliterate a GHCID abbreviation to ASCII-only uppercase letters.

    Preserves numeric year suffixes (e.g., "-1909", "-55") which are used
    for collision resolution in GHCIDs; all other digits and punctuation in
    the base abbreviation are removed.
    """
    # Detach a trailing "-<digits>" suffix before transliteration so the
    # digit filter below does not strip it.
    suffix_match = re.search(r'(-\d+)$', abbrev)
    year_suffix = ''
    abbrev_base = abbrev
    if suffix_match:
        year_suffix = suffix_match.group(1)  # Keep the hyphen and number
        abbrev_base = abbrev[:suffix_match.start()]

    ascii_abbrev = ''.join(
        transliterate_char(char) for char in abbrev_base
    ).upper()
    # Remove any remaining non-ASCII or non-letter characters (but not from
    # the suffix, which is re-attached afterwards).
    ascii_abbrev = ''.join(
        c for c in ascii_abbrev if c.isascii() and c.isalpha()
    )
    return ascii_abbrev + year_suffix


def extract_ghcid_parts(ghcid: str) -> dict:
    """Split a GHCID into its components, or return None if unparsable.

    GHCID format: {country}-{region}-{city}-{type}-{abbrev}[-{suffix}].
    Everything from the fifth segment on is kept together as 'abbrev' so
    that hyphenated name suffixes survive the round trip.
    """
    parts = ghcid.split('-')
    if len(parts) >= 5:
        return {
            'country': parts[0],
            'region': parts[1],
            'city': parts[2],
            'type': parts[3],
            'abbrev': '-'.join(parts[4:]),  # Handle name suffixes
        }
    return None


def reconstruct_ghcid(parts: dict, new_abbrev: str) -> str:
    """Rebuild a GHCID string from its parts with a replacement abbreviation."""
    return (
        f"{parts['country']}-{parts['region']}-{parts['city']}"
        f"-{parts['type']}-{new_abbrev}"
    )


def find_non_ascii_ghcid_files(custodian_dir: Path) -> list:
    """Return sorted YAML files whose filename contains non-ASCII characters."""
    return sorted(
        f for f in custodian_dir.glob('*.yaml') if not f.name.isascii()
    )


def fix_ghcid_in_file(file_path: Path, dry_run: bool = True) -> dict:
    """Fix a non-ASCII GHCID in a single custodian YAML file.

    Unless dry_run, rewrites ghcid_current, prepends a ghcid_history entry
    (closing the superseded one), updates the GHCID identifier row, and
    renames the file to the new GHCID.

    Returns a dict with:
    - old_ghcid: Original GHCID
    - new_ghcid: Fixed GHCID
    - old_filename: Original filename
    - new_filename: New filename
    - changes: List of changes made
    - error: Error message if any (other keys may be absent on error)
    """
    result = {
        'old_filename': file_path.name,
        'changes': [],
        'error': None,
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        data = yaml.safe_load(content)

        if not data or 'ghcid' not in data:
            result['error'] = 'No ghcid section found'
            return result

        old_ghcid = data['ghcid'].get('ghcid_current', '')
        result['old_ghcid'] = old_ghcid
        if not old_ghcid:
            result['error'] = 'No ghcid_current found'
            return result

        parts = extract_ghcid_parts(old_ghcid)
        if not parts:
            result['error'] = f'Could not parse GHCID: {old_ghcid}'
            return result

        old_abbrev = parts['abbrev']
        new_abbrev = transliterate_ghcid_abbreviation(old_abbrev)
        if old_abbrev == new_abbrev:
            result['error'] = 'Abbreviation is already ASCII'
            return result

        new_ghcid = reconstruct_ghcid(parts, new_abbrev)
        result['new_ghcid'] = new_ghcid
        result['new_filename'] = f"{new_ghcid}.yaml"
        result['changes'].append(f'Abbreviation: {old_abbrev} -> {new_abbrev}')
        result['changes'].append(f'GHCID: {old_ghcid} -> {new_ghcid}')

        if dry_run:
            return result

        # Check the rename target BEFORE mutating anything: previously the
        # YAML was rewritten first, so a name collision left the old file
        # modified but not renamed (inconsistent state).
        new_path = file_path.parent / result['new_filename']
        if new_path.exists():
            result['error'] = f'Target file already exists: {new_path}'
            return result

        now = datetime.now(timezone.utc).isoformat()

        data['ghcid']['ghcid_current'] = new_ghcid

        # Prepend the correction to ghcid_history.
        if 'ghcid_history' not in data['ghcid']:
            data['ghcid']['ghcid_history'] = []
        data['ghcid']['ghcid_history'].insert(0, {
            'ghcid': new_ghcid,
            'valid_from': now,
            'valid_to': None,
            'reason': (
                f'Corrected abbreviation from non-ASCII ({old_abbrev}) '
                f'to ASCII ({new_abbrev}) per ABBREV-CHAR-FILTER rule'
            ),
        })

        # Close out the superseded history entry, if present.
        if len(data['ghcid']['ghcid_history']) > 1:
            for entry in data['ghcid']['ghcid_history'][1:]:
                if (entry.get('ghcid') == old_ghcid
                        and entry.get('valid_to') is None):
                    entry['valid_to'] = now
                    break

        # Keep the GHCID row in the identifiers list in sync.
        if 'identifiers' in data:
            for ident in data['identifiers']:
                if ident.get('identifier_scheme') == 'GHCID':
                    ident['identifier_value'] = new_ghcid

        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True,
                      default_flow_style=False, sort_keys=False)

        shutil.move(file_path, new_path)
        result['changes'].append(
            f'File renamed: {file_path.name} -> {new_path.name}'
        )
        return result

    except Exception as e:
        result['error'] = str(e)
        return result


def main():
    """CLI entry point: scan, report, and optionally fix non-ASCII GHCIDs."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Fix non-ASCII characters in GHCID abbreviations'
    )
    # NOTE: --dry-run is accepted for symmetry but has no effect; dry-run is
    # the default and only --apply switches it off.
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Only show what would be changed (default: True)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the changes')
    parser.add_argument(
        '--custodian-dir', type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Path to custodian directory'
    )
    args = parser.parse_args()

    dry_run = not args.apply

    print(f"{'DRY RUN - ' if dry_run else ''}"
          f"Fixing non-ASCII GHCID abbreviations")
    print(f"Custodian directory: {args.custodian_dir}")
    print()

    files = find_non_ascii_ghcid_files(args.custodian_dir)
    print(f"Found {len(files)} files with non-ASCII characters in filename")
    print()

    success_count = 0
    error_count = 0
    skip_count = 0

    for file_path in files:
        result = fix_ghcid_in_file(file_path, dry_run=dry_run)
        print(f"File: {result['old_filename']}")
        if result.get('error'):
            # Already-ASCII abbreviations are expected (e.g. diacritics only
            # in other filename parts) and counted as skips, not errors.
            if 'already ASCII' in result['error']:
                print(f"  SKIP: {result['error']}")
                skip_count += 1
            else:
                print(f"  ERROR: {result['error']}")
                error_count += 1
        else:
            for change in result.get('changes', []):
                print(f"  {change}")
            success_count += 1
        print()

    print("=" * 60)
    print(f"Summary:")
    print(f"  Total files: {len(files)}")
    print(f"  {'Would fix' if dry_run else 'Fixed'}: {success_count}")
    print(f"  Skipped: {skip_count}")
    print(f"  Errors: {error_count}")

    if dry_run:
        print()
        print("To apply changes, run with --apply flag")


if __name__ == '__main__':
    main()