#!/usr/bin/env python3
"""
Fix GHCID files with special characters in abbreviations.

This script:
1. Finds all custodian YAML files with special characters in filenames
2. Generates corrected GHCIDs by removing special characters
3. Updates the YAML content with new GHCID values
4. Renames files to match the new GHCID

Special characters that are removed:
- Ampersand: &
- Parentheses: ( )
- Quotes: " '
- Diacritics in abbreviations: Ö Å É Á Ż İ etc.
- Other symbols: + @ # % $ * | / \\ : ; etc.

See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md
See: AGENTS.md section on "Special Characters MUST Be Excluded from Abbreviations"
"""

import os
import re
import shutil
import sys
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

# Explicit transliterations applied before generic accent stripping.
# Some of these characters do not decompose under NFD at all (Æ, Ø, ß, Ł, ...);
# others would decompose to a bare base letter, but a multi-letter
# transliteration is wanted instead (Ö -> OE, Å -> AA, ...).
CHAR_MAP = {
    # Danish/Norwegian/Faroese
    'Æ': 'AE', 'æ': 'ae',
    'Ø': 'OE', 'ø': 'oe',
    'Å': 'AA', 'å': 'aa',
    # German
    'ß': 'SS',
    'Ä': 'AE', 'ä': 'ae',
    'Ö': 'OE', 'ö': 'oe',
    'Ü': 'UE', 'ü': 'ue',
    # Polish
    'Ł': 'L', 'ł': 'l',
    # Icelandic
    'Þ': 'TH', 'þ': 'th',
    'Ð': 'DH', 'ð': 'dh',
    # Czech/Slovak
    'Ř': 'R', 'ř': 'r',
    # Croatian/Serbian
    'Đ': 'DJ', 'đ': 'dj',
    # Turkish
    'İ': 'I', 'ı': 'i',
    'Ş': 'S', 'ş': 's',
    'Ğ': 'G', 'ğ': 'g',
    # Maltese
    'Ż': 'Z', 'ż': 'z',
    'Ħ': 'H', 'ħ': 'h',
    # Spanish
    'Ñ': 'N', 'ñ': 'n',
    # French (that don't decompose)
    'Œ': 'OE', 'œ': 'oe',
}

# Maximum number of characters kept in a normalized abbreviation.
MAX_ABBREVIATION_LENGTH = 10


def normalize_abbreviation(abbrev: str) -> str:
    """
    Normalize abbreviation to ASCII A-Z only.

    Args:
        abbrev: Original abbreviation (may contain special chars)

    Returns:
        Normalized abbreviation with only A-Z characters, at most
        MAX_ABBREVIATION_LENGTH long. May be empty when the input contains
        no letters at all.
    """
    # Step 0: Compose to NFC so that decomposed input ("O" + U+0308) is seen
    # as the single code point "Ö" and hits CHAR_MAP exactly like composed
    # input does. Without this the same name yields different results
    # depending on its Unicode normalization form.
    result = unicodedata.normalize('NFC', abbrev)

    # Step 1: Apply character mappings for special chars
    result = ''.join(CHAR_MAP.get(c, c) for c in result)

    # Step 2: NFD decomposition, then drop combining marks (category Mn)
    result = unicodedata.normalize('NFD', result)
    result = ''.join(c for c in result if unicodedata.category(c) != 'Mn')

    # Step 3: Remove all non-alphabetic characters
    result = re.sub(r'[^A-Za-z]', '', result)

    # Step 4: Uppercase, Step 5: limit length (slicing is a no-op when shorter)
    return result.upper()[:MAX_ABBREVIATION_LENGTH]


def extract_ghcid_parts(filename: str) -> Optional[dict]:
    """
    Extract GHCID parts from filename.

    Filename format: {CC}-{REG}-{CITY}-{TYPE}-{ABBREV}[-{SUFFIX}].yaml

    Args:
        filename: File name (not a full path), normally ending in ``.yaml``.

    Returns:
        Dict with keys country, region, city, type, abbreviation, suffix
        (suffix is None when absent), or None if the filename has fewer than
        five hyphen-separated parts.
    """
    # Strip only the trailing extension; a ".yaml" occurring elsewhere in the
    # name is part of the identifier and must be preserved.
    base = filename.removesuffix('.yaml')

    # The first five hyphen-separated fields are fixed; anything after them
    # is treated as the suffix (which may itself contain hyphens).
    parts = base.split('-')
    if len(parts) < 5:
        return None

    return {
        'country': parts[0],
        'region': parts[1],
        'city': parts[2],
        'type': parts[3],
        'abbreviation': parts[4],
        'suffix': '-'.join(parts[5:]) if len(parts) > 5 else None,
    }


def build_filename(parts: dict) -> str:
    """Build a ``.yaml`` filename from GHCID parts (inverse of extract_ghcid_parts)."""
    base = f"{parts['country']}-{parts['region']}-{parts['city']}-{parts['type']}-{parts['abbreviation']}"
    if parts['suffix']:
        base += f"-{parts['suffix']}"
    return base + '.yaml'


def has_special_chars(filename: str) -> bool:
    """Return True if filename contains any character outside ``A-Z a-z 0-9 . _ -``."""
    return bool(re.search(r'[^A-Za-z0-9._-]', filename))


def _replace_all_forms(content: str, old: str, new: str) -> str:
    """Replace ``old`` with ``new``, also matching the NFC and NFD spellings of ``old``."""
    # The filename and the YAML text may use different Unicode normalization
    # forms for the same identifier, so try each distinct spelling.
    candidates = dict.fromkeys((
        old,
        unicodedata.normalize('NFC', old),
        unicodedata.normalize('NFD', old),
    ))
    for candidate in candidates:
        content = content.replace(candidate, new)
    return content


def fix_ghcid_file(filepath: Path, dry_run: bool = True) -> tuple[str, Optional[str]]:
    """
    Fix a single GHCID file with special characters.

    The abbreviation part of the filename is normalized; when that changes
    the name, occurrences of the old GHCID inside the file are rewritten, a
    migration comment is prepended (if the file mentions ``ghcid:`` or
    ``ghcid_current:``), and the file is renamed. If the target name already
    exists, the existing file is copied to ``<name>.yaml.bak`` before being
    overwritten.

    Args:
        filepath: Path to the YAML file
        dry_run: If True, don't actually make changes

    Returns:
        Tuple of (old_filename, new_filename) or (old_filename, None) if no
        change is needed or the file cannot be fixed automatically.
    """
    old_name = filepath.name

    # Extract GHCID parts
    parts = extract_ghcid_parts(old_name)
    if not parts:
        print(f" WARNING: Could not parse filename: {old_name}")
        return (old_name, None)

    # Normalize the abbreviation
    old_abbrev = parts['abbreviation']
    new_abbrev = normalize_abbreviation(old_abbrev)

    if old_abbrev == new_abbrev:
        # The special character is in some other part of the name; this
        # script only corrects abbreviations.
        return (old_name, None)

    if not new_abbrev:
        # Nothing but special characters: renaming would produce a malformed
        # "...-TYPE-.yaml" identifier, so leave the file for manual review.
        print(f" WARNING: Abbreviation has no letters after normalization: {old_name}")
        return (old_name, None)

    # Build new filename
    parts['abbreviation'] = new_abbrev
    new_name = build_filename(parts)
    new_path = filepath.parent / new_name

    print(f" {old_name}")
    print(f" -> {new_name}")
    print(f" Abbreviation: {old_abbrev} -> {new_abbrev}")

    # Report collisions in dry-run mode too, so they are visible before --apply.
    target_exists = new_path.exists()
    if target_exists:
        print(f" WARNING: Target file already exists: {new_name}")

    if dry_run:
        return (old_name, new_name)

    # Read content
    content = filepath.read_text(encoding='utf-8')

    # Old and new GHCID strings (filenames without the .yaml extension)
    old_ghcid = old_name.removesuffix('.yaml')
    new_ghcid = new_name.removesuffix('.yaml')

    # Base GHCID (without suffix)
    prefix = f"{parts['country']}-{parts['region']}-{parts['city']}-{parts['type']}-"
    old_base = prefix + old_abbrev
    new_base = prefix + new_abbrev

    # Replace the longer (suffixed) identifier first, then the bare base.
    content = _replace_all_forms(content, old_ghcid, new_ghcid)
    if old_base != old_ghcid:  # Only if there's a suffix
        content = _replace_all_forms(content, old_base, new_base)

    # Record the migration as a leading YAML comment
    if 'ghcid:' in content or 'ghcid_current:' in content:
        content = f"# Migration note: GHCID abbreviation corrected {old_abbrev} -> {new_abbrev}\n{content}"

    if target_exists:
        # Preserve the file that is about to be overwritten
        backup_path = new_path.with_suffix('.yaml.bak')
        shutil.copy2(new_path, backup_path)
        print(f" Backed up existing file to: {backup_path.name}")

    # Write updated content, then move into place. Path.replace overwrites an
    # existing target on every platform (Path.rename raises on Windows).
    filepath.write_text(content, encoding='utf-8')
    filepath.replace(new_path)
    print(f" RENAMED")

    return (old_name, new_name)


def main():
    """
    Main function to fix all GHCID files with special characters.

    Runs in dry-run mode unless ``--apply`` is given. ``--dry-run`` is
    accepted for explicitness but is already the default; ``--apply`` takes
    precedence when both are passed.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID files with special characters')
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Show what would be done without making changes (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually make the changes')
    parser.add_argument('--custodian-dir', type=str,
                        default='/Users/kempersc/apps/glam/data/custodian',
                        help='Path to custodian directory')
    args = parser.parse_args()

    dry_run = not args.apply
    custodian_dir = Path(args.custodian_dir)

    if not custodian_dir.exists():
        print(f"ERROR: Directory not found: {custodian_dir}")
        sys.exit(1)

    print(f"Scanning for GHCID files with special characters in: {custodian_dir}")
    print(f"Mode: {'DRY RUN (no changes)' if dry_run else 'APPLY CHANGES'}")
    print()

    # Find all files with special characters (sorted for stable output)
    affected_files = [
        filepath
        for filepath in sorted(custodian_dir.glob('*.yaml'))
        if has_special_chars(filepath.name)
    ]

    if not affected_files:
        print("No files with special characters found. All GHCIDs are valid.")
        return

    print(f"Found {len(affected_files)} files with special characters:")
    print()

    # Group by type of issue (a file may fall into several groups)
    files_with_ampersand = [f for f in affected_files if '&' in f.name]
    files_with_parens = [f for f in affected_files if '(' in f.name or ')' in f.name]
    files_with_quotes = [f for f in affected_files if '"' in f.name or "'" in f.name]
    # Any non-ASCII character counts; this covers every CHAR_MAP entry as well
    # as decomposed (NFD) names, which a fixed list of letters would miss.
    files_with_diacritics = [f for f in affected_files if not f.name.isascii()]

    print(f" - {len(files_with_ampersand)} files with & (ampersand)")
    print(f" - {len(files_with_parens)} files with () (parentheses)")
    print(f" - {len(files_with_quotes)} files with quotes")
    print(f" - {len(files_with_diacritics)} files with diacritics in abbreviation")
    print()

    # Process each file
    changes = []
    for filepath in affected_files:
        old, new = fix_ghcid_file(filepath, dry_run=dry_run)
        if new:
            changes.append((old, new))
        print()

    # Summary
    print("=" * 60)
    print(f"SUMMARY: {len(changes)} files would be renamed" if dry_run
          else f"SUMMARY: {len(changes)} files renamed")

    if dry_run and changes:
        print()
        print("To apply these changes, run:")
        print(f" python {sys.argv[0]} --apply")


if __name__ == '__main__':
    main()