glam/scripts/fix_ghcid_special_chars.py
kempersc 63a6bccd9b fix: remove custodian files with invalid GHCID special characters
Remove 229 custodian YAML files containing invalid characters in GHCIDs:
- Ampersand (&) in abbreviations (e.g., BM&HS, UNL&AG, DR&IMSM)
- Parentheses in abbreviations (e.g., WHO(RA, VK(, SL()
- Unicode characters in filenames (Ö, Ä, Å, É, İ, Ż, etc.)

These files are replaced with corrected versions using alphabetic-only
abbreviations per AGENTS.md Rule 8 (Special Characters MUST Be Excluded).

Related scripts updated for location resolution.
2025-12-07 14:23:50 +01:00

304 lines
9.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Fix GHCID files with special characters in abbreviations.
This script:
1. Finds all custodian YAML files with special characters in filenames
2. Generates corrected GHCIDs by removing special characters
3. Updates the YAML content with new GHCID values
4. Renames files to match the new GHCID
Special characters that are removed:
- Ampersand: &
- Parentheses: ( )
- Quotes: " '
- Diacritics in abbreviations: Ö Å É Á Ż İ etc.
- Other symbols: + @ # % $ * | / \\ : ; etc.
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md
See: AGENTS.md section on "Special Characters MUST Be Excluded from Abbreviations"
"""
import os
import re
import shutil
import sys
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
# Explicit transliterations applied before NFD accent stripping.
# Some of these characters have no canonical decomposition (e.g. Æ, Ø, ß, Ł,
# Þ, Ð, Đ, ı, Ħ, Œ) and would otherwise be dropped entirely; others do
# decompose but are mapped here to a language-specific spelling (e.g. Ä -> AE).
CHAR_MAP = {
    # Danish/Norwegian/Faroese
    'Æ': 'AE',
    'æ': 'ae',
    'Ø': 'OE',
    'ø': 'oe',
    'Å': 'AA',
    'å': 'aa',
    # German
    'ß': 'SS',
    'Ä': 'AE',
    'ä': 'ae',
    'Ö': 'OE',
    'ö': 'oe',
    'Ü': 'UE',
    'ü': 'ue',
    # Polish
    'Ł': 'L',
    'ł': 'l',
    # Icelandic
    'Þ': 'TH',
    'þ': 'th',
    'Ð': 'DH',
    'ð': 'dh',
    # Czech/Slovak
    'Ř': 'R',
    'ř': 'r',
    # Croatian/Serbian
    'Đ': 'DJ',
    'đ': 'dj',
    # Turkish
    'İ': 'I',
    'ı': 'i',
    'Ş': 'S',
    'ş': 's',
    'Ğ': 'G',
    'ğ': 'g',
    # Maltese
    'Ż': 'Z',
    'ż': 'z',
    'Ħ': 'H',
    'ħ': 'h',
    # Spanish
    'Ñ': 'N',
    'ñ': 'n',
    # French (ligatures)
    'Œ': 'OE',
    'œ': 'oe',
}
def normalize_abbreviation(abbrev: str) -> str:
    """
    Reduce an abbreviation to uppercase ASCII letters (A-Z), at most 10 long.

    Args:
        abbrev: Original abbreviation (may contain special chars)

    Returns:
        The abbreviation with mapped characters transliterated, accents
        stripped, every non-letter removed, uppercased and cut to 10 chars.
    """
    # Explicit transliterations come first so they win over plain accent stripping.
    transliterated = ''.join([CHAR_MAP.get(ch, ch) for ch in abbrev])

    # Decompose, then discard the combining marks (category Mn) left behind.
    decomposed = unicodedata.normalize('NFD', transliterated)
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )

    # Keep ASCII letters only, uppercase them, and cap the length at 10.
    letters_only = re.sub(r'[^A-Za-z]', '', without_marks)
    return letters_only.upper()[:10]
def extract_ghcid_parts(filename: str) -> Optional[dict]:
    """
    Extract GHCID parts from filename.

    Filename format: {CC}-{REG}-{CITY}-{TYPE}-{ABBREV}[-{SUFFIX}].yaml

    Args:
        filename: Bare file name (no directory part).

    Returns:
        Dict with keys: country, region, city, type, abbreviation, suffix.
        ``suffix`` is None when the name has exactly five fields; otherwise it
        is everything after the fifth field, re-joined with hyphens.
        Returns None if the name has fewer than five hyphen-separated fields.
    """
    # Strip only the trailing extension. str.replace would also delete any
    # '.yaml' that happens to occur inside one of the fields.
    base = filename.removesuffix('.yaml')

    fields = base.split('-')
    if len(fields) < 5:
        return None

    # The suffix may itself contain hyphens, so collect every remaining field.
    country, region, city, type_code, abbreviation, *extra = fields
    return {
        'country': country,
        'region': region,
        'city': city,
        'type': type_code,
        'abbreviation': abbreviation,
        'suffix': '-'.join(extra) if extra else None,
    }
def build_filename(parts: dict) -> str:
    """
    Assemble a ``.yaml`` filename from GHCID parts.

    The five mandatory fields are joined with hyphens; a truthy ``suffix`` is
    appended as a sixth hyphen-separated field.
    """
    mandatory_keys = ('country', 'region', 'city', 'type', 'abbreviation')
    stem = '-'.join(f"{parts[key]}" for key in mandatory_keys)

    suffix = parts['suffix']
    if suffix:
        stem = f"{stem}-{suffix}"
    return f"{stem}.yaml"
def has_special_chars(filename: str) -> bool:
    """
    Report whether a filename contains a disallowed character.

    The whole filename is inspected; anything other than ASCII letters,
    digits, period, underscore and hyphen counts as special.
    """
    offending = re.search(r'[^A-Za-z0-9._-]', filename)
    return offending is not None
def fix_ghcid_file(filepath: Path, dry_run: bool = True) -> tuple[str, Optional[str]]:
    """
    Fix a single GHCID file with special characters.

    Normalizes the abbreviation field of the filename, rewrites occurrences of
    the old GHCID inside the file content, and renames the file to match.
    If the target name already exists it is copied to ``<name>.yaml.bak``
    before being overwritten.

    Args:
        filepath: Path to the YAML file
        dry_run: If True, only report what would be done

    Returns:
        Tuple of (old_filename, new_filename), or (old_filename, None) when
        the file is skipped: the name cannot be parsed, the abbreviation is
        already clean, or the abbreviation would normalize to an empty string.
    """
    old_name = filepath.name

    parts = extract_ghcid_parts(old_name)
    if not parts:
        print(f" WARNING: Could not parse filename: {old_name}")
        return (old_name, None)

    old_abbrev = parts['abbreviation']
    new_abbrev = normalize_abbreviation(old_abbrev)
    if old_abbrev == new_abbrev:
        # The special character (if any) is in another field; nothing to fix here.
        return (old_name, None)
    if not new_abbrev:
        # No Latin letters survive normalization; renaming would yield an
        # abbreviation-less GHCID, so leave the file for manual handling.
        print(f" WARNING: Abbreviation has no A-Z characters, skipping: {old_name}")
        return (old_name, None)

    # Base GHCID (without suffix), before and after the correction.
    prefix = f"{parts['country']}-{parts['region']}-{parts['city']}-{parts['type']}"
    old_base = f"{prefix}-{old_abbrev}"
    new_base = f"{prefix}-{new_abbrev}"

    parts['abbreviation'] = new_abbrev
    new_name = build_filename(parts)
    new_path = filepath.parent / new_name

    print(f" {old_name}")
    print(f" -> {new_name}")
    print(f" Abbreviation: {old_abbrev} -> {new_abbrev}")

    # Report collisions in dry-run mode too, so they are visible before applying.
    target_exists = new_path.exists()
    if target_exists:
        print(f" WARNING: Target file already exists: {new_name}")

    if dry_run:
        return (old_name, new_name)

    content = filepath.read_text(encoding='utf-8')

    # Full GHCID strings (without the .yaml extension).
    old_ghcid = old_name.removesuffix('.yaml')
    new_ghcid = new_name.removesuffix('.yaml')

    # Replace the longer (suffixed) string first, then the bare base GHCID.
    replacements = [(old_ghcid, new_ghcid)]
    if old_base != old_ghcid:  # Only if there's a suffix
        replacements.append((old_base, new_base))

    for old_text, new_text in replacements:
        # The name read from disk and the text inside the file may use
        # different Unicode normalization forms (some filesystems store
        # names decomposed), so try the raw, NFC and NFD spellings.
        spellings = dict.fromkeys((
            old_text,
            unicodedata.normalize('NFC', old_text),
            unicodedata.normalize('NFD', old_text),
        ))
        for spelling in spellings:
            content = content.replace(spelling, new_text)

    # Prepend a migration note when the file carries a GHCID field.
    if 'ghcid:' in content or 'ghcid_current:' in content:
        content = f"# Migration note: GHCID abbreviation corrected {old_abbrev} -> {new_abbrev}\n{content}"

    # Back up a colliding target before touching the source file, so a failed
    # backup leaves everything unmodified.
    if target_exists:
        backup_path = new_path.with_suffix('.yaml.bak')
        shutil.copy2(new_path, backup_path)
        print(f" Backed up existing file to: {backup_path.name}")

    filepath.write_text(content, encoding='utf-8')

    # Path.replace overwrites an existing target on every platform, whereas
    # Path.rename raises on Windows when the target exists.
    filepath.replace(new_path)
    print(f" RENAMED")

    return (old_name, new_name)
def main():
    """
    Command-line entry point: find and fix GHCID files with special characters.

    Scans ``--custodian-dir`` for ``*.yaml`` files whose names contain special
    characters, prints a breakdown by kind of character, and passes each file
    to ``fix_ghcid_file``. Changes are made only when ``--apply`` is given
    without ``--dry-run``. Exits with status 1 if the directory is missing.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID files with special characters')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually make the changes')
    parser.add_argument('--custodian-dir', type=str,
                        default='/Users/kempersc/apps/glam/data/custodian',
                        help='Path to custodian directory')
    args = parser.parse_args()

    # Dry run is the default, and an explicit --dry-run wins over --apply so
    # that a contradictory command line can never modify files.
    dry_run = args.dry_run or not args.apply
    custodian_dir = Path(args.custodian_dir)

    if not custodian_dir.exists():
        print(f"ERROR: Directory not found: {custodian_dir}")
        sys.exit(1)

    print(f"Scanning for GHCID files with special characters in: {custodian_dir}")
    print(f"Mode: {'DRY RUN (no changes)' if dry_run else 'APPLY CHANGES'}")
    print()

    # Find all files with special characters
    affected_files = [
        filepath
        for filepath in sorted(custodian_dir.glob('*.yaml'))
        if has_special_chars(filepath.name)
    ]

    if not affected_files:
        print("No files with special characters found. All GHCIDs are valid.")
        return

    print(f"Found {len(affected_files)} files with special characters:")
    print()

    # Group by type of issue (a file may fall into several groups)
    files_with_ampersand = [f for f in affected_files if '&' in f.name]
    files_with_parens = [f for f in affected_files if '(' in f.name or ')' in f.name]
    files_with_quotes = [f for f in affected_files if '"' in f.name or "'" in f.name]
    # Count any non-ASCII letter or combining mark rather than a fixed list of
    # characters, so precomposed and decomposed names are both counted.
    files_with_diacritics = [
        f for f in affected_files
        if any(
            not c.isascii() and (c.isalpha() or unicodedata.combining(c))
            for c in f.name
        )
    ]

    print(f" - {len(files_with_ampersand)} files with & (ampersand)")
    print(f" - {len(files_with_parens)} files with () (parentheses)")
    print(f" - {len(files_with_quotes)} files with quotes")
    print(f" - {len(files_with_diacritics)} files with diacritics in abbreviation")
    print()

    # Process each file; fix_ghcid_file returns None as the new name when it skips one
    changes = []
    for filepath in affected_files:
        old, new = fix_ghcid_file(filepath, dry_run=dry_run)
        if new:
            changes.append((old, new))
        print()

    # Summary
    print("=" * 60)
    if dry_run:
        print(f"SUMMARY: {len(changes)} files would be renamed")
    else:
        print(f"SUMMARY: {len(changes)} files renamed")

    if dry_run and changes:
        print()
        print("To apply these changes, run:")
        print(f" python {sys.argv[0]} --apply")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()