303 lines
9.5 KiB
Python
303 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Fix GHCID files with special characters in abbreviations.
|
||
|
||
This script:
|
||
1. Finds all custodian YAML files with special characters in filenames
|
||
2. Generates corrected GHCIDs by removing special characters
|
||
3. Updates the YAML content with new GHCID values
|
||
4. Renames files to match the new GHCID
|
||
|
||
Special characters that are removed:
|
||
- Ampersand: &
|
||
- Parentheses: ( )
|
||
- Quotes: " '
|
||
- Diacritics in abbreviations: Ö Å É Á Ż İ etc.
|
||
- Other symbols: + @ # % $ * | / \ : ; etc.
|
||
|
||
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md
|
||
See: AGENTS.md section on "Special Characters MUST Be Excluded from Abbreviations"
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
import shutil
|
||
import sys
|
||
import unicodedata
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
|
||
|
||
# Explicit transliterations applied before NFD accent stripping.
# Covers letters that have no canonical decomposition (e.g. Æ, Ø, ß, Ł) as
# well as letters that are given a multi-letter ASCII spelling (e.g. Ä -> AE,
# Å -> AA) instead of simply losing their accent.
CHAR_MAP = {
    # Danish/Norwegian/Faroese
    'Æ': 'AE',
    'æ': 'ae',
    'Ø': 'OE',
    'ø': 'oe',
    'Å': 'AA',
    'å': 'aa',

    # German
    'ß': 'SS',
    'Ä': 'AE',
    'ä': 'ae',
    'Ö': 'OE',
    'ö': 'oe',
    'Ü': 'UE',
    'ü': 'ue',

    # Polish
    'Ł': 'L',
    'ł': 'l',

    # Icelandic
    'Þ': 'TH',
    'þ': 'th',
    'Ð': 'DH',
    'ð': 'dh',

    # Czech/Slovak
    'Ř': 'R',
    'ř': 'r',

    # Croatian/Serbian
    'Đ': 'DJ',
    'đ': 'dj',

    # Turkish
    'İ': 'I',
    'ı': 'i',
    'Ş': 'S',
    'ş': 's',
    'Ğ': 'G',
    'ğ': 'g',

    # Maltese
    'Ż': 'Z',
    'ż': 'z',
    'Ħ': 'H',
    'ħ': 'h',

    # Spanish
    'Ñ': 'N',
    'ñ': 'n',

    # French ligature
    'Œ': 'OE',
    'œ': 'oe',
}
|
||
|
||
|
||
def normalize_abbreviation(abbrev: str) -> str:
    """
    Reduce an abbreviation to upper-case ASCII letters (A-Z), at most 10 long.

    Args:
        abbrev: Original abbreviation (may contain special chars)

    Returns:
        The abbreviation after transliteration via CHAR_MAP, accent removal,
        deletion of every character outside A-Z/a-z, upper-casing, and
        truncation to the first 10 characters. May be an empty string.
    """
    # Explicit transliterations come first so they take precedence over the
    # generic accent stripping below.
    transliterated = ''.join(map(lambda ch: CHAR_MAP.get(ch, ch), abbrev))

    # Decompose, then drop the combining marks (category Mn) left behind.
    decomposed = unicodedata.normalize('NFD', transliterated)
    without_marks = ''.join(
        filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
    )

    # Anything that is still not an ASCII letter is discarded.
    letters_only = re.sub(r'[^A-Za-z]', '', without_marks)

    # Upper-case, then keep at most the first 10 characters.
    return letters_only.upper()[:10]
|
||
|
||
|
||
def extract_ghcid_parts(filename: str) -> "dict | None":
    """
    Extract GHCID parts from filename.

    Filename format: {CC}-{REG}-{CITY}-{TYPE}-{ABBREV}[-{SUFFIX}].yaml

    Args:
        filename: Bare file name (no directory component).

    Returns:
        Dict with keys: country, region, city, type, abbreviation, suffix.
        ``suffix`` is None when the name has exactly five hyphen-separated
        components; otherwise it is everything after the fifth component,
        re-joined with hyphens. Returns None when the name has fewer than
        five components.
    """
    # Strip only a trailing ".yaml"; a ".yaml" occurring inside the name
    # (e.g. in the suffix) is part of the identifier and must be kept.
    base = filename.removesuffix('.yaml')

    # The suffix may itself contain hyphens, so only the first five
    # components are positional.
    parts = base.split('-')
    if len(parts) < 5:
        return None

    country, region, city, inst_type, abbreviation, *rest = parts
    return {
        'country': country,
        'region': region,
        'city': city,
        'type': inst_type,
        'abbreviation': abbreviation,
        'suffix': '-'.join(rest) if rest else None,
    }
|
||
|
||
|
||
def build_filename(parts: dict) -> str:
    """Assemble the ``.yaml`` file name for a dict of GHCID components.

    The five positional components are joined with hyphens; a truthy
    ``suffix`` is appended after one more hyphen.
    """
    positional_keys = ('country', 'region', 'city', 'type', 'abbreviation')
    stem = '-'.join(f"{parts[key]}" for key in positional_keys)

    suffix = parts['suffix']
    if not suffix:
        return stem + '.yaml'
    return f"{stem}-{suffix}" + '.yaml'
|
||
|
||
|
||
def has_special_chars(filename: str) -> bool:
    """Return True if any character of the file name is a special character.

    The whole file name is inspected (not only the abbreviation). ASCII
    letters, digits, period, underscore and hyphen are the only characters
    that do not count as special.
    """
    offending = re.search(r'[^A-Za-z0-9._-]', filename)
    return offending is not None
|
||
|
||
|
||
def fix_ghcid_file(filepath: Path, dry_run: bool = True) -> "tuple[str, str | None]":
    """
    Fix a single GHCID file with special characters.

    The abbreviation component of the file name is normalized; when that
    changes it, every occurrence of the old GHCID inside the file is replaced
    by the new one, a one-line migration comment is prepended (only if the
    content mentions ``ghcid:`` or ``ghcid_current:``), and the file is
    renamed. If a different file already occupies the target name it is
    copied to ``<target>.bak`` before being overwritten.

    Args:
        filepath: Path to the YAML file
        dry_run: If True, only report what would happen; nothing is written

    Returns:
        Tuple of (old_filename, new_filename), or (old_filename, None) when
        the name cannot be parsed, the abbreviation is already normalized, or
        normalization leaves no characters at all.
    """
    old_name = filepath.name

    parts = extract_ghcid_parts(old_name)
    if not parts:
        print(f" WARNING: Could not parse filename: {old_name}")
        return (old_name, None)

    old_abbrev = parts['abbreviation']
    new_abbrev = normalize_abbreviation(old_abbrev)

    if old_abbrev == new_abbrev:
        # No change needed (unlikely given the caller's filter, but possible
        # when the special character sits in another component).
        return (old_name, None)

    if not new_abbrev:
        # An abbreviation made up solely of special characters would yield a
        # GHCID with an empty component; leave such files for manual review.
        print(f" WARNING: Abbreviation {old_abbrev!r} is empty after normalization, skipping: {old_name}")
        return (old_name, None)

    # GHCID without the optional suffix, before and after the correction.
    prefix = f"{parts['country']}-{parts['region']}-{parts['city']}-{parts['type']}"
    old_base = f"{prefix}-{old_abbrev}"
    new_base = f"{prefix}-{new_abbrev}"

    new_name = build_filename({**parts, 'abbreviation': new_abbrev})
    new_path = filepath.parent / new_name

    print(f" {old_name}")
    print(f" -> {new_name}")
    print(f" Abbreviation: {old_abbrev} -> {new_abbrev}")

    # On case-insensitive filesystems a case-only rename makes new_path
    # "exist" although it is the very file being renamed; that is not a
    # collision.
    collision = new_path.exists() and not new_path.samefile(filepath)
    if collision:
        print(f" WARNING: Target file already exists: {new_name}")

    if dry_run:
        return (old_name, new_name)

    content = filepath.read_text(encoding='utf-8')

    # Full GHCID strings (file names without the .yaml extension).
    old_ghcid = old_name.removesuffix('.yaml')
    new_ghcid = new_name.removesuffix('.yaml')

    # Replace the longer (suffixed) form first, then the bare base form.
    content = content.replace(old_ghcid, new_ghcid)
    if old_base != old_ghcid:  # Only if there's a suffix
        content = content.replace(old_base, new_base)

    # Record the correction as a comment at the very top of the file.
    if 'ghcid:' in content or 'ghcid_current:' in content:
        content = f"# Migration note: GHCID abbreviation corrected {old_abbrev} -> {new_abbrev}\n{content}"

    # Preserve the file that is about to be overwritten before touching
    # anything, so a failure here leaves both files in their original state.
    if collision:
        backup_path = new_path.with_name(new_name + '.bak')
        shutil.copy2(new_path, backup_path)
        print(f" Backed up existing file to: {backup_path.name}")

    filepath.write_text(content, encoding='utf-8')

    # Path.replace overwrites an existing target on every platform, whereas
    # Path.rename raises on Windows when the target exists.
    filepath.replace(new_path)
    print(f" RENAMED")

    return (old_name, new_name)
|
||
|
||
|
||
def main():
    """Scan the custodian directory and fix GHCID files with special characters.

    Runs as a dry run unless ``--apply`` is given. An explicit ``--dry-run``
    always wins, so ``--dry-run --apply`` makes no changes. Exits with
    status 1 when the custodian directory does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID files with special characters')
    # No default=True here: with it the flag could never be distinguished
    # from "not given", and --apply would silently override it.
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually make the changes')
    parser.add_argument('--custodian-dir', type=str,
                        default='/Users/kempersc/apps/glam/data/custodian',
                        help='Path to custodian directory')

    args = parser.parse_args()

    dry_run = args.dry_run or not args.apply
    custodian_dir = Path(args.custodian_dir)

    # is_dir() rather than exists(): a regular file at this path is not usable.
    if not custodian_dir.is_dir():
        print(f"ERROR: Directory not found: {custodian_dir}")
        sys.exit(1)

    print(f"Scanning for GHCID files with special characters in: {custodian_dir}")
    print(f"Mode: {'DRY RUN (no changes)' if dry_run else 'APPLY CHANGES'}")
    print()

    # Collect up front so renames performed later cannot disturb the scan.
    affected_files = [
        filepath
        for filepath in sorted(custodian_dir.glob('*.yaml'))
        if has_special_chars(filepath.name)
    ]

    if not affected_files:
        print("No files with special characters found. All GHCIDs are valid.")
        return

    print(f"Found {len(affected_files)} files with special characters:")
    print()

    # Tally by kind of issue (a file may fall into several groups).
    files_with_ampersand = [f for f in affected_files if '&' in f.name]
    files_with_parens = [f for f in affected_files if '(' in f.name or ')' in f.name]
    files_with_quotes = [f for f in affected_files if '"' in f.name or "'" in f.name]
    # Any non-ASCII letter counts, rather than a fixed list of a few letters.
    files_with_diacritics = [
        f for f in affected_files
        if any(c.isalpha() and not c.isascii() for c in f.name)
    ]

    print(f" - {len(files_with_ampersand)} files with & (ampersand)")
    print(f" - {len(files_with_parens)} files with () (parentheses)")
    print(f" - {len(files_with_quotes)} files with quotes")
    print(f" - {len(files_with_diacritics)} files with diacritics in abbreviation")
    print()

    # Process each file; only files whose name actually changes are counted.
    changes = []
    for filepath in affected_files:
        old, new = fix_ghcid_file(filepath, dry_run=dry_run)
        if new:
            changes.append((old, new))
            print()

    # Summary
    print("=" * 60)
    print(f"SUMMARY: {len(changes)} files would be renamed" if dry_run else f"SUMMARY: {len(changes)} files renamed")

    if dry_run and changes:
        print()
        print("To apply these changes, run:")
        print(f" python {__file__} --apply")
|
||
|
||
# Run the command-line entry point only when executed as a script.
if '__main__' == __name__:
    main()
|