glam/scripts/fix_non_ascii_ghcids.py
2025-12-17 10:11:56 +01:00

373 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Fix non-ASCII characters in GHCID abbreviations.
This script:
1. Finds all custodian files with non-ASCII characters in the GHCID
2. Transliterates Cyrillic, normalizes fullwidth Latin, removes Japanese katakana, etc.
3. Updates the GHCID in the YAML file
4. Renames the file to match the new GHCID
Per AGENTS.md Rule 32 and .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md:
- ONLY ASCII uppercase letters (A-Z) are permitted in GHCID abbreviations
- Diacritics MUST be normalized to ASCII equivalents
- Non-Latin scripts MUST be transliterated to Latin characters
"""
import unicodedata
import re
import shutil
from pathlib import Path
from datetime import datetime, timezone

import yaml

# Use ruamel.yaml when available to preserve YAML formatting on round-trips.
# NOTE(review): the rest of this script currently loads/dumps with plain
# PyYAML, so this handler is prepared but not yet wired in — confirm intent.
try:
    from ruamel.yaml import YAML

    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping of long scalars
    USE_RUAMEL = True
except ImportError:
    USE_RUAMEL = False
# Cyrillic to Latin transliteration mapping (ISO 9:1995-inspired, ASCII-only).
# The hard sign 'Ъ'/'ъ' maps to 'A'/'a' (the Bulgarian hard sign is
# pronounced like 'a'); the soft sign 'Ь'/'ь' has no Latin equivalent and is
# dropped. (The original dict listed 'Ъ'/'ъ' twice; only the last entry ever
# took effect, so the duplicates are collapsed here.)
CYRILLIC_TO_LATIN = {
    # Uppercase
    'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E',
    'Ё': 'E', 'Ж': 'ZH', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K',
    'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R',
    'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'KH', 'Ц': 'TS',
    'Ч': 'CH', 'Ш': 'SH', 'Щ': 'SHCH', 'Ъ': 'A', 'Ы': 'Y', 'Ь': '',
    'Э': 'E', 'Ю': 'YU', 'Я': 'YA',
    # Lowercase
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e',
    'ё': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k',
    'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r',
    'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
    'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': 'a', 'ы': 'y', 'ь': '',
    'э': 'e', 'ю': 'yu', 'я': 'ya',
}
# Fullwidth Latin (U+FF21..U+FF3A, U+FF41..U+FF5A) to ASCII letters.
# Built programmatically instead of listing all 52 literal characters, which
# also avoids the ambiguous-looking fullwidth glyphs in source code.
FULLWIDTH_TO_ASCII = {
    chr(0xFF21 + i): chr(ord('A') + i) for i in range(26)  # Ａ..Ｚ -> A..Z
}
FULLWIDTH_TO_ASCII.update(
    {chr(0xFF41 + i): chr(ord('a') + i) for i in range(26)}  # ａ..ｚ -> a..z
)
# Ligatures and eszett expand to multi-letter ASCII sequences.
LIGATURE_MAP = {
    'Œ': 'OE',
    'œ': 'oe',
    'Æ': 'AE',
    'æ': 'ae',
    'ß': 'SS',
}
# Japanese katakana get no Latin transliteration here; characters in this
# set are removed from GHCID abbreviations entirely.
JAPANESE_KATAKANA = set(
    'アイウエオカキクケコサシスセソタチツテトナニヌネノ'
    'ハヒフヘホマミムメモヤユヨラリルレロワヲン'
    'ァィゥェォッャュョヴーゝゞ'
)
# Rough Arabic-to-Latin transliteration for characters that may appear in
# GHCID abbreviations. Hamza alone ('ء') has no Latin carrier and is dropped.
ARABIC_TO_LATIN = {
    'ا': 'A', 'أ': 'A', 'إ': 'I', 'آ': 'A', 'ب': 'B', 'ت': 'T',
    'ث': 'TH', 'ج': 'J', 'ح': 'H', 'خ': 'KH', 'د': 'D', 'ذ': 'DH',
    'ر': 'R', 'ز': 'Z', 'س': 'S', 'ش': 'SH', 'ص': 'S', 'ض': 'D',
    'ط': 'T', 'ظ': 'Z', 'ع': 'A', 'غ': 'GH', 'ف': 'F', 'ق': 'Q',
    'ك': 'K', 'ل': 'L', 'م': 'M', 'ن': 'N', 'ه': 'H', 'و': 'W',
    'ي': 'Y', 'ى': 'A', 'ة': 'A', 'ء': '',
}
def normalize_diacritics(text: str) -> str:
    """Strip combining diacritical marks, leaving the base letters.

    NFD decomposition splits each accented character into a base character
    followed by combining marks (Unicode category 'Mn'), which are dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
def transliterate_char(char: str) -> str:
    """Transliterate a single character to an ASCII string (possibly empty).

    Lookup order: fullwidth Latin, Cyrillic, ligatures, Arabic. Katakana is
    dropped, ASCII passes through unchanged, and anything else is
    diacritic-normalized or, failing that, discarded.
    """
    lookup_tables = (
        FULLWIDTH_TO_ASCII,
        CYRILLIC_TO_LATIN,
        LIGATURE_MAP,
        ARABIC_TO_LATIN,
    )
    for table in lookup_tables:
        if char in table:
            return table[char]
    if char in JAPANESE_KATAKANA:
        return ''  # Katakana has no mapping here; drop it
    if char.isascii():
        return char
    stripped = normalize_diacritics(char)
    if stripped.isascii():
        return stripped
    return ''  # Unknown non-ASCII character: discard
def transliterate_ghcid_abbreviation(abbrev: str) -> str:
    """Transliterate a GHCID abbreviation to uppercase ASCII letters.

    A trailing numeric suffix such as '-1909' or '-55' (used for collision
    resolution in GHCIDs) is detached first and re-attached verbatim at the
    end, so its digits survive the letters-only filter.
    """
    match = re.search(r'(-\d+)$', abbrev)
    if match:
        suffix = match.group(1)  # keep the hyphen and digits
        base = abbrev[:match.start()]
    else:
        suffix = ''
        base = abbrev
    # Transliterate character by character, then force uppercase.
    transliterated = ''.join(transliterate_char(ch) for ch in base).upper()
    # Keep only ASCII letters in the base part; the suffix is exempt.
    letters_only = ''.join(
        ch for ch in transliterated if ch.isascii() and ch.isalpha()
    )
    return letters_only + suffix
def extract_ghcid_parts(ghcid: str) -> "dict | None":
    """Split a GHCID string into its named components.

    GHCID format: {country}-{region}-{city}-{type}-{abbrev}[-{suffix}].
    Returns a dict with keys 'country', 'region', 'city', 'type', 'abbrev',
    or None when the string has fewer than five '-'-separated parts.
    (Annotation fixed: the original declared ``-> dict`` but returns None
    on parse failure.)
    """
    parts = ghcid.split('-')
    if len(parts) < 5:
        return None
    return {
        'country': parts[0],
        'region': parts[1],
        'city': parts[2],
        'type': parts[3],
        # The abbreviation may itself contain hyphens (e.g. '-1909' suffix).
        'abbrev': '-'.join(parts[4:]),
    }
def reconstruct_ghcid(parts: dict, new_abbrev: str) -> str:
    """Rebuild a GHCID string from its parts with a replacement abbreviation."""
    prefix = '-'.join([parts['country'], parts['region'], parts['city'], parts['type']])
    return f"{prefix}-{new_abbrev}"
def find_non_ascii_ghcid_files(custodian_dir: Path) -> list:
    """Return a sorted list of *.yaml files whose filename is not pure ASCII."""
    return sorted(
        path for path in custodian_dir.glob('*.yaml')
        if not path.name.isascii()
    )
def fix_ghcid_in_file(file_path: Path, dry_run: bool = True) -> dict:
    """Fix a non-ASCII GHCID in a single custodian YAML file.

    Returns a dict with:
    - old_ghcid: original GHCID (when found)
    - new_ghcid / new_filename: the fixed GHCID and target filename
    - old_filename: original filename
    - changes: list of human-readable change descriptions
    - error: error message, or None on success

    Fix vs. original: the "target file already exists" check now runs
    BEFORE the YAML is rewritten. Previously the file content was updated
    to the new GHCID and only then the rename was refused, leaving a file
    whose name and content disagreed.
    """
    result = {
        'old_filename': file_path.name,
        'changes': [],
        'error': None,
    }
    try:
        # Read the YAML file
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # NOTE(review): plain PyYAML is used here, so comments and formatting
        # in the source file are NOT preserved on rewrite, despite the
        # ruamel.yaml handler prepared at module level — confirm intent.
        data = yaml.safe_load(content)
        if not data or 'ghcid' not in data:
            result['error'] = 'No ghcid section found'
            return result

        old_ghcid = data['ghcid'].get('ghcid_current', '')
        result['old_ghcid'] = old_ghcid
        if not old_ghcid:
            result['error'] = 'No ghcid_current found'
            return result

        parts = extract_ghcid_parts(old_ghcid)
        if not parts:
            result['error'] = f'Could not parse GHCID: {old_ghcid}'
            return result

        # Transliterate the abbreviation; a no-op means it is already ASCII.
        old_abbrev = parts['abbrev']
        new_abbrev = transliterate_ghcid_abbreviation(old_abbrev)
        if old_abbrev == new_abbrev:
            result['error'] = 'Abbreviation is already ASCII'
            return result

        new_ghcid = reconstruct_ghcid(parts, new_abbrev)
        result['new_ghcid'] = new_ghcid
        result['new_filename'] = f"{new_ghcid}.yaml"
        result['changes'].append(f'Abbreviation: {old_abbrev} -> {new_abbrev}')
        result['changes'].append(f'GHCID: {old_ghcid} -> {new_ghcid}')
        if dry_run:
            return result

        # Bail out BEFORE touching the file if the rename target exists,
        # so we never leave a file whose content and name disagree.
        new_path = file_path.parent / result['new_filename']
        if new_path.exists():
            result['error'] = f'Target file already exists: {new_path}'
            return result

        now = datetime.now(timezone.utc).isoformat()

        # Update ghcid_current and prepend a history entry recording the fix.
        data['ghcid']['ghcid_current'] = new_ghcid
        if 'ghcid_history' not in data['ghcid']:
            data['ghcid']['ghcid_history'] = []
        data['ghcid']['ghcid_history'].insert(0, {
            'ghcid': new_ghcid,
            'valid_from': now,
            'valid_to': None,
            'reason': f'Corrected abbreviation from non-ASCII ({old_abbrev}) to ASCII ({new_abbrev}) per ABBREV-CHAR-FILTER rule'
        })
        # Close out the superseded history entry, if one is still open.
        if len(data['ghcid']['ghcid_history']) > 1:
            for entry in data['ghcid']['ghcid_history'][1:]:
                if entry.get('ghcid') == old_ghcid and entry.get('valid_to') is None:
                    entry['valid_to'] = now
                    break

        # Keep the GHCID entry in the identifiers list in sync.
        if 'identifiers' in data:
            for ident in data['identifiers']:
                if ident.get('identifier_scheme') == 'GHCID':
                    ident['identifier_value'] = new_ghcid

        # Write updated YAML, then rename the file to match the new GHCID.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        shutil.move(file_path, new_path)
        result['changes'].append(f'File renamed: {file_path.name} -> {new_path.name}')
        return result
    except Exception as e:
        result['error'] = str(e)
        return result
def main():
    """CLI entry point: scan the custodian directory and fix non-ASCII GHCIDs.

    Runs in dry-run mode by default; pass --apply to modify files.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix non-ASCII characters in GHCID abbreviations')
    # --dry-run is accepted for backward compatibility; dry-run is already
    # the default and only --apply switches it off (see `dry_run` below).
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Only show what would be changed (default: True)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the changes')
    # TODO(review): default is a user-specific absolute path; consider a
    # repo-relative default so the script is portable.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Path to custodian directory')
    args = parser.parse_args()
    dry_run = not args.apply

    print(f"{'DRY RUN - ' if dry_run else ''}Fixing non-ASCII GHCID abbreviations")
    print(f"Custodian directory: {args.custodian_dir}")
    print()

    # Find candidate files by filename; the per-file check re-validates.
    files = find_non_ascii_ghcid_files(args.custodian_dir)
    print(f"Found {len(files)} files with non-ASCII characters in filename")
    print()

    # Process each file, tallying outcomes for the summary.
    success_count = 0
    error_count = 0
    skip_count = 0
    for file_path in files:
        result = fix_ghcid_in_file(file_path, dry_run=dry_run)
        print(f"File: {result['old_filename']}")
        if result.get('error'):
            # 'already ASCII' is an expected no-op, not a failure.
            if 'already ASCII' in result['error']:
                print(f"  SKIP: {result['error']}")
                skip_count += 1
            else:
                print(f"  ERROR: {result['error']}")
                error_count += 1
        else:
            for change in result.get('changes', []):
                print(f"  {change}")
            success_count += 1
        print()

    # Summary
    print("=" * 60)
    print("Summary:")  # was an f-string with no placeholder (lint F541)
    print(f"  Total files: {len(files)}")
    print(f"  {'Would fix' if dry_run else 'Fixed'}: {success_count}")
    print(f"  Skipped: {skip_count}")
    print(f"  Errors: {error_count}")
    if dry_run:
        print()
        print("To apply changes, run with --apply flag")
if __name__ == '__main__':
    main()