#!/usr/bin/env python3
"""
Fix non-ASCII characters in GHCID abbreviations.

This script:
1. Finds all custodian files with non-ASCII characters in the GHCID
2. Transliterates Cyrillic, normalizes fullwidth Latin, removes Japanese katakana, etc.
3. Updates the GHCID in the YAML file
4. Renames the file to match the new GHCID

Per AGENTS.md Rule 32 and .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md:
- ONLY ASCII uppercase letters (A-Z) are permitted in GHCID abbreviations
- Diacritics MUST be normalized to ASCII equivalents
- Non-Latin scripts MUST be transliterated to Latin characters
"""
# Standard library
import unicodedata
import re
import shutil
from pathlib import Path
from datetime import datetime, timezone

# Third-party: PyYAML is required; ruamel.yaml is optional (see try-block below)
import yaml
# Use ruamel.yaml to preserve formatting.
# NOTE(review): yaml_handler/USE_RUAMEL are configured here but never
# consulted below -- fix_ghcid_in_file() writes with plain yaml.dump(),
# so round-trip formatting is NOT actually preserved. Confirm intent.
try:
    from ruamel.yaml import YAML

    yaml_handler = YAML()
    yaml_handler.preserve_quotes = True
    yaml_handler.width = 4096  # Prevent line wrapping
    USE_RUAMEL = True
except ImportError:
    # ruamel.yaml not installed; callers could fall back to PyYAML.
    USE_RUAMEL = False
# Cyrillic to Latin transliteration mapping (based on ISO 9:1995, with a
# Bulgarian-specific reading of the hard sign).
#
# FIX: the original literal listed 'Ъ'/'ъ' twice (first as '' in the ISO
# rows, then as 'A'/'a' in the Bulgarian rows). Duplicate keys in a dict
# literal are silently overwritten (last wins), so the effective mapping
# was always 'A'/'a'; the dead duplicates are removed here.
CYRILLIC_TO_LATIN = {
    # Uppercase
    'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E',
    'Ё': 'E', 'Ж': 'ZH', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K',
    'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R',
    'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'KH', 'Ц': 'TS',
    'Ч': 'CH', 'Ш': 'SH', 'Щ': 'SHCH', 'Ы': 'Y', 'Ь': '',
    'Э': 'E', 'Ю': 'YU', 'Я': 'YA',
    # Lowercase
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e',
    'ё': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k',
    'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r',
    'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
    'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': 'a', 'ы': 'y', 'ь': '',
    'э': 'e', 'ю': 'yu', 'я': 'ya',
    # Bulgarian-specific: hard sign is pronounced like 'a'
    'Ъ': 'A',
}
# Fullwidth Latin (U+FF21-U+FF3A, U+FF41-U+FF5A) to ASCII mapping.
#
# FIX: the previous literal used plain ASCII letters as keys (an identity
# map), so genuine fullwidth letters never matched and were silently
# dropped by transliterate_char(). Each fullwidth letter sits at a fixed
# offset (0xFEE0) above its ASCII counterpart, so the table is built
# programmatically -- no chance of a mistyped key.
FULLWIDTH_TO_ASCII = {
    chr(ord(c) + 0xFEE0): c
    for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
}
# Special ligatures that NFD decomposition cannot split into base letters,
# so they need explicit multi-character replacements.
LIGATURE_MAP = {
    'Œ': 'OE', 'œ': 'oe',
    'Æ': 'AE', 'æ': 'ae',
    'ß': 'SS',  # German eszett; conventional uppercase form is 'SS'
}
# Japanese katakana - these should be removed from GHCID
# (no Latin transliteration is attempted; the characters are dropped).
# NOTE(review): voiced/semi-voiced forms (e.g. precomposed ガ, パ) are not
# listed, and 'ゝ'/'ゞ' are hiragana iteration marks rather than katakana --
# confirm intent. In practice any unmapped non-ASCII character is dropped
# by transliterate_char() anyway, so coverage gaps are harmless here.
JAPANESE_KATAKANA = set('アイウエオカキクケコサシスセソタチツテトナニヌネノ'
                        'ハヒフヘホマミムメモヤユヨラリルレロワヲン'
                        'ァィゥェォッャュョヴーゝゞ')
# Arabic characters that might appear in GHCIDs, mapped to a rough Latin
# romanization (isolated letter forms only; presumably sufficient for the
# short abbreviations handled here -- TODO confirm against real data).
ARABIC_TO_LATIN = {
    'ا': 'A', 'أ': 'A', 'إ': 'I', 'آ': 'A',
    'ب': 'B', 'ت': 'T', 'ث': 'TH', 'ج': 'J',
    'ح': 'H', 'خ': 'KH', 'د': 'D', 'ذ': 'DH',
    'ر': 'R', 'ز': 'Z', 'س': 'S', 'ش': 'SH',
    'ص': 'S', 'ض': 'D', 'ط': 'T', 'ظ': 'Z',
    'ع': 'A', 'غ': 'GH', 'ف': 'F', 'ق': 'Q',
    'ك': 'K', 'ل': 'L', 'م': 'M', 'ن': 'N',
    'ه': 'H', 'و': 'W', 'ي': 'Y', 'ى': 'A',
    # Ta marbuta and hamza: hamza is dropped entirely.
    'ة': 'A', 'ء': '',
}
def normalize_diacritics(text: str) -> str:
    """Strip combining diacritical marks, leaving the ASCII-ish base letters.

    The text is decomposed with NFD so that each accented character splits
    into a base character plus combining marks; the marks (Unicode category
    'Mn', Mark/Nonspacing) are then filtered out.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def transliterate_char(char: str) -> str:
    """Transliterate a single character to ASCII.

    Lookup order: fullwidth Latin, Cyrillic, ligatures, Arabic. Katakana
    is dropped outright. ASCII characters pass through unchanged; any
    remaining character is stripped of combining diacritics and kept only
    if the result is ASCII, otherwise it is dropped.
    """
    # Explicit transliteration tables take priority.
    for table in (FULLWIDTH_TO_ASCII, CYRILLIC_TO_LATIN,
                  LIGATURE_MAP, ARABIC_TO_LATIN):
        if char in table:
            return table[char]

    # Japanese katakana has no Latin transliteration here -- drop it.
    if char in JAPANESE_KATAKANA:
        return ''

    # Plain ASCII needs no work.
    if char.isascii():
        return char

    # Accented Latin: strip combining marks and keep the ASCII base.
    stripped = normalize_diacritics(char)
    if stripped.isascii():
        return stripped

    # Unknown script or symbol -- drop it.
    return ''
def transliterate_ghcid_abbreviation(abbrev: str) -> str:
    """Transliterate a GHCID abbreviation to ASCII-only.

    Preserves numeric year suffixes (e.g. "-1909", "-55"), which are used
    for collision resolution in GHCIDs.
    """
    # Split off a trailing "-<digits>" suffix, if any, before transliterating.
    base, suffix = abbrev, ''
    match = re.search(r'(-\d+)$', abbrev)
    if match:
        suffix = match.group(1)          # keep the hyphen and the digits
        base = abbrev[:match.start()]

    # Transliterate character by character, then force uppercase.
    translated = ''.join(transliterate_char(ch) for ch in base).upper()

    # Keep only ASCII letters in the base (the numeric suffix is exempt).
    letters_only = ''.join(ch for ch in translated if ch.isascii() and ch.isalpha())

    # Re-attach the preserved suffix.
    return letters_only + suffix
def extract_ghcid_parts(ghcid: str) -> dict:
    """Split a GHCID into its components; return None if it is malformed.

    GHCID format: {country}-{region}-{city}-{type}-{abbrev}[-{suffix}]
    The abbreviation itself may contain hyphens (name suffixes), so every
    field from the fifth onward is rejoined into 'abbrev'.
    """
    fields = ghcid.split('-')
    if len(fields) < 5:
        return None  # too few fields to be a valid GHCID
    return {
        'country': fields[0],
        'region': fields[1],
        'city': fields[2],
        'type': fields[3],
        'abbrev': '-'.join(fields[4:]),  # keep hyphenated name suffixes intact
    }
def reconstruct_ghcid(parts: dict, new_abbrev: str) -> str:
    """Rebuild a GHCID string from parsed parts with a replacement abbreviation."""
    prefix = '-'.join([parts['country'], parts['region'], parts['city'], parts['type']])
    return f"{prefix}-{new_abbrev}"
def find_non_ascii_ghcid_files(custodian_dir: Path) -> list:
    """Return all *.yaml files in custodian_dir whose filenames contain
    non-ASCII characters, sorted by path.

    Only the top level of the directory is scanned (non-recursive glob),
    and only the filename is inspected -- a non-ASCII name implies a
    non-ASCII GHCID, since files are named after their GHCID.
    """
    return sorted(p for p in custodian_dir.glob('*.yaml') if not p.name.isascii())
def fix_ghcid_in_file(file_path: Path, dry_run: bool = True) -> dict:
    """Fix non-ASCII GHCID in a single file.

    Reads the custodian YAML, transliterates the GHCID abbreviation to
    ASCII, and (unless dry_run) updates ghcid_current, ghcid_history and
    the identifiers list, rewrites the file, and renames it to match the
    new GHCID.

    Returns dict with:
    - old_ghcid: Original GHCID
    - new_ghcid: Fixed GHCID
    - old_filename: Original filename
    - new_filename: New filename
    - changes: List of changes made
    - error: Error message if any
    """
    result = {
        'old_filename': file_path.name,
        'changes': [],
        'error': None
    }

    try:
        # Read the YAML file
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML (safe_load: never executes arbitrary tags)
        data = yaml.safe_load(content)

        if not data or 'ghcid' not in data:
            result['error'] = 'No ghcid section found'
            return result

        old_ghcid = data['ghcid'].get('ghcid_current', '')
        result['old_ghcid'] = old_ghcid

        if not old_ghcid:
            result['error'] = 'No ghcid_current found'
            return result

        # Extract parts
        parts = extract_ghcid_parts(old_ghcid)
        if not parts:
            result['error'] = f'Could not parse GHCID: {old_ghcid}'
            return result

        # Transliterate the abbreviation
        old_abbrev = parts['abbrev']
        new_abbrev = transliterate_ghcid_abbreviation(old_abbrev)

        if old_abbrev == new_abbrev:
            # Filename may be non-ASCII for other reasons; nothing to fix here.
            result['error'] = 'Abbreviation is already ASCII'
            return result

        # Construct new GHCID
        new_ghcid = reconstruct_ghcid(parts, new_abbrev)
        result['new_ghcid'] = new_ghcid
        result['new_filename'] = f"{new_ghcid}.yaml"

        result['changes'].append(f'Abbreviation: {old_abbrev} -> {new_abbrev}')
        result['changes'].append(f'GHCID: {old_ghcid} -> {new_ghcid}')

        if dry_run:
            return result

        # FIX: check for a rename collision BEFORE mutating the file.
        # Previously the YAML was rewritten first and the collision was
        # detected only at rename time, leaving the file's content updated
        # but its name stale (an inconsistent on-disk state).
        new_path = file_path.parent / result['new_filename']
        if new_path.exists():
            result['error'] = f'Target file already exists: {new_path}'
            return result

        # Timestamp shared by the new history entry and the closed-out one.
        now = datetime.now(timezone.utc).isoformat()

        # Update ghcid_current
        data['ghcid']['ghcid_current'] = new_ghcid

        # Add to ghcid_history (newest entry first)
        if 'ghcid_history' not in data['ghcid']:
            data['ghcid']['ghcid_history'] = []

        data['ghcid']['ghcid_history'].insert(0, {
            'ghcid': new_ghcid,
            'valid_from': now,
            'valid_to': None,
            'reason': f'Corrected abbreviation from non-ASCII ({old_abbrev}) to ASCII ({new_abbrev}) per ABBREV-CHAR-FILTER rule'
        })

        # Close out the superseded (still-open) entry for the old GHCID,
        # if present. Slicing past the end yields an empty list, so no
        # length guard is needed.
        for entry in data['ghcid']['ghcid_history'][1:]:
            if entry.get('ghcid') == old_ghcid and entry.get('valid_to') is None:
                entry['valid_to'] = now
                break

        # Update identifiers list (every GHCID-scheme identifier is refreshed)
        if 'identifiers' in data:
            for ident in data['identifiers']:
                if ident.get('identifier_scheme') == 'GHCID':
                    ident['identifier_value'] = new_ghcid

        # Write updated YAML.
        # NOTE(review): plain yaml.dump() discards the original file's
        # formatting even though a ruamel handler is configured at module
        # level -- confirm whether round-tripping was intended.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        # Rename file to match the new GHCID (collision already ruled out).
        shutil.move(file_path, new_path)
        result['changes'].append(f'File renamed: {file_path.name} -> {new_path.name}')

        return result

    except Exception as e:
        # Report the failure in the result so a batch run can continue
        # with the remaining files instead of aborting.
        result['error'] = str(e)
        return result
def main():
    """CLI entry point: scan the custodian directory and fix each offending file."""
    import argparse

    parser = argparse.ArgumentParser(description='Fix non-ASCII characters in GHCID abbreviations')
    # NOTE: --dry-run is accepted for symmetry/readability but is purely
    # informational: dry-run is the default and only --apply turns it off
    # (args.dry_run itself is never consulted below).
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Only show what would be changed (default: True)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the changes')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Path to custodian directory')

    args = parser.parse_args()

    # Changes are only written when --apply is given explicitly.
    dry_run = not args.apply

    print(f"{'DRY RUN - ' if dry_run else ''}Fixing non-ASCII GHCID abbreviations")
    print(f"Custodian directory: {args.custodian_dir}")
    print()

    # Find candidate files: a non-ASCII filename implies a non-ASCII GHCID,
    # since custodian files are named after their GHCID.
    files = find_non_ascii_ghcid_files(args.custodian_dir)
    print(f"Found {len(files)} files with non-ASCII characters in filename")
    print()

    # Process each file, tallying outcomes for the summary.
    success_count = 0
    error_count = 0
    skip_count = 0

    for file_path in files:
        result = fix_ghcid_in_file(file_path, dry_run=dry_run)

        print(f"File: {result['old_filename']}")

        if result.get('error'):
            # "already ASCII" is an expected false positive, not a failure.
            if 'already ASCII' in result['error']:
                print(f"  SKIP: {result['error']}")
                skip_count += 1
            else:
                print(f"  ERROR: {result['error']}")
                error_count += 1
        else:
            for change in result.get('changes', []):
                print(f"  {change}")
            success_count += 1

        print()

    # Summary
    print("=" * 60)
    print("Summary:")  # fixed: was an f-string with no placeholders
    print(f"  Total files: {len(files)}")
    print(f"  {'Would fix' if dry_run else 'Fixed'}: {success_count}")
    print(f"  Skipped: {skip_count}")
    print(f"  Errors: {error_count}")

    if dry_run:
        print()
        print("To apply changes, run with --apply flag")


if __name__ == '__main__':
    main()