#!/usr/bin/env python3
"""
Handle GHCID collisions by adding name suffixes.

When fixing region codes, some files may collide with existing files.
This script resolves those by adding snake_case name suffixes.
"""

import argparse
import os
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Same mappings as the main fix script
LETTER_TO_ISO = {
    'HL': '10', 'PR': '10', 'ST': '20', 'JC': '31', 'PL': '32',
    'KA': '41', 'US': '42', 'LI': '51', 'KR': '52', 'PA': '53',
    'VY': '63', 'JI': '64', 'JM': '64', 'OL': '71', 'ZL': '72',
    'MO': '80',
}

DISTRICT_TO_REGION = {
    '78': '80', '79': '80', '81': '80', '82': '80', '83': '80', '84': '80',
    '85': '80', '86': '80', '87': '80', '88': '80', '89': '80', '90': '80',
}

VALID_ISO_CODES = {
    '10', '20', '31', '32', '41', '42', '51',
    '52', '53', '63', '64', '71', '72', '80',
}

REGION_NAMES = {
    '10': 'Prague',
    '20': 'Central Bohemian',
    '31': 'South Bohemian',
    '32': 'Plzeň',
    '41': 'Karlovy Vary',
    '42': 'Ústí nad Labem',
    '51': 'Liberec',
    '52': 'Hradec Králové',
    '53': 'Pardubice',
    '63': 'Vysočina',
    '64': 'South Moravian',
    '71': 'Olomouc',
    '72': 'Zlín',
    '80': 'Moravian-Silesian',
}

# Filename layout: CZ-<region>-<city>-<type letter>-<abbreviation>.yaml
_FILENAME_RE = re.compile(r'CZ-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml')

# Negative lookahead that stops an identifier match from ending in the middle
# of a longer identifier (e.g. "CZ-HL-PRA-L-AB" inside "CZ-HL-PRA-L-ABC").
_ID_END = r'(?![\w-])'


def get_correct_region_code(old_code: str) -> Optional[str]:
    """Get correct ISO code for a region.

    Returns ``old_code`` unchanged when it is already in ``VALID_ISO_CODES``,
    the mapped value when it is a key of ``LETTER_TO_ISO`` or
    ``DISTRICT_TO_REGION``, and ``None`` when the code is unknown.
    """
    if old_code in VALID_ISO_CODES:
        return old_code
    if old_code in LETTER_TO_ISO:
        return LETTER_TO_ISO[old_code]
    if old_code in DISTRICT_TO_REGION:
        return DISTRICT_TO_REGION[old_code]
    return None


def generate_name_suffix(name: str) -> str:
    """Convert institution name to snake_case suffix.

    The result contains only ``a-z``, ``0-9`` and single underscores, with no
    leading or trailing underscore. It is an empty string when ``name`` has no
    character that survives this reduction.
    """
    # Normalize unicode (NFD decomposition) and remove diacritics
    normalized = unicodedata.normalize('NFD', name)
    ascii_name = ''.join(
        c for c in normalized if unicodedata.category(c) != 'Mn'
    )

    # Convert to lowercase
    lowercase = ascii_name.lower()

    # Remove apostrophes, commas, and other punctuation
    no_punct = re.sub(r"[''`\",.:;!?()[\]{}]", '', lowercase)

    # Replace spaces and hyphens with underscores
    underscored = re.sub(r'[\s\-]+', '_', no_punct)

    # Remove any remaining non-alphanumeric characters (except underscores)
    clean = re.sub(r'[^a-z0-9_]', '', underscored)

    # Collapse multiple underscores
    final = re.sub(r'_+', '_', clean).strip('_')

    return final


def _strip_yaml_quotes(value: str) -> str:
    """Drop one pair of matching single or double quotes around ``value``."""
    if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
        return value[1:-1].strip()
    return value


def extract_name_from_yaml(filepath: Path) -> Optional[str]:
    """Extract institution name from YAML file.

    The file is scanned with regular expressions, not parsed as YAML. The
    first ``name:`` directly under ``original_entry:`` wins; otherwise the
    first ``claim_value:`` is used. Surrounding quotes are removed. Returns
    ``None`` when neither key is found or the value is empty.
    """
    content = filepath.read_text(encoding='utf-8')

    patterns = (
        # Try to find name in original_entry
        r'original_entry:\s*\n\s*name:\s*(.+)',
        # Try custodian_name.claim_value
        r'claim_value:\s*(.+)',
    )
    for pattern in patterns:
        match = re.search(pattern, content)
        if match:
            return _strip_yaml_quotes(match.group(1).strip()) or None

    return None


def _format_history_items(indent: str, new_ghcid: str, old_ghcid: str,
                          timestamp: str, reason: str,
                          old_reason: str) -> str:
    """Render the two ghcid_history list items, each line prefixed by indent.

    The returned text has no trailing newline.
    """
    lines = [
        f'- ghcid: {new_ghcid}',
        f'  valid_from: "{timestamp}"',
        '  valid_to: null',
        f'  reason: "{reason}"',
        f'- ghcid: {old_ghcid}',
        '  valid_from: null',
        f'  valid_to: "{timestamp}"',
        f'  reason: "{old_reason}"',
    ]
    return '\n'.join(indent + line for line in lines)


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Fix the YAML content with new GHCID and region codes.

    Works on the raw text with regular expressions:

    * ``ghcid_current:`` and ``identifier_value:`` entries equal to
      ``old_ghcid`` are rewritten to ``new_ghcid``;
    * ``region_code:`` entries equal to ``old_region`` that follow a
      ``location_resolution:`` or ``location:`` key become ``new_region``;
    * two ``ghcid_history`` items (new and old GHCID) are inserted once,
      either at the top of an existing ``ghcid_history:`` list or as a new
      ``ghcid_history:`` key right after the ``ghcid_current:`` line.

    Inserted lines reuse the indentation found in ``content``. Returns the
    modified text; ``content`` itself is not changed.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    region_name = REGION_NAMES.get(new_region, 'Unknown')
    reason = f"Corrected region code from CZ-{old_region} to CZ-{new_region} ({region_name}) with name suffix for collision resolution per ISO 3166-2:CZ"

    # Callable replacements: the inserted text is never interpreted as a
    # regex replacement template.
    def put_ghcid(match):
        return match.group(1) + new_ghcid

    def put_region(match):
        return match.group(1) + new_region

    # Replace GHCID in ghcid_current
    content = re.sub(
        r'(ghcid_current:\s*)' + re.escape(old_ghcid) + _ID_END,
        put_ghcid,
        content
    )

    # Replace GHCID in identifiers
    content = re.sub(
        r'(identifier_value:\s*)' + re.escape(old_ghcid) + _ID_END,
        put_ghcid,
        content
    )

    # Replace region_code in location_resolution
    content = re.sub(
        r'(location_resolution:.*?region_code:\s*)'
        + re.escape(old_region) + r'\b',
        put_region,
        content,
        flags=re.DOTALL
    )

    # Replace region_code in location section
    content = re.sub(
        r'(location:.*?region_code:\s*)' + re.escape(old_region) + r'\b',
        put_region,
        content,
        flags=re.DOTALL
    )

    # Check if ghcid_history already exists
    if 'ghcid_history:' in content:
        def prepend_items(match):
            # group(2) is the indentation of the first existing list item;
            # the new items use it too and it is re-emitted for that item.
            item_indent = match.group(2)
            items = _format_history_items(
                item_indent, new_ghcid, old_ghcid, timestamp, reason,
                "Previous GHCID before collision resolution",
            )
            return match.group(1) + items + '\n' + item_indent

        content = re.sub(
            r'(ghcid_history:\s*\n)([ \t]*)',
            prepend_items,
            content,
            count=1
        )
    else:
        match = re.search(
            r'ghcid_current:\s*' + re.escape(new_ghcid) + _ID_END,
            content
        )
        if match:
            # Align the new key with the column of the ghcid_current key.
            line_start = content.rfind('\n', 0, match.start()) + 1
            key_indent = ' ' * (match.start() - line_start)
            items = _format_history_items(
                key_indent + '  ', new_ghcid, old_ghcid, timestamp, reason,
                "Original GHCID before collision resolution",
            )
            history_entry = f'\n{key_indent}ghcid_history:\n{items}'
            content = (
                content[:match.end()] + history_entry + content[match.end():]
            )

    return content


def process_collision_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a file that couldn't be fixed due to collision.

    Derives the corrected, name-suffixed GHCID from the filename and the
    institution name found in the file. Unless ``dry_run`` is set, writes the
    fixed content to ``<new_ghcid>.yaml`` next to the original and deletes
    the original.

    Returns a dict whose ``status`` is one of ``skipped``, ``error``,
    ``collision``, ``would_fix`` (dry run only) or ``fixed``; the other keys
    depend on the status.
    """
    filename = filepath.name

    # Extract current GHCID from filename
    match = _FILENAME_RE.fullmatch(filename)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}

    old_region, city, inst_type, abbrev = match.groups()

    # Get correct region code
    new_region = get_correct_region_code(old_region)
    if not new_region:
        return {
            'status': 'error',
            'reason': f'unknown region code: {old_region}',
        }

    if old_region == new_region:
        return {'status': 'skipped', 'reason': 'already correct'}

    # Get institution name
    inst_name = extract_name_from_yaml(filepath)
    if not inst_name:
        return {
            'status': 'error',
            'reason': 'could not extract institution name',
        }

    # Generate name suffix
    name_suffix = generate_name_suffix(inst_name)
    if not name_suffix:
        # Nothing in the name survives ASCII reduction; an empty suffix
        # would produce a GHCID ending in "-" that disambiguates nothing.
        return {
            'status': 'error',
            'reason': f'empty name suffix for institution name: {inst_name}',
        }

    old_ghcid = f"CZ-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"CZ-{new_region}-{city}-{inst_type}-{abbrev}-{name_suffix}"

    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    # Check if new file already exists. This runs before the dry-run return
    # so a dry run reports the same collisions a real run would.
    if new_filepath.exists():
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': 'Target still exists even with name suffix'
        }

    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'name': inst_name,
            'name_suffix': name_suffix,
        }

    # Read, fix, and write content
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(
        content, old_ghcid, new_ghcid, old_region, new_region
    )

    # Write the new file first; the original is removed only afterwards.
    new_filepath.write_text(new_content, encoding='utf-8')
    filepath.unlink()

    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'name': inst_name,
    }


def main():
    """Command-line entry point: fix every CZ file with a non-ISO region."""
    parser = argparse.ArgumentParser(
        description='Handle Czech region code collisions'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed')
    parser.add_argument('--dir',
                        default='/Users/kempersc/apps/glam/data/custodian',
                        help='Directory')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)

    # Find all CZ-*.yaml files with non-ISO region codes
    non_iso_codes = set(LETTER_TO_ISO) | set(DISTRICT_TO_REGION)

    # A set guards against the same path being collected twice.
    collision_files = set()
    for code in non_iso_codes:
        collision_files.update(custodian_dir.glob(f'CZ-{code}-*.yaml'))

    print(f"Found {len(collision_files)} files with non-ISO region codes (collision victims)")

    results = {'fixed': [], 'would_fix': [], 'errors': [], 'collisions': []}

    for filepath in sorted(collision_files):
        result = process_collision_file(filepath, dry_run=args.dry_run)
        status = result['status']

        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f"  {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
        elif status == 'collision':
            results['collisions'].append(result)
            print(f"  COLLISION: {result}")
        elif status == 'error':
            results['errors'].append(result)
            print(f"  ERROR: {filepath.name} - {result['reason']}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f"  Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
    print(f"  Collisions: {len(results['collisions'])}")
    print(f"  Errors: {len(results['errors'])}")


if __name__ == '__main__':
    main()