#!/usr/bin/env python3
"""
Fix Czech region codes in custodian YAML files.

Problem:
- Some files use non-ISO codes:
  - Letter codes (JI, HL, ST, VY, PL, OL, PA, KR, MO, US, ZL, LI, KA, PR, JM, JC)
  - 3-digit district codes (317, 422, 521, 802, etc.)
  - Invalid 2-digit codes (78, 79, 81-90 etc.)

ISO 3166-2:CZ uses 2-digit NUTS-based codes for 14 regions:
  10 - Prague (Hlavní město Praha)
  20 - Central Bohemian (Středočeský)
  31 - South Bohemian (Jihočeský)
  32 - Plzeň (Plzeňský)
  41 - Karlovy Vary (Karlovarský)
  42 - Ústí nad Labem (Ústecký)
  51 - Liberec (Liberecký)
  52 - Hradec Králové (Královéhradecký)
  53 - Pardubice (Pardubický)
  63 - Vysočina
  64 - South Moravian (Jihomoravský)
  71 - Olomouc (Olomoucký)
  72 - Zlín (Zlínský)
  80 - Moravian-Silesian (Moravskoslezský)

For every ``CZ-<region>-<city>-<type>-<abbrev>.yaml`` file whose region part is
not a valid ISO code, the script rewrites the GHCID and region codes inside the
file, records the change in ``ghcid_history`` and renames the file.
"""

import os
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

# Mapping from letter codes to ISO 3166-2:CZ codes
LETTER_TO_ISO = {
    'HL': '10',  # Hlavní město Praha (Prague)
    'PR': '10',  # Prague alternate
    'ST': '20',  # Středočeský (Central Bohemian)
    'JC': '31',  # Jihočeský (South Bohemian)
    'PL': '32',  # Plzeňský (Plzeň)
    'KA': '41',  # Karlovarský (Karlovy Vary)
    'US': '42',  # Ústecký (Ústí nad Labem)
    'LI': '51',  # Liberecký (Liberec)
    'KR': '52',  # Královéhradecký (Hradec Králové)
    'PA': '53',  # Pardubický (Pardubice)
    'VY': '63',  # Vysočina
    'JI': '64',  # Jihomoravský (South Moravian) - CORRECT MAPPING!
    'JM': '64',  # Jihomoravský alternate
    'OL': '71',  # Olomoucký (Olomouc)
    'ZL': '72',  # Zlínský (Zlín)
    'MO': '80',  # Moravskoslezský (Moravian-Silesian)
}

# Mapping from 3-digit district codes to region codes
# Format: first 2 digits = region code, third digit = district
DISTRICT_3DIGIT_TO_REGION = {
    '317': '31',  # South Bohemian district
    '422': '42',  # Ústí nad Labem district
    '521': '52',  # Hradec Králové district
    '802': '80',  # Moravian-Silesian district
}

# Mapping for district-level 2-digit codes to region codes
# These appear to be district sub-codes that need region extraction
# Based on ISO 3166-2:CZ structure where districts are 3-char (e.g., 20A, 31B)
DISTRICT_TO_REGION = {
    # 78, 79 series - possibly old codes, need investigation
    '78': '80',  # Likely Moravian-Silesian area
    '79': '80',  # Likely Moravian-Silesian area
    '81': '80',  # Moravian-Silesian district
    '82': '80',  # Moravian-Silesian district (e.g., Bruntál)
    '83': '80',  # Moravian-Silesian district
    '84': '80',  # Moravian-Silesian district
    '85': '80',  # Moravian-Silesian district (e.g., Karviná)
    '86': '80',  # Moravian-Silesian district (e.g., Nový Jičín)
    '87': '80',  # Moravian-Silesian district (e.g., Opava)
    '88': '80',  # Moravian-Silesian district (e.g., Ostrava-město)
    '89': '80',  # Moravian-Silesian district (e.g., Frýdek-Místek)
    '90': '80',  # Moravian-Silesian district
}

# Valid ISO region codes
VALID_ISO_CODES = {
    '10', '20', '31', '32', '41', '42', '51',
    '52', '53', '63', '64', '71', '72', '80',
}

# Region names for documentation
REGION_NAMES = {
    '10': 'Prague (Hlavní město Praha)',
    '20': 'Central Bohemian (Středočeský)',
    '31': 'South Bohemian (Jihočeský)',
    '32': 'Plzeň (Plzeňský)',
    '41': 'Karlovy Vary (Karlovarský)',
    '42': 'Ústí nad Labem (Ústecký)',
    '51': 'Liberec (Liberecký)',
    '52': 'Hradec Králové (Královéhradecký)',
    '53': 'Pardubice (Pardubický)',
    '63': 'Vysočina',
    '64': 'South Moravian (Jihomoravský)',
    '71': 'Olomouc (Olomoucký)',
    '72': 'Zlín (Zlínský)',
    '80': 'Moravian-Silesian (Moravskoslezský)',
}

# Prefix every "unknown region code" error reason starts with; main() uses it
# to pull the offending code back out of the reason text.
_UNKNOWN_CODE_PREFIX = 'unknown region code:'

# Header of an existing block-style ghcid_history list.
#   group 1: the "ghcid_history:" line plus any blank lines after it
#   group 2: indentation of the first existing "- " item
_HISTORY_LIST_RE = re.compile(
    r'(ghcid_history:[ \t]*\n(?:[ \t]*\n)*)([ \t]*)(?=-\s)'
)


def get_correct_region_code(old_code: str) -> Tuple[Optional[str], str]:
    """
    Convert old region code to correct ISO 3166-2 code.

    The lookup order is: valid ISO code, letter code, 3-digit district code,
    2-digit district code.

    Args:
        old_code: Region part taken from a GHCID / file name.

    Returns:
        (correct_code, correction_type), where correction_type is one of
        'already_correct', 'letter_code', 'district_3digit',
        'district_2digit' or 'unknown'. correct_code is None only for
        'unknown'.
    """
    # Already correct ISO code
    if old_code in VALID_ISO_CODES:
        return old_code, 'already_correct'

    lookups = (
        (LETTER_TO_ISO, 'letter_code'),
        (DISTRICT_3DIGIT_TO_REGION, 'district_3digit'),
        (DISTRICT_TO_REGION, 'district_2digit'),  # 78-90 series
    )
    for table, correction_type in lookups:
        if old_code in table:
            return table[old_code], correction_type

    return None, 'unknown'


def _swap_after_prefix(content: str, prefix: str, old: str, new: str, *,
                       flags: int = 0, token_boundary: bool = False) -> str:
    """Replace `old` with `new` wherever it directly follows regex `prefix`.

    An optional opening quote between prefix and value is tolerated. With
    token_boundary=True the value must not be followed by another
    alphanumeric character, so a short code cannot match the start of a
    longer one.
    """
    tail = r'(?![A-Za-z0-9])' if token_boundary else ''
    pattern = '(' + prefix + '["\']?)' + re.escape(old) + tail
    # A callable replacement keeps `new` literal: no backslash or group
    # reference in it is interpreted by the regex engine.
    return re.sub(pattern, lambda m: m.group(1) + new, content, flags=flags)


def _history_entries(new_ghcid: str, old_ghcid: str, timestamp: str,
                     reason: str, old_reason: str) -> list:
    """Return the unindented YAML lines of the two history records."""
    return [
        f'- ghcid: {new_ghcid}',
        f'  valid_from: "{timestamp}"',
        '  valid_to: null',
        f'  reason: "{reason}"',
        f'- ghcid: {old_ghcid}',
        '  valid_from: null',
        f'  valid_to: "{timestamp}"',
        f'  reason: "{old_reason}"',
    ]


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Fix the YAML content with new GHCID and region codes.

    The text is edited with regular expressions (the YAML is not parsed):

    * ``ghcid_current:`` and ``identifier_value:`` values equal to old_ghcid
      are replaced by new_ghcid;
    * the first ``region_code:`` equal to old_region after each
      ``location_resolution:`` / ``location:`` key is replaced by new_region;
    * two ``ghcid_history`` records (new GHCID, old GHCID) are added, either
      at the top of an existing block-style list or as a new
      ``ghcid_history:`` key right after the ``ghcid_current`` line.
      Indentation is copied from the surrounding document.

    If ``ghcid_history:`` exists but is not followed by a block-style list
    (e.g. ``ghcid_history: []``), no history records are added.

    Args:
        content: Full text of the YAML file.
        old_ghcid: GHCID carrying the incorrect region code.
        new_ghcid: GHCID carrying the ISO region code.
        old_region: Incorrect region code.
        new_region: ISO 3166-2:CZ region code.

    Returns:
        The updated YAML text.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    region_name = REGION_NAMES.get(new_region, 'Unknown')
    reason = (
        f"Corrected region code from CZ-{old_region} to CZ-{new_region} "
        f"({region_name}) per ISO 3166-2:CZ"
    )

    # Replace GHCID in ghcid_current and in identifiers
    content = _swap_after_prefix(
        content, r'ghcid_current:\s*', old_ghcid, new_ghcid)
    content = _swap_after_prefix(
        content, r'identifier_value:\s*', old_ghcid, new_ghcid)

    # Replace region_code in location_resolution and in the location section.
    # DOTALL lets ".*?" run across lines from the section key to the first
    # region_code that carries the old value.
    for section in ('location_resolution', 'location'):
        content = _swap_after_prefix(
            content, section + r':.*?region_code:\s*', old_region, new_region,
            flags=re.DOTALL, token_boundary=True)

    if 'ghcid_history:' in content:
        entries = _history_entries(
            new_ghcid, old_ghcid, timestamp, reason,
            'Previous GHCID with incorrect region code')

        def prepend(match):
            # Reuse the indentation of the first existing item for the new
            # records, then hand it back to that existing item.
            indent = match.group(2)
            block = ''.join(f'{indent}{line}\n' for line in entries)
            return match.group(1) + block + indent

        # count=1: only the first history list is extended.
        content = _HISTORY_LIST_RE.sub(prepend, content, count=1)
    else:
        entries = ['ghcid_history:'] + _history_entries(
            new_ghcid, old_ghcid, timestamp, reason,
            'Original GHCID with incorrect region code')
        current_line = re.compile(
            r'^([ \t]*)ghcid_current:\s*["\']?'
            + re.escape(new_ghcid) + r'[^\n]*$',
            re.MULTILINE,
        )

        def append(match):
            # New key is a sibling of ghcid_current, so it gets the same
            # indentation and goes on the lines right after it.
            indent = match.group(1)
            block = ''.join(f'\n{indent}{line}' for line in entries)
            return match.group(0) + block

        content = current_line.sub(append, content, count=1)

    return content


def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a single YAML file and return results.

    The GHCID is derived from the file name
    (``CZ-<region>-<city>-<type>-<abbrev>.yaml``). When the region part needs
    correcting, the file content is fixed, written under the corrected name
    and the old file is removed.

    Args:
        filepath: Path of the custodian YAML file.
        dry_run: When True, nothing is read or written; the result only
            describes what would happen.

    Returns:
        A dict with a 'status' key:
        'skipped' (pattern mismatch or already correct), 'error' (unknown
        region code), 'collision' (target file already exists), 'would_fix'
        (dry run) or 'fixed'. Other keys depend on the status.

    Raises:
        OSError, UnicodeError: if reading or writing the file fails.
    """
    filename = filepath.name

    # Extract current GHCID from filename (e.g., CZ-JI-BRN-M-MBM.yaml)
    match = re.fullmatch(
        r'CZ-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filename)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}

    old_region, city, inst_type, abbrev = match.groups()

    # Get correct region code
    new_region, correction_type = get_correct_region_code(old_region)
    if correction_type == 'already_correct':
        return {'status': 'skipped', 'reason': 'already correct'}
    if correction_type == 'unknown':
        return {
            'status': 'error',
            'reason': f'{_UNKNOWN_CODE_PREFIX} {old_region}',
        }

    old_ghcid = f"CZ-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"CZ-{new_region}-{city}-{inst_type}-{abbrev}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    # Check if target file already exists (collision). Done before the
    # dry-run return so a dry run reports exactly what a real run would do.
    if new_filepath.exists() and filepath != new_filepath:
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': f'Target file {new_filename} already exists',
        }

    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'old_file': filename,
            'new_file': new_filename,
            'old_region': old_region,
            'new_region': new_region,
        }

    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(
        content, old_ghcid, new_ghcid, old_region, new_region)

    # Write the new file first; the old one is removed only after the write
    # succeeded, so a failure never loses the record.
    new_filepath.write_text(new_content, encoding='utf-8')
    if filepath != new_filepath:
        filepath.unlink()

    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_file': filename,
        'new_file': new_filename,
    }


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; None (the default) means sys.argv[1:].
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Fix Czech region codes in custodian files')
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Show what would be changed without making changes')
    parser.add_argument(
        '--dir', default='/Users/kempersc/apps/glam/data/custodian',
        help='Directory containing custodian files')
    parser.add_argument(
        '--limit', type=int, default=0,
        help='Limit number of files to process (0 = all)')
    args = parser.parse_args(argv)

    custodian_dir = Path(args.dir)

    # Find all CZ-*.yaml files
    cz_files = sorted(custodian_dir.glob('CZ-*.yaml'))
    print(f"Found {len(cz_files)} Czech files")

    results = {
        'fixed': [],
        'would_fix': [],
        'skipped': [],
        'errors': [],
        'collisions': [],
    }

    processed = 0  # files that needed attention (the --limit budget)
    examined = 0   # files actually looked at, including skipped ones
    for filepath in cz_files:
        if args.limit > 0 and processed >= args.limit:
            break
        examined += 1

        try:
            result = process_file(filepath, dry_run=args.dry_run)
        except (OSError, UnicodeError) as exc:
            # One unreadable/unwritable file must not abort the whole batch.
            result = {
                'status': 'error',
                'reason': f'{type(exc).__name__}: {exc}',
            }

        status = result['status']
        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f"  {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
            processed += 1
        elif status == 'collision':
            results['collisions'].append(result)
            print(f"  COLLISION: {result['old_ghcid']} -> "
                  f"{result['new_ghcid']}: {result['reason']}")
            processed += 1
        elif status == 'error':
            results['errors'].append((filepath.name, result['reason']))
            print(f"  ERROR: {filepath.name} - {result['reason']}")
            processed += 1
        else:
            # Skipped files are not printed, only counted in the summary.
            results['skipped'].append(
                (filepath.name, result.get('reason', '')))

    skip_reasons = Counter(reason for _, reason in results['skipped'])
    already_correct = skip_reasons.pop('already correct', 0)
    other_skipped = sum(skip_reasons.values())
    not_examined = len(cz_files) - examined

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f"  Fixed/Would fix: "
          f"{len(results['fixed']) + len(results['would_fix'])}")
    print(f"  Collisions: {len(results['collisions'])}")
    print(f"  Already correct: {already_correct}")
    if other_skipped:
        print(f"  Skipped (other): {other_skipped}")
    if not_examined:
        print(f"  Not examined (limit reached): {not_examined}")
    print(f"  Errors: {len(results['errors'])}")

    # Exact per-code tally of unknown region codes.
    unknown_codes = Counter(
        reason[len(_UNKNOWN_CODE_PREFIX):].strip()
        for _, reason in results['errors']
        if reason.startswith(_UNKNOWN_CODE_PREFIX)
    )
    if unknown_codes:
        print("\nUnknown region codes (need mapping):")
        for code in sorted(unknown_codes):
            print(f"  {code}: {unknown_codes[code]} files")


if __name__ == '__main__':
    main()