#!/usr/bin/env python3
"""
Resolve CZ-XX (unknown region) files to proper ISO 3166-2:CZ region codes.

This script updates 36 Czech institution files that have placeholder XX region
codes to their correct ISO 3166-2:CZ region codes based on researched location
data.

Research completed 2025-12-07 via GeoNames database and web searches.
"""

import os
import re
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

import yaml

# Directory scanned by main() when no directory is passed explicitly.
DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')

# GeoNames Admin1 → ISO 3166-2:CZ region code mapping
ADMIN1_TO_ISO = {
    '52': '10',  # Prague
    '78': '64',  # South Moravian (Jihomoravský)
    '79': '31',  # South Bohemian (Jihočeský)
    '80': '63',  # Vysočina
    '81': '41',  # Karlovy Vary
    '82': '52',  # Hradec Králové
    '83': '51',  # Liberec
    '84': '71',  # Olomouc
    '85': '80',  # Moravian-Silesian (Moravskoslezský)
    '86': '53',  # Pardubice
    '87': '32',  # Plzeň
    '88': '20',  # Central Bohemian (Středočeský)
    '89': '42',  # Ústí nad Labem
    '90': '72',  # Zlín
}

# ISO 3166-2:CZ region code → English region name (used by get_region_name).
REGION_NAMES = {
    '10': 'Prague',
    '20': 'Central Bohemian',
    '31': 'South Bohemian',
    '32': 'Plzeň',
    '41': 'Karlovy Vary',
    '42': 'Ústí nad Labem',
    '51': 'Liberec',
    '52': 'Hradec Králové',
    '53': 'Pardubice',
    '63': 'Vysočina',
    '64': 'South Moravian',
    '71': 'Olomouc',
    '72': 'Zlín',
    '80': 'Moravian-Silesian',
}

# Research results: mapping from old filename suffix to resolution data
# Format: (new_region_code, new_city_code, city_name, geonames_id, admin1_code)
RESOLUTIONS = {
    # Archives (A)
    'A-SAČTÚ': ('10', 'PRA', 'Prague', 3067696, '52'),
    'A-SAČÚZK': ('10', 'PRA', 'Prague', 3067696, '52'),
    'A-SAERÚ': ('63', 'JIH', 'Jihlava', 3074199, '80'),
    'A-SAÚPOHS': ('64', 'BRN', 'Brno', 3078610, '78'),
    'A-BSS': ('51', 'MAS', 'Malá Skála', 3071223, '83'),
    'A-PSJAK': ('53', 'BNO', 'Brandýs nad Orlicí', 3078836, '86'),
    'A-ZI': ('10', 'PRA', 'Prague', 3067696, '52'),  # Admin location

    # Galleries (G)
    'G-GAU': ('52', 'HOS', 'Hostinné', 3075058, '82'),
    'G-GVP': ('20', 'MLB', 'Mladá Boleslav', 3070544, '88'),

    # Libraries (L) - Many are research institutes in Prague/Brno
    'L-SÚPRO': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABE064
    'L-ÚRB': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABE444
    'L-ÚSLOZ': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABE215
    'L-VŠZFA': ('10', 'PRA', 'Prague', 3067696, '52'),
    'L-VŠZR': ('10', 'PRA', 'Prague', 3067696, '52'),
    'L-VÚB': ('64', 'BRN', 'Brno', 3078610, '78'),  # BOC006
    'L-VÚI': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABC043
    'L-VÚP': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABC066
    'L-VÚRV': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABC162
    'L-VUTÚTMŘP': ('64', 'BRN', 'Brno', 3078610, '78'),
    'L-VVÚNP': ('64', 'BRN', 'Brno', 3078610, '78'),  # BOF045
    'L-ZVVZVÚV': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABF127

    # Museums (M)
    'M-BMOP': ('32', 'KPC', 'Klenčí pod Čerchovem', 3073644, '87'),
    'M-MD': ('10', 'PRA', 'Prague', 3067696, '52'),
    'M-MH': ('20', 'KNC', 'Kostelec nad Černými Lesy', 3073152, '88'),
    'M-MJD': ('32', 'CHU', 'Chudenice', 3077528, '87'),
    'M-MKISMDAH': ('63', 'HUM', 'Humpolec', 3074723, '80'),
    'M-MMGLK': ('20', 'POD', 'Poděbrady', 3068107, '88'),
    'M-MMM': ('42', 'MIK', 'Mikulášovice', 3070725, '89'),  # Mikcentrum!
    'M-MMSR': ('10', 'PRA', 'Prague', 3067696, '52'),
    'M-MRV': ('51', 'DES', 'Desná', 3077198, '83'),
    'M-MSČ': ('20', 'OST', 'Ostředek', 3068792, '88'),
    'M-MTZSŘ': ('52', 'DEO', 'Deštné v Orlických horách', 3077191, '82'),
    'M-MVBŽS': ('31', 'VOD', 'Vodňany', 3062642, '79'),
    'M-PDEHAM': ('53', 'HOL', 'Holice', 3075599, '86'),
    'M-PMJH': ('31', 'HUS', 'Husinec', 3074686, '79'),
    'M-PZV': ('51', 'PNJ', 'Paseky nad Jizerou', 3068552, '83'),
}


def generate_city_code(city_name: str) -> str:
    """Generate a city code of at most 3 letters from a city name.

    Diacritics are stripped first. A name with one significant word yields the
    first three letters of that word; a name with several significant words
    yields the initials of the first three. The Czech prepositions
    ``nad``, ``pod``, ``v``, ``u`` and ``na`` are not counted as significant.
    If nothing significant remains, the first three characters of the
    diacritic-free name are used.

    NOTE(review): the hand-assigned codes in RESOLUTIONS do not always follow
    this rule (e.g. 'Brandýs nad Orlicí' is 'BNO' there, 'BO' here), and no
    code in this file calls this function.
    """
    # Decompose accented letters, then drop the combining marks (category Mn).
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(
        c for c in normalized if unicodedata.category(c) != 'Mn'
    )

    # Skip common prefixes in Czech
    skip_words = {'nad', 'pod', 'v', 'u', 'na'}
    significant_words = [
        w for w in ascii_name.split() if w.lower() not in skip_words
    ]

    if len(significant_words) == 1:
        # Single word: first 3 letters
        return significant_words[0][:3].upper()
    if len(significant_words) >= 2:
        # Multi-word: initials
        return ''.join(w[0].upper() for w in significant_words[:3])
    return ascii_name[:3].upper()


def _ensure_child(parent: dict, key: str, factory):
    """Return ``parent[key]``, creating it with ``factory()`` if missing or null."""
    if parent.get(key) is None:
        parent[key] = factory()
    return parent[key]


def update_yaml_file(
    filepath: Path, resolution: tuple
) -> Tuple[Optional[str], Optional[str], Optional[Path]]:
    """
    Update a YAML file with resolved region/city data.

    The file is rewritten under the name ``<new GHCID>.yaml`` in the same
    directory and the original file is deleted when the name changed.

    Args:
        filepath: Path of the YAML file to update.
        resolution: ``(region_code, city_code, city_name, geonames_id,
            admin1_code)`` as stored in RESOLUTIONS.

    Returns:
        (old_ghcid, new_ghcid, new_filepath), or (None, None, None) with a
        printed warning when the file has no usable ``ghcid`` mapping, its
        current GHCID is not of the form ``CZ-XX-XXX-<TYPE>-<ABBREV>``, or the
        target file already exists. Nothing is written or deleted in those
        cases.
    """
    filepath = Path(filepath)
    region_code, city_code, city_name, geonames_id, admin1_code = resolution

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse YAML (safe_load returns None for an empty document)
    data = yaml.safe_load(content)
    ghcid_section = data.get('ghcid') if isinstance(data, dict) else None
    if not isinstance(ghcid_section, dict):
        print(f" WARNING: No ghcid section found in: {filepath.name}")
        return None, None, None

    # Extract current GHCID
    old_ghcid = ghcid_section.get('ghcid_current', '')

    # Build new GHCID
    # Pattern: CZ-XX-XXX-{TYPE}-{ABBREV} -> CZ-{REGION}-{CITY}-{TYPE}-{ABBREV}
    match = (
        re.match(r'CZ-XX-XXX-([A-Z])-(.+)$', old_ghcid)
        if isinstance(old_ghcid, str)
        else None
    )
    if not match:
        print(f" WARNING: Could not parse GHCID: {old_ghcid}")
        return None, None, None

    inst_type, abbrev = match.groups()
    new_ghcid = f"CZ-{region_code}-{city_code}-{inst_type}-{abbrev}"

    # Refuse to clobber a different, already existing record: the write below
    # would replace it and the unlink would then remove the only other copy.
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"
    if new_filepath != filepath and new_filepath.exists():
        print(
            " WARNING: Target file already exists, not overwriting: "
            f"{new_filepath.name}"
        )
        return None, None, None

    timestamp = datetime.now(timezone.utc).isoformat()
    region_name = get_region_name(region_code)

    # Update ghcid section
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['location_resolution'] = {
        'method': 'GEONAMES_RESEARCH',
        'country_code': 'CZ',
        'region_code': region_code,
        'region_name': region_name,
        'city_code': city_code,
        'city_name': city_name,
        'geonames_id': geonames_id,
        'admin1_code': admin1_code,
        'resolution_timestamp': timestamp,
        'research_date': '2025-12-07',
        'research_method': 'GeoNames database + web search verification'
    }

    # Add history entry
    _ensure_child(ghcid_section, 'ghcid_history', list).append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'Region resolved via GeoNames research: XX→{region_code}, city: {city_name} (GeoNames ID: {geonames_id})'
    })

    # Update provenance notes (timestamp[:10] is the YYYY-MM-DD date part)
    provenance = _ensure_child(data, 'provenance', dict)
    _ensure_child(provenance, 'notes', list).append(
        f'Region resolved {timestamp[:10]}: XX→CZ-{region_code} ({city_name}) via GeoNames research'
    )

    # Update location, creating the section when the file has none
    location = _ensure_child(data, 'location', dict)
    location['city'] = city_name
    location['country'] = 'CZ'
    location['region'] = region_name
    location['geonames_id'] = geonames_id

    # Write updated YAML
    with open(new_filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Remove old file if different
    if new_filepath != filepath:
        filepath.unlink()

    return old_ghcid, new_ghcid, new_filepath


def get_region_name(region_code: str) -> str:
    """Get region name from ISO 3166-2:CZ code ('Unknown' if not listed)."""
    return REGION_NAMES.get(region_code, 'Unknown')


def main(custodian_dir: Optional[Path] = None) -> None:
    """Resolve every ``CZ-XX-XXX-*.yaml`` file in the custodian directory.

    Each file's suffix (the part after ``CZ-XX-XXX-``) is looked up in
    RESOLUTIONS and the file is rewritten via update_yaml_file. Progress, a
    summary and any files still matching ``CZ-XX-*.yaml`` are printed.

    Args:
        custodian_dir: Directory to scan. Defaults to DEFAULT_CUSTODIAN_DIR.
    """
    if custodian_dir is None:
        custodian_dir = DEFAULT_CUSTODIAN_DIR
    custodian_dir = Path(custodian_dir)

    # Find all CZ-XX-XXX files
    xx_files = sorted(custodian_dir.glob('CZ-XX-XXX-*.yaml'))
    print(f"Found {len(xx_files)} CZ-XX-XXX files to resolve")

    resolved = 0
    failed = 0

    for filepath in xx_files:
        filename = filepath.stem

        # Extract suffix (e.g., "A-SAČTÚ" from "CZ-XX-XXX-A-SAČTÚ")
        suffix_match = re.match(r'CZ-XX-XXX-(.+)$', filename)
        if not suffix_match:
            print(f" SKIP: Could not parse filename: {filename}")
            failed += 1
            continue

        suffix = suffix_match.group(1)
        if suffix not in RESOLUTIONS:
            print(f" SKIP: No resolution for: {suffix}")
            failed += 1
            continue

        resolution = RESOLUTIONS[suffix]

        # One bad file must not abort the batch: report it and carry on.
        try:
            old_ghcid, new_ghcid, new_filepath = update_yaml_file(filepath, resolution)
        except Exception as e:
            print(f" ✗ Error processing {filepath.name}: {e}")
            failed += 1
            continue

        if old_ghcid and new_ghcid:
            print(f" ✓ {old_ghcid} → {new_ghcid}")
            resolved += 1
        else:
            print(f" ✗ Failed to update: {filepath.name}")
            failed += 1

    print(f"\n{'='*60}")
    print(f"SUMMARY: Resolved {resolved}/{len(xx_files)} files")
    if failed:
        print(f" Failed: {failed}")

    # Verify no CZ-XX files remain
    remaining = list(custodian_dir.glob('CZ-XX-*.yaml'))
    print(f"\nRemaining CZ-XX files: {len(remaining)}")
    if remaining:
        for f in remaining:
            print(f" - {f.name}")


if __name__ == '__main__':
    main()