# glam/scripts/resolve_cz_xx_regions.py

#!/usr/bin/env python3
"""
Resolve CZ-XX (unknown region) files to proper ISO 3166-2:CZ region codes.

This script updates 36 Czech institution files that have placeholder XX region codes
to their correct ISO 3166-2:CZ region codes based on researched location data.

Research completed 2025-12-07 via GeoNames database and web searches.
"""

import os
import re
import yaml
from datetime import datetime, timezone
from pathlib import Path

# GeoNames Admin1 → ISO 3166-2:CZ region code mapping.
# Keys are GeoNames admin1 codes and values are the region part of the
# ISO 3166-2:CZ code (the "10" in "CZ-10"); both are kept as strings.
# NOTE(review): none of the functions visible in this file reads this table;
# RESOLUTIONS carries both codes for every entry. It appears to be kept as
# reference data — confirm before removing.
ADMIN1_TO_ISO = {
    '52': '10',  # Prague
    '78': '64',  # South Moravian (Jihomoravský)
    '79': '31',  # South Bohemian (Jihočeský)
    '80': '63',  # Vysočina
    '81': '41',  # Karlovy Vary
    '82': '52',  # Hradec Králové
    '83': '51',  # Liberec
    '84': '71',  # Olomouc
    '85': '80',  # Moravian-Silesian (Moravskoslezský)
    '86': '53',  # Pardubice
    '87': '32',  # Plzeň
    '88': '20',  # Central Bohemian (Středočeský)
    '89': '42',  # Ústí nad Labem
    '90': '72',  # Zlín
}

# Research results: mapping from old filename suffix to resolution data.
# Key:   the "{TYPE}-{ABBREV}" part of the file stem that follows "CZ-XX-XXX-"
#        (main() extracts it with a regex and looks it up here).
# Value: (new_region_code, new_city_code, city_name, geonames_id, admin1_code)
#        - new_region_code: ISO 3166-2:CZ region part, as a string
#        - new_city_code:   city code inserted into the new GHCID
#        - city_name:       city name written to the YAML record
#        - geonames_id:     GeoNames ID of the city, as an int
#        - admin1_code:     GeoNames admin1 code of the region, as a string
# The tuple is unpacked positionally by update_yaml_file(), so field order matters.
RESOLUTIONS = {
    # Archives (A)
    'A-SAČTÚ': ('10', 'PRA', 'Prague', 3067696, '52'),
    'A-SAČÚZK': ('10', 'PRA', 'Prague', 3067696, '52'),
    'A-SAERÚ': ('63', 'JIH', 'Jihlava', 3074199, '80'),
    'A-SAÚPOHS': ('64', 'BRN', 'Brno', 3078610, '78'),
    'A-BSS': ('51', 'MAS', 'Malá Skála', 3071223, '83'),
    'A-PSJAK': ('53', 'BNO', 'Brandýs nad Orlicí', 3078836, '86'),
    'A-ZI': ('10', 'PRA', 'Prague', 3067696, '52'),  # Admin location

    # Galleries (G)
    'G-GAU': ('52', 'HOS', 'Hostinné', 3075058, '82'),
    'G-GVP': ('20', 'MLB', 'Mladá Boleslav', 3070544, '88'),

    # Libraries (L) - Many are research institutes in Prague/Brno
    'L-SÚPRO': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABE064
    'L-ÚRB': ('10', 'PRA', 'Prague', 3067696, '52'),    # ABE444
    'L-ÚSLOZ': ('10', 'PRA', 'Prague', 3067696, '52'),  # ABE215
    'L-VŠZFA': ('10', 'PRA', 'Prague', 3067696, '52'),
    'L-VŠZR': ('10', 'PRA', 'Prague', 3067696, '52'),
    'L-VÚB': ('64', 'BRN', 'Brno', 3078610, '78'),      # BOC006
    'L-VÚI': ('10', 'PRA', 'Prague', 3067696, '52'),    # ABC043
    'L-VÚP': ('10', 'PRA', 'Prague', 3067696, '52'),    # ABC066
    'L-VÚRV': ('10', 'PRA', 'Prague', 3067696, '52'),   # ABC162
    'L-VUTÚTMŘP': ('64', 'BRN', 'Brno', 3078610, '78'),
    'L-VVÚNP': ('64', 'BRN', 'Brno', 3078610, '78'),    # BOF045
    'L-ZVVZVÚV': ('10', 'PRA', 'Prague', 3067696, '52'), # ABF127

    # Museums (M)
    'M-BMOP': ('32', 'KPC', 'Klenčí pod Čerchovem', 3073644, '87'),
    'M-MD': ('10', 'PRA', 'Prague', 3067696, '52'),
    'M-MH': ('20', 'KNC', 'Kostelec nad Černými Lesy', 3073152, '88'),
    'M-MJD': ('32', 'CHU', 'Chudenice', 3077528, '87'),
    'M-MKISMDAH': ('63', 'HUM', 'Humpolec', 3074723, '80'),
    'M-MMGLK': ('20', 'POD', 'Poděbrady', 3068107, '88'),
    'M-MMM': ('42', 'MIK', 'Mikulášovice', 3070725, '89'),  # Mikcentrum!
    'M-MMSR': ('10', 'PRA', 'Prague', 3067696, '52'),
    'M-MRV': ('51', 'DES', 'Desná', 3077198, '83'),
    'M-MSČ': ('20', 'OST', 'Ostředek', 3068792, '88'),
    'M-MTZSŘ': ('52', 'DEO', 'Deštné v Orlických horách', 3077191, '82'),
    'M-MVBŽS': ('31', 'VOD', 'Vodňany', 3062642, '79'),
    'M-PDEHAM': ('53', 'HOL', 'Holice', 3075599, '86'),
    'M-PMJH': ('31', 'HUS', 'Husinec', 3074686, '79'),
    'M-PZV': ('51', 'PNJ', 'Paseky nad Jizerou', 3068552, '83'),
}


def generate_city_code(city_name: str) -> str:
    """Derive a short uppercase city code from a city name.

    Diacritics are stripped first, then the Czech connective words
    ``nad``, ``pod``, ``v``, ``u`` and ``na`` are ignored (case-insensitively).

    * No word left (empty name, or only connectives): the first three
      characters of the accent-stripped name.
    * Exactly one word left: its first three letters.
    * Two or more words left: the initials of the first three of them, so a
      two-word name yields a two-letter code.

    The result is always upper-cased and may be shorter than three characters.
    """
    import unicodedata

    # NFD splits "č" into "c" + combining caron; dropping category "Mn"
    # (non-spacing marks) leaves the bare base letters.
    decomposed = unicodedata.normalize('NFD', city_name)
    plain = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    connectives = {'nad', 'pod', 'v', 'u', 'na'}
    kept = [token for token in plain.split() if token.lower() not in connectives]

    if not kept:
        return plain[:3].upper()
    if len(kept) == 1:
        return kept[0][:3].upper()
    return ''.join(token[0].upper() for token in kept[:3])


def update_yaml_file(filepath: Path, resolution: tuple) -> tuple:
    """
    Update a YAML file with resolved region/city data and rename it.

    The record's ``ghcid.ghcid_current`` must look like
    ``CZ-XX-XXX-{TYPE}-{ABBREV}``; it is rewritten to
    ``CZ-{REGION}-{CITY}-{TYPE}-{ABBREV}``. A ``location_resolution`` entry,
    a history entry and a provenance note are added, and the ``location``
    section is filled in. The result is written to ``{new_ghcid}.yaml`` in the
    same directory and the old file is removed.

    Args:
        filepath: Path of the existing ``CZ-XX-XXX-*.yaml`` file.
        resolution: ``(region_code, city_code, city_name, geonames_id,
            admin1_code)`` as stored in ``RESOLUTIONS``.

    Returns: (old_ghcid, new_ghcid, new_filepath), or (None, None, None) when
        the file has no usable ``ghcid`` section or its current GHCID does not
        match the expected placeholder pattern. Nothing is written in that case.

    Raises:
        FileExistsError: if the target filename already exists, so another
            record is never silently overwritten.
    """
    region_code, city_code, city_name, geonames_id, admin1_code = resolution

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file parses to None and a list/scalar document has no sections;
    # treat both (and a null `ghcid:` key) as "nothing to resolve".
    ghcid_section = data.get('ghcid') if isinstance(data, dict) else None
    if not isinstance(ghcid_section, dict):
        print(f"  WARNING: No ghcid section found in: {filepath.name}")
        return None, None, None

    # Extract current GHCID (a null value is treated like a missing one)
    old_ghcid = ghcid_section.get('ghcid_current') or ''

    # Build new GHCID
    # Pattern: CZ-XX-XXX-{TYPE}-{ABBREV} -> CZ-{REGION}-{CITY}-{TYPE}-{ABBREV}
    match = re.match(r'CZ-XX-XXX-([A-Z])-(.+)$', old_ghcid)
    if not match:
        print(f"  WARNING: Could not parse GHCID: {old_ghcid}")
        return None, None, None

    inst_type, abbrev = match.groups()
    new_ghcid = f"CZ-{region_code}-{city_code}-{inst_type}-{abbrev}"

    # Refuse to clobber an existing record before doing any further work.
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"
    renamed = new_filepath != filepath
    if renamed and new_filepath.exists():
        raise FileExistsError(
            f"Target file already exists, refusing to overwrite: {new_filepath.name}"
        )

    timestamp = datetime.now(timezone.utc).isoformat()
    region_name = get_region_name(region_code)

    # Update ghcid section
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['location_resolution'] = {
        'method': 'GEONAMES_RESEARCH',
        'country_code': 'CZ',
        'region_code': region_code,
        'region_name': region_name,
        'city_code': city_code,
        'city_name': city_name,
        'geonames_id': geonames_id,
        'admin1_code': admin1_code,
        'resolution_timestamp': timestamp,
        'research_date': '2025-12-07',
        'research_method': 'GeoNames database + web search verification'
    }

    # Add history entry (a missing or null list is created on demand)
    history = ghcid_section.get('ghcid_history')
    if history is None:
        history = ghcid_section['ghcid_history'] = []
    history.append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'Region resolved via GeoNames research: XX→{region_code}, city: {city_name} (GeoNames ID: {geonames_id})'
    })

    # Update provenance notes
    provenance = data.get('provenance')
    if provenance is None:
        provenance = data['provenance'] = {}
    notes = provenance.get('notes')
    if notes is None:
        notes = provenance['notes'] = []
    notes.append(
        f'Region resolved {timestamp[:10]}: XX→CZ-{region_code} ({city_name}) via GeoNames research'
    )

    # Update location, creating the section when it is missing or null
    location = data.get('location')
    if location is None:
        location = data['location'] = {}
    location['city'] = city_name
    location['country'] = 'CZ'
    location['region'] = region_name
    location['geonames_id'] = geonames_id

    # Serialize first so a dump error cannot leave a truncated file on disk.
    serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
    with open(new_filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    # Remove old file if different
    if renamed:
        filepath.unlink()

    return old_ghcid, new_ghcid, new_filepath


def get_region_name(region_code: str) -> str:
    """Translate an ISO 3166-2:CZ region code into its English region name.

    ``region_code`` is the part after ``CZ-`` as a string (for example
    ``'10'``). Codes missing from the table yield ``'Unknown'``.
    """
    names_by_code = dict([
        ('10', 'Prague'),
        ('20', 'Central Bohemian'),
        ('31', 'South Bohemian'),
        ('32', 'Plzeň'),
        ('41', 'Karlovy Vary'),
        ('42', 'Ústí nad Labem'),
        ('51', 'Liberec'),
        ('52', 'Hradec Králové'),
        ('53', 'Pardubice'),
        ('63', 'Vysočina'),
        ('64', 'South Moravian'),
        ('71', 'Olomouc'),
        ('72', 'Zlín'),
        ('80', 'Moravian-Silesian'),
    ])
    if region_code in names_by_code:
        return names_by_code[region_code]
    return 'Unknown'


def main(custodian_dir=None):
    """Resolve every ``CZ-XX-XXX-*.yaml`` file in the custodian directory.

    For each matching file, the ``{TYPE}-{ABBREV}`` suffix of its stem is
    looked up in ``RESOLUTIONS`` and the file is rewritten and renamed via
    ``update_yaml_file``. Files without a resolution, and files whose update
    fails, are reported and counted as failed. A summary and the list of
    remaining ``CZ-XX-*`` files are printed at the end.

    Args:
        custodian_dir: Directory holding the custodian YAML files, as a
            ``str`` or ``Path``. Defaults to the original author's local
            checkout path so that calling ``main()`` behaves as before.
    """
    if custodian_dir is None:
        custodian_dir = '/Users/kempersc/apps/glam/data/custodian'
    custodian_dir = Path(custodian_dir)

    # Say so explicitly instead of reporting a misleading "Found 0 files".
    if not custodian_dir.is_dir():
        print(f"ERROR: Custodian directory not found: {custodian_dir}")
        return

    # Find all CZ-XX-XXX files (sorted so the run order is deterministic)
    xx_files = sorted(custodian_dir.glob('CZ-XX-XXX-*.yaml'))
    print(f"Found {len(xx_files)} CZ-XX-XXX files to resolve")

    resolved = 0
    failed = 0

    for filepath in xx_files:
        filename = filepath.stem
        # Extract suffix (e.g., "A-SAČTÚ" from "CZ-XX-XXX-A-SAČTÚ")
        suffix_match = re.match(r'CZ-XX-XXX-(.+)$', filename)
        if not suffix_match:
            print(f"  SKIP: Could not parse filename: {filename}")
            failed += 1
            continue

        suffix = suffix_match.group(1)

        if suffix not in RESOLUTIONS:
            print(f"  SKIP: No resolution for: {suffix}")
            failed += 1
            continue

        resolution = RESOLUTIONS[suffix]
        # Top-level boundary: one bad file must not abort the whole batch,
        # so any error is reported and counted rather than propagated.
        try:
            old_ghcid, new_ghcid, new_filepath = update_yaml_file(filepath, resolution)
            if old_ghcid and new_ghcid:
                print(f"  ✓ {old_ghcid} → {new_ghcid}")
                resolved += 1
            else:
                print(f"  ✗ Failed to update: {filepath.name}")
                failed += 1
        except Exception as e:
            print(f"  ✗ Error processing {filepath.name}: {e}")
            failed += 1

    print(f"\n{'='*60}")
    print(f"SUMMARY: Resolved {resolved}/{len(xx_files)} files")
    if failed:
        print(f"         Failed: {failed}")

    # Verify no CZ-XX files remain (broader pattern than the one processed above)
    remaining = list(custodian_dir.glob('CZ-XX-*.yaml'))
    print(f"\nRemaining CZ-XX files: {len(remaining)}")
    for leftover in remaining:
        print(f"  - {leftover.name}")


if __name__ == '__main__':
    main()