# glam/scripts/fix_czech_region_codes.py

#!/usr/bin/env python3
"""
Fix Czech region codes in custodian YAML files.

Problem:
- Some files use non-ISO codes:
  - Letter codes (JI, HL, ST, VY, PL, OL, PA, KR, MO, US, ZL, LI, KA, PR, JM, JC)
  - 3-digit district codes (317, 422, 521, 802, etc.)
  - Invalid 2-digit codes (78, 79, 81-90 etc.)

ISO 3166-2:CZ uses 2-digit NUTS-based codes for 14 regions:
  10 - Prague (Hlavní město Praha)
  20 - Central Bohemian (Středočeský)
  31 - South Bohemian (Jihočeský)
  32 - Plzeň (Plzeňský)
  41 - Karlovy Vary (Karlovarský)
  42 - Ústí nad Labem (Ústecký)
  51 - Liberec (Liberecký)
  52 - Hradec Králové (Královéhradecký)
  53 - Pardubice (Pardubický)
  63 - Vysočina
  64 - South Moravian (Jihomoravský)
  71 - Olomouc (Olomoucký)
  72 - Zlín (Zlínský)
  80 - Moravian-Silesian (Moravskoslezský)
"""

import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone

# ISO 3166-2:CZ region code -> display name, written as "English (Czech)".
# Used when composing the human-readable correction reason.
REGION_NAMES = {
    "10": "Prague (Hlavní město Praha)",
    "20": "Central Bohemian (Středočeský)",
    "31": "South Bohemian (Jihočeský)",
    "32": "Plzeň (Plzeňský)",
    "41": "Karlovy Vary (Karlovarský)",
    "42": "Ústí nad Labem (Ústecký)",
    "51": "Liberec (Liberecký)",
    "52": "Hradec Králové (Královéhradecký)",
    "53": "Pardubice (Pardubický)",
    "63": "Vysočina",
    "64": "South Moravian (Jihomoravský)",
    "71": "Olomouc (Olomoucký)",
    "72": "Zlín (Zlínský)",
    "80": "Moravian-Silesian (Moravskoslezský)",
}

# Codes that are already valid: exactly the 14 region codes named above.
VALID_ISO_CODES = set(REGION_NAMES)

# Two-letter legacy codes -> ISO 3166-2:CZ region code.
# Some regions have two letter spellings (HL/PR, JI/JM).
LETTER_TO_ISO = {
    "HL": "10",  # Hlavní město Praha (Prague)
    "PR": "10",  # Prague, alternate spelling
    "ST": "20",  # Středočeský (Central Bohemian)
    "JC": "31",  # Jihočeský (South Bohemian)
    "PL": "32",  # Plzeňský (Plzeň)
    "KA": "41",  # Karlovarský (Karlovy Vary)
    "US": "42",  # Ústecký (Ústí nad Labem)
    "LI": "51",  # Liberecký (Liberec)
    "KR": "52",  # Královéhradecký (Hradec Králové)
    "PA": "53",  # Pardubický (Pardubice)
    "VY": "63",  # Vysočina
    "JI": "64",  # Jihomoravský (South Moravian) - this is the correct mapping
    "JM": "64",  # Jihomoravský, alternate spelling
    "OL": "71",  # Olomoucký (Olomouc)
    "ZL": "72",  # Zlínský (Zlín)
    "MO": "80",  # Moravskoslezský (Moravian-Silesian)
}

# Three-digit district codes -> region code.
# The region is the first two digits; the third digit identifies the district.
DISTRICT_3DIGIT_TO_REGION = {
    district: district[:2]
    for district in (
        "317",  # South Bohemian district
        "422",  # Ústí nad Labem district
        "521",  # Hradec Králové district
        "802",  # Moravian-Silesian district
    )
}

# Non-ISO two-digit codes that all collapse onto region 80.
# These look like district sub-codes; the per-code notes are best guesses
# carried over from the initial investigation, not verified facts.
DISTRICT_TO_REGION = dict.fromkeys(
    (
        "78",  # possibly an old code; likely Moravian-Silesian area
        "79",  # possibly an old code; likely Moravian-Silesian area
        "81",  # Moravian-Silesian district
        "82",  # Moravian-Silesian district (e.g., Bruntál)
        "83",  # Moravian-Silesian district
        "84",  # Moravian-Silesian district
        "85",  # Moravian-Silesian district (e.g., Karviná)
        "86",  # Moravian-Silesian district (e.g., Nový Jičín)
        "87",  # Moravian-Silesian district (e.g., Opava)
        "88",  # Moravian-Silesian district (e.g., Ostrava-město)
        "89",  # Moravian-Silesian district (e.g., Frýdek-Místek)
        "90",  # Moravian-Silesian district
    ),
    "80",
)


def get_correct_region_code(old_code: str) -> tuple:
    """
    Resolve a region code found in a filename to its ISO 3166-2:CZ code.

    The code is tried against the known tables in a fixed order: valid ISO
    codes first, then letter codes, 3-digit district codes, and finally the
    non-ISO 2-digit codes.

    Returns: (correct_code, correction_type) where correction_type is one of
    'already_correct', 'letter_code', 'district_3digit', 'district_2digit'
    or 'unknown'. For 'unknown' the code is None.
    """
    # Nothing to translate when the code is already a valid region code.
    if old_code in VALID_ISO_CODES:
        return old_code, 'already_correct'

    # Ordered (label, table) pairs; the first table holding the code wins.
    translation_tables = (
        ('letter_code', LETTER_TO_ISO),
        ('district_3digit', DISTRICT_3DIGIT_TO_REGION),
        ('district_2digit', DISTRICT_TO_REGION),
    )
    for kind, table in translation_tables:
        resolved = table.get(old_code)
        if resolved is not None:
            return resolved, kind

    return None, 'unknown'


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Return ``content`` rewritten to use the corrected GHCID and region code.

    The YAML is edited as text (not parsed) so untouched parts of the file keep
    their exact formatting. Four values are rewritten: ``ghcid_current``,
    any ``identifier_value`` equal to the old GHCID, and the ``region_code``
    that follows ``location_resolution:`` and ``location:``. A pair of
    ``ghcid_history`` entries (new GHCID opened, old GHCID closed) is then
    prepended to an existing history list, or a new ``ghcid_history`` key is
    added right after ``ghcid_current``.

    Args:
        content: Full text of the custodian YAML file.
        old_ghcid: GHCID currently used in the file.
        new_ghcid: GHCID with the corrected region code.
        old_region: Region code to replace (as it appears in the file).
        new_region: ISO 3166-2:CZ region code to write.

    Returns:
        The modified YAML text. Values that are not found are left untouched.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    stamped = f'"{timestamp}"'
    region_name = REGION_NAMES.get(new_region, 'Unknown')

    reason = f"Corrected region code from CZ-{old_region} to CZ-{new_region} ({region_name}) per ISO 3166-2:CZ"

    # GHCID under ghcid_current and in the identifiers list.
    content = _replace_token(r'ghcid_current:\s*', old_ghcid, new_ghcid, content)
    content = _replace_token(r'identifier_value:\s*', old_ghcid, new_ghcid, content)

    # region_code following location_resolution: and location:. DOTALL lets
    # the lazy ``.*?`` span the lines between the section key and region_code.
    # NOTE(review): the new code is written unquoted, so a YAML loader will
    # presumably read e.g. ``region_code: 64`` as an integer - confirm that
    # downstream consumers accept that.
    content = _replace_token(r'location_resolution:.*?region_code:\s*',
                             old_region, new_region, content, flags=re.DOTALL)
    content = _replace_token(r'location:.*?region_code:\s*',
                             old_region, new_region, content, flags=re.DOTALL)

    def history_entries(old_reason: str) -> list:
        # New GHCID is valid from now on; the old one is closed at the same instant.
        return [
            (new_ghcid, stamped, 'null', reason),
            (old_ghcid, 'null', stamped, old_reason),
        ]

    if 'ghcid_history:' in content:
        def prepend_entries(match):
            # Reuse the indentation of the line that follows the key so the
            # new items line up with the existing list items.
            indent = match.group(2)
            rendered = _render_history_entries(
                indent, history_entries('Previous GHCID with incorrect region code'))
            return match.group(1) + rendered + '\n' + indent

        # count=1: only the first history list is extended.
        # If the key has an inline value (e.g. ``ghcid_history: []``) the
        # pattern does not match and no entries are added.
        content = re.sub(r'(ghcid_history:\s*\n)([ \t]*)', prepend_entries,
                         content, count=1)
    else:
        current = re.search(
            r'ghcid_current:\s*' + re.escape(new_ghcid) + _GHCID_TOKEN_END, content)
        if current:
            # Indent the new key like the ghcid_current line; fall back to two
            # spaces when ghcid_current does not start its line.
            line_start = content.rfind('\n', 0, current.start()) + 1
            leading = content[line_start:current.start()]
            indent = leading if not leading.strip() else '  '
            rendered = _render_history_entries(
                indent, history_entries('Original GHCID with incorrect region code'))
            insertion = '\n' + indent + 'ghcid_history:\n' + rendered
            content = content[:current.end()] + insertion + content[current.end():]

    return content


# A value only counts as a match when it is not the prefix of a longer token,
# e.g. region "78" must not match inside "781".
_GHCID_TOKEN_END = r'(?![\w-])'


def _replace_token(prefix_pattern: str, old_value: str, new_value: str,
                   text: str, flags: int = 0) -> str:
    """Replace ``old_value`` with ``new_value`` wherever it follows ``prefix_pattern``."""
    pattern = '(' + prefix_pattern + ')' + re.escape(old_value) + _GHCID_TOKEN_END
    # A callable replacement inserts new_value literally; a string replacement
    # would be parsed as a template and choke on backslashes.
    return re.sub(pattern, lambda m: m.group(1) + new_value, text, flags=flags)


def _render_history_entries(indent: str, entries: list) -> str:
    """Render (ghcid, valid_from, valid_to, reason) tuples as YAML list items."""
    lines = []
    for ghcid, valid_from, valid_to, entry_reason in entries:
        lines.append(f'{indent}- ghcid: {ghcid}')
        lines.append(f'{indent}  valid_from: {valid_from}')
        lines.append(f'{indent}  valid_to: {valid_to}')
        lines.append(f'{indent}  reason: "{entry_reason}"')
    return '\n'.join(lines)


def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Correct the region code of one custodian YAML file.

    The GHCID is taken from the filename (``CZ-<region>-<city>-<type>-<abbrev>.yaml``).
    When the region code needs correcting, the file content is rewritten, saved
    under the new GHCID filename, and the old file is removed.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When True, nothing is read or written; the result only
            describes what would happen.

    Returns:
        A dict whose ``status`` is one of:
        ``'skipped'`` (filename does not match, or code already correct),
        ``'error'`` (region code has no known mapping),
        ``'collision'`` (target filename already exists),
        ``'would_fix'`` (dry run) or ``'fixed'``.
    """
    filename = filepath.name

    # Extract current GHCID from filename (e.g., CZ-JI-BRN-M-MBM.yaml).
    # fullmatch so that names with trailing text (e.g. "...yaml.bak") are skipped.
    match = re.fullmatch(r'CZ-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filename)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}

    old_region, city, inst_type, abbrev = match.groups()

    new_region, correction_type = get_correct_region_code(old_region)

    if correction_type == 'already_correct':
        return {'status': 'skipped', 'reason': 'already correct'}

    if correction_type == 'unknown':
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}

    old_ghcid = f"CZ-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"CZ-{new_region}-{city}-{inst_type}-{abbrev}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    # Collision check runs before the dry-run return so a dry run reports the
    # same collisions a real run would hit. It cannot foresee two source files
    # that map to the same new name, because a dry run writes nothing.
    if new_filepath.exists() and filepath != new_filepath:
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': f'Target file {new_filename} already exists'
        }

    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'old_file': filename,
            'new_file': new_filename,
            'old_region': old_region,
            'new_region': new_region
        }

    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)

    # Write the new file first and delete the old one afterwards, so a failed
    # write never leaves the record without a file.
    new_filepath.write_text(new_content, encoding='utf-8')
    if filepath != new_filepath:
        filepath.unlink()

    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_file': filename,
        'new_file': new_filename
    }


def main():
    """Command-line entry point: fix region codes in every ``CZ-*.yaml`` file.

    Options:
        --dry-run  Report what would change without touching any file.
        --dir      Directory holding the custodian YAML files.
        --limit    Stop after this many files needed action (fix, collision
                   or error); 0 means no limit. Files that are skipped do not
                   count towards the limit.

    Prints one line per file that needed action, followed by a summary and a
    tally of region codes that have no mapping yet.
    """
    import argparse
    from collections import Counter

    parser = argparse.ArgumentParser(description='Fix Czech region codes in custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without making changes')
    parser.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian', help='Directory containing custodian files')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0 = all)')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)

    # Sorted so runs are reproducible and --limit always picks the same files.
    cz_files = sorted(custodian_dir.glob('CZ-*.yaml'))
    print(f"Found {len(cz_files)} Czech files")

    results = {
        'fixed': [],
        'would_fix': [],
        'skipped': [],
        'errors': [],
        'collisions': []
    }

    processed = 0  # files that needed action; this is what --limit counts
    examined = 0   # files actually handed to process_file
    for filepath in cz_files:
        if args.limit > 0 and processed >= args.limit:
            break

        examined += 1
        result = process_file(filepath, dry_run=args.dry_run)
        status = result['status']

        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f"  {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
            processed += 1
        elif status == 'collision':
            results['collisions'].append(result)
            print(f"  COLLISION: {result['old_ghcid']} -> {result['new_ghcid']}: {result['reason']}")
            processed += 1
        elif status == 'error':
            results['errors'].append((filepath.name, result['reason']))
            print(f"  ERROR: {filepath.name} - {result['reason']}")
            processed += 1
        else:
            # Skipped files are recorded silently so the summary can tell
            # "already correct" apart from other skip reasons.
            results['skipped'].append(result)

    already_correct = sum(
        1 for skipped in results['skipped'] if skipped.get('reason') == 'already correct'
    )
    other_skipped = len(results['skipped']) - already_correct
    not_examined = len(cz_files) - examined

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f"  Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
    print(f"  Collisions: {len(results['collisions'])}")
    print(f"  Already correct: {already_correct}")
    if other_skipped:
        print(f"  Skipped (other reasons): {other_skipped}")
    if not_examined:
        print(f"  Not examined (--limit reached): {not_examined}")
    print(f"  Errors: {len(results['errors'])}")

    if results['errors']:
        print("\nUnknown region codes (need mapping):")
        # Tally by exact code; a substring test would count "7" inside "78".
        unknown_codes = Counter(
            reason.split(':', 1)[1].strip()
            for _, reason in results['errors']
            if 'unknown region code:' in reason
        )
        for code in sorted(unknown_codes):
            print(f"  {code}: {unknown_codes[code]} files")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()