glam/scripts/fix_japan_region_codes.py
2025-12-10 13:01:13 +01:00

340 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Fix Japanese region codes in custodian YAML files.
Problem:
- Some files use letter codes (TO, KA, AI, etc.) which are prefecture abbreviations
- ISO 3166-2:JP uses 2-digit numeric codes (01-47)
Mapping from letter abbreviations to ISO 3166-2:JP codes.
"""
import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
# Mapping from prefecture abbreviations to ISO 3166-2:JP codes.
# Based on Japanese prefecture order (sorted by JIS code, 01-47).
# NOTE(review): two-letter abbreviations are inherently ambiguous (e.g. 'FU'
# could plausibly mean Fukui, Fukuoka or Fukushima); each choice below encodes
# whichever prefecture the data was observed to mean — confirm against the
# actual custodian files before extending this table.
LETTER_TO_ISO = {
    'HO': '01',   # Hokkaido
    'AO': '02',   # Aomori
    'IW': '03',   # Iwate
    'MG': '04',   # Miyagi
    'AK': '05',   # Akita
    'YMG': '06',  # Yamagata (using YMG to avoid conflict)
    'FS': '07',   # Fukushima (using FS to avoid conflict with Fukui/Fukuoka)
    'FKS': '07',  # Fukushima alternate
    'IB': '08',   # Ibaraki
    'TC': '09',   # Tochigi
    'GNM': '10',  # Gunma (using GNM to be specific)
    'GU': '10',   # Gunma
    'SA': '11',   # Saitama
    'CH': '12',   # Chiba
    'TO': '13',   # Tokyo
    'TK': '13',   # Tokyo alternate
    'KA': '14',   # Kanagawa
    'NI': '15',   # Niigata
    # NOTE(review): only a literal 'TO2' region segment maps to Toyama; any
    # Toyama file coded plain 'TO' will silently become Tokyo (13) — verify.
    'TO2': '16',  # Toyama (conflict with Tokyo - rare)
    'IS': '17',   # Ishikawa
    'FU': '18',   # Fukui
    'YA': '19',   # Yamanashi
    'NA': '20',   # Nagano
    'GI': '21',   # Gifu
    'SZO': '22',  # Shizuoka (using SZO to be specific)
    'AI': '23',   # Aichi
    'MIE': '24',  # Mie
    'SH': '25',   # Shiga
    'KY': '26',   # Kyoto
    'OS': '27',   # Osaka
    'HY': '28',   # Hyogo
    'NR': '29',   # Nara
    'WA': '30',   # Wakayama
    'TT': '31',   # Tottori
    'SM': '32',   # Shimane
    'OK': '33',   # Okayama
    'HI': '34',   # Hiroshima
    'YMC': '35',  # Yamaguchi
    'TKS': '36',  # Tokushima
    'KG': '37',   # Kagawa
    'EH': '38',   # Ehime
    'KC': '39',   # Kochi
    'FO': '40',   # Fukuoka (using FO to avoid conflict)
    'SG': '41',   # Saga
    'NS': '42',   # Nagasaki
    'KU': '43',   # Kumamoto
    'OI': '44',   # Oita
    'MI': '45',   # Miyazaki
    'KS': '46',   # Kagoshima
    'OO': '47',   # Okinawa (using OO to be specific)
}
# Additional mappings that might appear in the data (longer, three-letter
# variants plus a few two-letter forms actually observed in the files).
ADDITIONAL_MAPPINGS = {
    # Common variations
    'TOK': '13',  # Tokyo
    'KAN': '14',  # Kanagawa
    'KN': '14',   # Kanagawa (another abbreviation found in data)
    'OSA': '27',  # Osaka
    'KYO': '26',  # Kyoto
    'HOK': '01',  # Hokkaido
    'SAI': '11',  # Saitama
    'CHI': '12',  # Chiba
    # NOTE(review): 'NAG' is assumed to mean Nagano here; it could equally
    # abbreviate Nagasaki (42) — confirm against the source data.
    'NAG': '20',  # Nagano (could also be Nagasaki - 42)
    'HIR': '34',  # Hiroshima
    'OKI': '47',  # Okinawa
    'FUK': '40',  # Fukuoka (most common)
    'KO': '39',   # Kochi (高知県) - found in data as Kochi Ken
}
# Merge the variants into the main table so lookups only ever consult
# LETTER_TO_ISO (ADDITIONAL_MAPPINGS entries win on key collisions).
LETTER_TO_ISO.update(ADDITIONAL_MAPPINGS)
# Prefecture names for documentation, keyed by the zero-padded ISO 3166-2:JP
# numeric code ('01'..'47'). The tuple below lists the 47 prefectures in JIS
# code order, so the 1-based position of a name IS its numeric code.
_PREFECTURES_IN_JIS_ORDER = (
    'Hokkaido', 'Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata',
    'Fukushima', 'Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba',
    'Tokyo', 'Kanagawa', 'Niigata', 'Toyama', 'Ishikawa', 'Fukui',
    'Yamanashi', 'Nagano', 'Gifu', 'Shizuoka', 'Aichi', 'Mie',
    'Shiga', 'Kyoto', 'Osaka', 'Hyogo', 'Nara', 'Wakayama',
    'Tottori', 'Shimane', 'Okayama', 'Hiroshima', 'Yamaguchi', 'Tokushima',
    'Kagawa', 'Ehime', 'Kochi', 'Fukuoka', 'Saga', 'Nagasaki',
    'Kumamoto', 'Oita', 'Miyazaki', 'Kagoshima', 'Okinawa',
)
PREFECTURE_NAMES = {
    f'{jis_code:02d}': name
    for jis_code, name in enumerate(_PREFECTURES_IN_JIS_ORDER, start=1)
}
def get_correct_region_code(old_code: str) -> tuple:
    """Translate a region code found in a filename to its ISO 3166-2:JP form.

    Returns:
        (correct_code, correction_type) where correction_type is one of
        'already_correct' (code kept as-is), 'letter_code' (abbreviation
        translated via LETTER_TO_ISO) or 'unknown' (correct_code is None).
    """
    # Two ASCII digits in the valid prefecture range 01-47: nothing to fix.
    if re.fullmatch(r'[0-4][0-9]', old_code) and 1 <= int(old_code) <= 47:
        return old_code, 'already_correct'
    # Known letter abbreviation: translate to the numeric code.
    mapped = LETTER_TO_ISO.get(old_code)
    if mapped is not None:
        return mapped, 'letter_code'
    return None, 'unknown'
def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Fix the YAML content with new GHCID and region codes.

    Operates on the YAML as plain text via regex substitutions (no YAML
    parser): updates ghcid_current, identifier_value and the region_code
    fields under location_resolution/location, then prepends a correction
    entry to ghcid_history (creating the section if it does not exist).

    Args:
        content: full text of one custodian YAML file.
        old_ghcid: complete GHCID currently in the file (e.g. JP-TO-ADA-L-AL).
        new_ghcid: corrected GHCID (e.g. JP-13-ADA-L-AL).
        old_region: old region-code component alone (e.g. 'TO').
        new_region: corrected ISO 3166-2:JP numeric code (e.g. '13').

    Returns:
        The rewritten YAML text.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    prefecture_name = PREFECTURE_NAMES.get(new_region, 'Unknown')
    # NOTE(review): this string is embedded in double-quoted YAML scalars
    # below; it contains no quote characters today, but a future '"' in it
    # would produce invalid YAML.
    reason = f"Corrected region code from JP-{old_region} (abbreviation) to JP-{new_region} ({prefecture_name}) per ISO 3166-2:JP"
    # Replace GHCID in ghcid_current
    content = re.sub(
        r'(ghcid_current:\s*)' + re.escape(old_ghcid),
        r'\g<1>' + new_ghcid,
        content
    )
    # Replace GHCID in identifiers
    content = re.sub(
        r'(identifier_value:\s*)' + re.escape(old_ghcid),
        r'\g<1>' + new_ghcid,
        content
    )
    # Replace region_code in location_resolution (be careful with patterns).
    # DOTALL + non-greedy '.*?' ties each match to the nearest following
    # region_code field after a 'location_resolution:' key.
    content = re.sub(
        r'(location_resolution:.*?region_code:\s*)' + re.escape(old_region),
        r'\g<1>' + new_region,
        content,
        flags=re.DOTALL
    )
    # Replace region_code in location section ('location:' cannot match the
    # 'location_resolution' key because of the literal colon).
    content = re.sub(
        r'(location:.*?region_code:\s*)' + re.escape(old_region),
        r'\g<1>' + new_region,
        content,
        flags=re.DOTALL
    )
    # Check if ghcid_history already exists
    if 'ghcid_history:' in content:
        # Insert new entry at the beginning of existing history. The captured
        # trailing '\s*' supplies the indentation for the first inserted line
        # only; the remaining lines come verbatim from the literal below.
        # NOTE(review): the continuation lines of this f-string carry no
        # leading indentation as received — the original indentation appears
        # to have been lost in transit. Confirm the emitted entries nest
        # correctly under ghcid_history in a real file before trusting this.
        new_history_items = f'''- ghcid: {new_ghcid}
valid_from: "{timestamp}"
valid_to: null
reason: "{reason}"
- ghcid: {old_ghcid}
valid_from: null
valid_to: "{timestamp}"
reason: "Previous GHCID with incorrect region code"
'''
        content = re.sub(
            r'(ghcid_history:\s*\n\s*)',
            r'\g<1>' + new_history_items,
            content
        )
    else:
        # Add ghcid_history after ghcid_current (which was already rewritten
        # to new_ghcid above, hence the match on new_ghcid here).
        # NOTE(review): same indentation caveat as above applies.
        history_entry = f'''
ghcid_history:
- ghcid: {new_ghcid}
valid_from: "{timestamp}"
valid_to: null
reason: "{reason}"
- ghcid: {old_ghcid}
valid_from: null
valid_to: "{timestamp}"
reason: "Original GHCID with incorrect region code"'''
        content = re.sub(
            r'(ghcid_current:\s*' + re.escape(new_ghcid) + r')',
            r'\g<1>' + history_entry,
            content
        )
    return content
def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a single custodian YAML file and return a result dict.

    The filename is expected to encode the GHCID as
    JP-<REGION>-<CITY>-<TYPE>-<ABBREV>.yaml. When REGION is a letter
    abbreviation it is translated to its ISO 3166-2:JP numeric code, the
    file content is rewritten and the file is renamed accordingly.

    Args:
        filepath: path to the candidate YAML file.
        dry_run: when True, report what would happen without touching disk.

    Returns:
        A dict whose 'status' key is one of 'skipped', 'error', 'collision',
        'would_fix' (dry run only) or 'fixed', plus context fields.
    """
    filename = filepath.name
    # Extract current GHCID from filename (e.g. JP-TO-ADA-L-AL.yaml).
    # fullmatch anchors both ends, so names with trailing junk (which an
    # unanchored re.match would have accepted) are rejected.
    match = re.fullmatch(r'JP-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filename)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}
    old_region, city, inst_type, abbrev = match.groups()
    # Get correct region code
    new_region, correction_type = get_correct_region_code(old_region)
    if correction_type == 'already_correct':
        return {'status': 'skipped', 'reason': 'already correct'}
    if correction_type == 'unknown':
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}
    old_ghcid = f"JP-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"JP-{new_region}-{city}-{inst_type}-{abbrev}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename
    # Check collisions BEFORE the dry-run exit so that a dry run predicts
    # exactly what a real run would refuse to do (previously dry runs
    # reported 'would_fix' even when the target file already existed).
    if new_filepath.exists() and filepath != new_filepath:
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': f'Target file {new_filename} already exists'
        }
    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'old_file': filename,
            'new_file': new_filename,
            'old_region': old_region,
            'new_region': new_region
        }
    # Read, rewrite, and write under the corrected name.
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)
    new_filepath.write_text(new_content, encoding='utf-8')
    # Remove old file if different name
    if filepath != new_filepath:
        filepath.unlink()
    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_file': filename,
        'new_file': new_filename
    }
def main():
    """CLI entry point.

    Scans --dir for JP-*.yaml custodian files, fixes letter-based region
    codes (rewriting content and renaming files), and prints a summary.
    Use --dry-run to preview changes and --limit to cap how many files
    are acted upon.
    """
    import argparse
    from collections import Counter

    parser = argparse.ArgumentParser(description='Fix Japanese region codes in custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without making changes')
    parser.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian', help='Directory containing custodian files')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0 = all)')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    # Find all JP-*.yaml files with letter codes
    jp_files = list(custodian_dir.glob('JP-*.yaml'))
    print(f"Found {len(jp_files)} Japanese files")

    results = {
        'fixed': [],
        'would_fix': [],
        'skipped': [],
        'errors': [],
        'collisions': []
    }
    processed = 0
    for filepath in sorted(jp_files):
        # --limit counts only files that needed action (fix/collision/error).
        if args.limit > 0 and processed >= args.limit:
            break
        result = process_file(filepath, dry_run=args.dry_run)
        status = result['status']
        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f" {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
            processed += 1
        elif status == 'collision':
            results['collisions'].append(result)
            print(f" COLLISION: {result['old_ghcid']} -> {result['new_ghcid']}: {result['reason']}")
            processed += 1
        elif status == 'error':
            results['errors'].append((filepath.name, result['reason']))
            print(f" ERROR: {filepath.name} - {result['reason']}")
            processed += 1
        else:
            # Record skips (already-correct or unparsable filenames) quietly
            # instead of discarding them; previously 'skipped' was a dead key.
            results['skipped'].append((filepath.name, result.get('reason', '')))

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
    print(f" Collisions: {len(results['collisions'])}")
    # NOTE: when --limit stops the loop early, unvisited files are also
    # counted here, so this is an upper bound on truly-correct files.
    print(f" Already correct: {len(jp_files) - processed}")
    print(f" Errors: {len(results['errors'])}")

    if results['errors']:
        print("\nUnknown region codes (need mapping):")
        # Count by the exact extracted code. The previous substring test
        # (`code in reason`) let e.g. 'TO' also absorb 'TO2'/'TOK' errors
        # and overcount.
        counts = Counter(
            reason.split(':', 1)[1].strip()
            for _, reason in results['errors']
            if 'unknown region code:' in reason
        )
        for code in sorted(counts):
            print(f" {code}: {counts[code]} files")


if __name__ == '__main__':
    main()