#!/usr/bin/env python3
"""Fix Japanese region codes in custodian YAML files.

Problem:
- Some files use letter codes (TO, KA, AI, etc.) which are prefecture
  abbreviations.
- ISO 3166-2:JP uses 2-digit numeric codes (01-47).

This script maps letter abbreviations to ISO 3166-2:JP codes, renames the
affected ``JP-*.yaml`` files, rewrites the GHCIDs and region codes inside
them, and records the change in a ``ghcid_history`` section.
"""

import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone

# Mapping from prefecture abbreviations to ISO 3166-2:JP codes.
# Based on Japanese prefecture order (sorted by JIS code).
LETTER_TO_ISO = {
    'HO': '01',   # Hokkaido
    'AO': '02',   # Aomori
    'IW': '03',   # Iwate
    'MG': '04',   # Miyagi
    'AK': '05',   # Akita
    'YMG': '06',  # Yamagata (using YMG to avoid conflict)
    'FS': '07',   # Fukushima (using FS to avoid conflict with Fukui/Fukuoka)
    'FKS': '07',  # Fukushima alternate
    'IB': '08',   # Ibaraki
    'TC': '09',   # Tochigi
    'GNM': '10',  # Gunma (using GNM to be specific)
    'GU': '10',   # Gunma
    'SA': '11',   # Saitama
    'CH': '12',   # Chiba
    'TO': '13',   # Tokyo
    'TK': '13',   # Tokyo alternate
    'KA': '14',   # Kanagawa
    'NI': '15',   # Niigata
    'TO2': '16',  # Toyama (conflict with Tokyo - rare)
    'IS': '17',   # Ishikawa
    'FU': '18',   # Fukui
    'YA': '19',   # Yamanashi
    'NA': '20',   # Nagano
    'GI': '21',   # Gifu
    'SZO': '22',  # Shizuoka (using SZO to be specific)
    'AI': '23',   # Aichi
    'MIE': '24',  # Mie
    'SH': '25',   # Shiga
    'KY': '26',   # Kyoto
    'OS': '27',   # Osaka
    'HY': '28',   # Hyogo
    'NR': '29',   # Nara
    'WA': '30',   # Wakayama
    'TT': '31',   # Tottori
    'SM': '32',   # Shimane
    'OK': '33',   # Okayama
    'HI': '34',   # Hiroshima
    'YMC': '35',  # Yamaguchi
    'TKS': '36',  # Tokushima
    'KG': '37',   # Kagawa
    'EH': '38',   # Ehime
    'KC': '39',   # Kochi
    'FO': '40',   # Fukuoka (using FO to avoid conflict)
    'SG': '41',   # Saga
    'NS': '42',   # Nagasaki
    'KU': '43',   # Kumamoto
    'OI': '44',   # Oita
    'MI': '45',   # Miyazaki
    'KS': '46',   # Kagoshima
    'OO': '47',   # Okinawa (using OO to be specific)
}

# Additional mappings that might appear in the data.
ADDITIONAL_MAPPINGS = {
    # Common variations
    'TOK': '13',  # Tokyo
    'KAN': '14',  # Kanagawa
    'KN': '14',   # Kanagawa (another abbreviation found in data)
    'OSA': '27',  # Osaka
    'KYO': '26',  # Kyoto
    'HOK': '01',  # Hokkaido
    'SAI': '11',  # Saitama
    'CHI': '12',  # Chiba
    'NAG': '20',  # Nagano (could also be Nagasaki - 42)
    'HIR': '34',  # Hiroshima
    'OKI': '47',  # Okinawa
    'FUK': '40',  # Fukuoka (most common)
    'KO': '39',   # Kochi (高知県) - found in data as Kochi Ken
}

# Merge mappings (additional entries win on conflict).
LETTER_TO_ISO.update(ADDITIONAL_MAPPINGS)

# Prefecture names for documentation.
PREFECTURE_NAMES = {
    '01': 'Hokkaido', '02': 'Aomori', '03': 'Iwate', '04': 'Miyagi',
    '05': 'Akita', '06': 'Yamagata', '07': 'Fukushima', '08': 'Ibaraki',
    '09': 'Tochigi', '10': 'Gunma', '11': 'Saitama', '12': 'Chiba',
    '13': 'Tokyo', '14': 'Kanagawa', '15': 'Niigata', '16': 'Toyama',
    '17': 'Ishikawa', '18': 'Fukui', '19': 'Yamanashi', '20': 'Nagano',
    '21': 'Gifu', '22': 'Shizuoka', '23': 'Aichi', '24': 'Mie',
    '25': 'Shiga', '26': 'Kyoto', '27': 'Osaka', '28': 'Hyogo',
    '29': 'Nara', '30': 'Wakayama', '31': 'Tottori', '32': 'Shimane',
    '33': 'Okayama', '34': 'Hiroshima', '35': 'Yamaguchi', '36': 'Tokushima',
    '37': 'Kagawa', '38': 'Ehime', '39': 'Kochi', '40': 'Fukuoka',
    '41': 'Saga', '42': 'Nagasaki', '43': 'Kumamoto', '44': 'Oita',
    '45': 'Miyazaki', '46': 'Kagoshima', '47': 'Okinawa',
}


def get_correct_region_code(old_code: str) -> tuple:
    """Convert an old region code to the correct ISO 3166-2 code.

    Args:
        old_code: region part of a GHCID, either a 2-digit numeric code
            or a letter abbreviation.

    Returns:
        (correct_code, correction_type) where correction_type is one of
        'already_correct', 'letter_code' or 'unknown'; correct_code is
        None when the code is unknown.
    """
    # Already correct: a 2-digit numeric code in the valid range 01-47.
    if re.match(r'^[0-4][0-9]$', old_code) and 1 <= int(old_code) <= 47:
        return old_code, 'already_correct'
    # Known letter abbreviation.
    if old_code in LETTER_TO_ISO:
        return LETTER_TO_ISO[old_code], 'letter_code'
    return None, 'unknown'


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Rewrite YAML *content* with the corrected GHCID and region code.

    Replaces the GHCID in ``ghcid_current`` and ``identifier_value``,
    the region code in the ``location_resolution`` and ``location``
    sections, and records the change in ``ghcid_history`` (creating the
    section if it does not exist yet).
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    prefecture_name = PREFECTURE_NAMES.get(new_region, 'Unknown')
    reason = (
        f"Corrected region code from JP-{old_region} (abbreviation) "
        f"to JP-{new_region} ({prefecture_name}) per ISO 3166-2:JP"
    )

    # Replace GHCID in ghcid_current.
    content = re.sub(
        r'(ghcid_current:\s*)' + re.escape(old_ghcid),
        r'\g<1>' + new_ghcid,
        content
    )

    # Replace GHCID in identifiers.
    content = re.sub(
        r'(identifier_value:\s*)' + re.escape(old_ghcid),
        r'\g<1>' + new_ghcid,
        content
    )

    # Replace region_code in location_resolution (non-greedy DOTALL match
    # targets the first region_code after the section header).
    content = re.sub(
        r'(location_resolution:.*?region_code:\s*)' + re.escape(old_region),
        r'\g<1>' + new_region,
        content,
        flags=re.DOTALL
    )

    # Replace region_code in the location section.
    content = re.sub(
        r'(location:.*?region_code:\s*)' + re.escape(old_region),
        r'\g<1>' + new_region,
        content,
        flags=re.DOTALL
    )

    # NOTE(review): the inserted YAML below assumes list entries are at the
    # indentation captured from the file / at column 0 with two-space item
    # bodies — verify against the actual custodian file layout.
    if 'ghcid_history:' in content:
        # Insert the new entries at the beginning of the existing history.
        new_history_items = (
            f'- ghcid: {new_ghcid}\n'
            f'  valid_from: "{timestamp}"\n'
            f'  valid_to: null\n'
            f'  reason: "{reason}"\n'
            f'- ghcid: {old_ghcid}\n'
            f'  valid_from: null\n'
            f'  valid_to: "{timestamp}"\n'
            f'  reason: "Previous GHCID with incorrect region code"\n'
        )
        # count=1: only the first (and normally only) history header.
        content = re.sub(
            r'(ghcid_history:\s*\n\s*)',
            r'\g<1>' + new_history_items,
            content,
            count=1
        )
    else:
        # Add a ghcid_history section right after ghcid_current.
        history_entry = (
            f'\nghcid_history:\n'
            f'- ghcid: {new_ghcid}\n'
            f'  valid_from: "{timestamp}"\n'
            f'  valid_to: null\n'
            f'  reason: "{reason}"\n'
            f'- ghcid: {old_ghcid}\n'
            f'  valid_from: null\n'
            f'  valid_to: "{timestamp}"\n'
            f'  reason: "Original GHCID with incorrect region code"'
        )
        content = re.sub(
            r'(ghcid_current:\s*' + re.escape(new_ghcid) + r')',
            r'\g<1>' + history_entry,
            content,
            count=1
        )

    return content


def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a single YAML file and return a result dict.

    The returned dict always carries a 'status' key, one of:
    'skipped', 'error', 'collision', 'would_fix', 'fixed'.
    """
    filename = filepath.name

    # Extract the current GHCID parts from the filename
    # (e.g. JP-TO-ADA-L-AL.yaml -> region TO, city ADA, type L, abbrev AL).
    match = re.match(r'JP-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filename)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}

    old_region = match.group(1)
    city = match.group(2)
    inst_type = match.group(3)
    abbrev = match.group(4)

    # Get the correct region code.
    new_region, correction_type = get_correct_region_code(old_region)
    if correction_type == 'already_correct':
        return {'status': 'skipped', 'reason': 'already correct'}
    if correction_type == 'unknown':
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}

    old_ghcid = f"JP-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"JP-{new_region}-{city}-{inst_type}-{abbrev}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename

    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'old_file': filename,
            'new_file': new_filename,
            'old_region': old_region,
            'new_region': new_region
        }

    # Check if the target file already exists (collision).
    if new_filepath.exists() and filepath != new_filepath:
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': f'Target file {new_filename} already exists'
        }

    # Read, fix, and write under the corrected name.
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid,
                                   old_region, new_region)
    new_filepath.write_text(new_content, encoding='utf-8')

    # Remove the old file if the name changed.
    if filepath != new_filepath:
        filepath.unlink()

    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_file': filename,
        'new_file': new_filename
    }


def main():
    """CLI entry point: scan the custodian directory and fix JP files."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Fix Japanese region codes in custodian files')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without making changes')
    parser.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian',
                        help='Directory containing custodian files')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of files to process (0 = all)')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)

    # Find all JP-*.yaml files (letter-coded and numeric alike).
    jp_files = list(custodian_dir.glob('JP-*.yaml'))
    print(f"Found {len(jp_files)} Japanese files")

    results = {
        'fixed': [],
        'would_fix': [],
        'skipped': [],
        'errors': [],
        'collisions': []
    }

    processed = 0
    skipped = 0
    for filepath in sorted(jp_files):
        if args.limit > 0 and processed >= args.limit:
            break
        result = process_file(filepath, dry_run=args.dry_run)
        status = result['status']
        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f" {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
            processed += 1
        elif status == 'collision':
            results['collisions'].append(result)
            print(f" COLLISION: {result['old_ghcid']} -> {result['new_ghcid']}: {result['reason']}")
            processed += 1
        elif status == 'error':
            results['errors'].append((filepath.name, result['reason']))
            print(f" ERROR: {filepath.name} - {result['reason']}")
            processed += 1
        else:
            # Already-correct (or pattern-mismatch) files are skipped silently.
            # Counting them directly keeps the summary correct under --limit,
            # where "len(jp_files) - processed" would miscount unvisited files.
            skipped += 1

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
    print(f" Collisions: {len(results['collisions'])}")
    print(f" Already correct: {skipped}")
    print(f" Errors: {len(results['errors'])}")

    if results['errors']:
        print("\nUnknown region codes (need mapping):")
        unknown_codes = set()
        for filename, reason in results['errors']:
            if 'unknown region code:' in reason:
                code = reason.split(':')[1].strip()
                unknown_codes.add(code)
        for code in sorted(unknown_codes):
            # Compare against the exact reason string; a substring test
            # ("code in reason") would over-count codes that are substrings
            # of other codes (e.g. 'KA' inside 'KAN').
            count = sum(1 for f, r in results['errors']
                        if r == f'unknown region code: {code}')
            print(f" {code}: {count} files")


if __name__ == '__main__':
    main()