glam/scripts/fix_japan_region_codes.py
2025-12-10 13:01:13 +01:00

340 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Fix Japanese region codes in custodian YAML files.
Problem:
- Some files use letter codes (TO, KA, AI, etc.) which are prefecture abbreviations
- ISO 3166-2:JP uses 2-digit numeric codes (01-47)
Mapping from letter abbreviations to ISO 3166-2:JP codes.
"""
import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
# Mapping from prefecture abbreviations to ISO 3166-2:JP codes.
# Based on Japanese prefecture order (sorted by JIS code, 01-47).
# NOTE(review): two-letter abbreviations are inherently ambiguous (e.g. 'FU'
# could plausibly mean Fukui, Fukuoka or Fukushima); each choice below encodes
# whichever prefecture the data was observed to mean — confirm against the
# actual custodian files before extending this table.
LETTER_TO_ISO = {
    'HO': '01',   # Hokkaido
    'AO': '02',   # Aomori
    'IW': '03',   # Iwate
    'MG': '04',   # Miyagi
    'AK': '05',   # Akita
    'YMG': '06',  # Yamagata (using YMG to avoid conflict)
    'FS': '07',   # Fukushima (using FS to avoid conflict with Fukui/Fukuoka)
    'FKS': '07',  # Fukushima alternate
    'IB': '08',   # Ibaraki
    'TC': '09',   # Tochigi
    'GNM': '10',  # Gunma (using GNM to be specific)
    'GU': '10',   # Gunma
    'SA': '11',   # Saitama
    'CH': '12',   # Chiba
    'TO': '13',   # Tokyo
    'TK': '13',   # Tokyo alternate
    'KA': '14',   # Kanagawa
    'NI': '15',   # Niigata
    # NOTE(review): only a literal 'TO2' region segment maps to Toyama; any
    # Toyama file coded plain 'TO' will silently become Tokyo (13) — verify.
    'TO2': '16',  # Toyama (conflict with Tokyo - rare)
    'IS': '17',   # Ishikawa
    'FU': '18',   # Fukui
    'YA': '19',   # Yamanashi
    'NA': '20',   # Nagano
    'GI': '21',   # Gifu
    'SZO': '22',  # Shizuoka (using SZO to be specific)
    'AI': '23',   # Aichi
    'MIE': '24',  # Mie
    'SH': '25',   # Shiga
    'KY': '26',   # Kyoto
    'OS': '27',   # Osaka
    'HY': '28',   # Hyogo
    'NR': '29',   # Nara
    'WA': '30',   # Wakayama
    'TT': '31',   # Tottori
    'SM': '32',   # Shimane
    'OK': '33',   # Okayama
    'HI': '34',   # Hiroshima
    'YMC': '35',  # Yamaguchi
    'TKS': '36',  # Tokushima
    'KG': '37',   # Kagawa
    'EH': '38',   # Ehime
    'KC': '39',   # Kochi
    'FO': '40',   # Fukuoka (using FO to avoid conflict)
    'SG': '41',   # Saga
    'NS': '42',   # Nagasaki
    'KU': '43',   # Kumamoto
    'OI': '44',   # Oita
    'MI': '45',   # Miyazaki
    'KS': '46',   # Kagoshima
    'OO': '47',   # Okinawa (using OO to be specific)
}
# Additional mappings that might appear in the data (longer, three-letter
# variants plus a few two-letter forms actually observed in the files).
ADDITIONAL_MAPPINGS = {
    # Common variations
    'TOK': '13',  # Tokyo
    'KAN': '14',  # Kanagawa
    'KN': '14',   # Kanagawa (another abbreviation found in data)
    'OSA': '27',  # Osaka
    'KYO': '26',  # Kyoto
    'HOK': '01',  # Hokkaido
    'SAI': '11',  # Saitama
    'CHI': '12',  # Chiba
    # NOTE(review): 'NAG' is assumed to mean Nagano here; it could equally
    # abbreviate Nagasaki (42) — confirm against the source data.
    'NAG': '20',  # Nagano (could also be Nagasaki - 42)
    'HIR': '34',  # Hiroshima
    'OKI': '47',  # Okinawa
    'FUK': '40',  # Fukuoka (most common)
    'KO': '39',   # Kochi (高知県) - found in data as Kochi Ken
}
# Merge the variants into the main table so lookups only ever consult
# LETTER_TO_ISO (ADDITIONAL_MAPPINGS entries win on key collisions).
LETTER_TO_ISO.update(ADDITIONAL_MAPPINGS)
# Prefecture names for documentation, keyed by the zero-padded ISO 3166-2:JP
# numeric code ('01'..'47'). The tuple below lists the 47 prefectures in JIS
# code order, so the 1-based position of a name IS its numeric code.
_PREFECTURES_IN_JIS_ORDER = (
    'Hokkaido', 'Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata',
    'Fukushima', 'Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba',
    'Tokyo', 'Kanagawa', 'Niigata', 'Toyama', 'Ishikawa', 'Fukui',
    'Yamanashi', 'Nagano', 'Gifu', 'Shizuoka', 'Aichi', 'Mie',
    'Shiga', 'Kyoto', 'Osaka', 'Hyogo', 'Nara', 'Wakayama',
    'Tottori', 'Shimane', 'Okayama', 'Hiroshima', 'Yamaguchi', 'Tokushima',
    'Kagawa', 'Ehime', 'Kochi', 'Fukuoka', 'Saga', 'Nagasaki',
    'Kumamoto', 'Oita', 'Miyazaki', 'Kagoshima', 'Okinawa',
)
PREFECTURE_NAMES = {
    f'{jis_code:02d}': name
    for jis_code, name in enumerate(_PREFECTURES_IN_JIS_ORDER, start=1)
}
def get_correct_region_code(old_code: str) -> tuple:
    """Translate a region code found in a filename to its ISO 3166-2:JP form.

    Returns:
        (correct_code, correction_type) where correction_type is one of
        'already_correct' (code kept as-is), 'letter_code' (abbreviation
        translated via LETTER_TO_ISO) or 'unknown' (correct_code is None).
    """
    # Two ASCII digits in the valid prefecture range 01-47: nothing to fix.
    if re.fullmatch(r'[0-4][0-9]', old_code) and 1 <= int(old_code) <= 47:
        return old_code, 'already_correct'
    # Known letter abbreviation: translate to the numeric code.
    mapped = LETTER_TO_ISO.get(old_code)
    if mapped is not None:
        return mapped, 'letter_code'
    return None, 'unknown'
def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Fix the YAML content with new GHCID and region codes.

    Operates on the YAML as plain text via regex substitutions (no YAML
    parser): updates ghcid_current, identifier_value and the region_code
    fields under location_resolution/location, then prepends a correction
    entry to ghcid_history (creating the section if it does not exist).

    Args:
        content: full text of one custodian YAML file.
        old_ghcid: complete GHCID currently in the file (e.g. JP-TO-ADA-L-AL).
        new_ghcid: corrected GHCID (e.g. JP-13-ADA-L-AL).
        old_region: old region-code component alone (e.g. 'TO').
        new_region: corrected ISO 3166-2:JP numeric code (e.g. '13').

    Returns:
        The rewritten YAML text.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    prefecture_name = PREFECTURE_NAMES.get(new_region, 'Unknown')
    # NOTE(review): this string is embedded in double-quoted YAML scalars
    # below; it contains no quote characters today, but a future '"' in it
    # would produce invalid YAML.
    reason = f"Corrected region code from JP-{old_region} (abbreviation) to JP-{new_region} ({prefecture_name}) per ISO 3166-2:JP"
    # Replace GHCID in ghcid_current
    content = re.sub(
        r'(ghcid_current:\s*)' + re.escape(old_ghcid),
        r'\g<1>' + new_ghcid,
        content
    )
    # Replace GHCID in identifiers
    content = re.sub(
        r'(identifier_value:\s*)' + re.escape(old_ghcid),
        r'\g<1>' + new_ghcid,
        content
    )
    # Replace region_code in location_resolution (be careful with patterns).
    # DOTALL + non-greedy '.*?' ties each match to the nearest following
    # region_code field after a 'location_resolution:' key.
    content = re.sub(
        r'(location_resolution:.*?region_code:\s*)' + re.escape(old_region),
        r'\g<1>' + new_region,
        content,
        flags=re.DOTALL
    )
    # Replace region_code in location section ('location:' cannot match the
    # 'location_resolution' key because of the literal colon).
    content = re.sub(
        r'(location:.*?region_code:\s*)' + re.escape(old_region),
        r'\g<1>' + new_region,
        content,
        flags=re.DOTALL
    )
    # Check if ghcid_history already exists
    if 'ghcid_history:' in content:
        # Insert new entry at the beginning of existing history. The captured
        # trailing '\s*' supplies the indentation for the first inserted line
        # only; the remaining lines come verbatim from the literal below.
        # NOTE(review): the continuation lines of this f-string carry no
        # leading indentation as received — the original indentation appears
        # to have been lost in transit. Confirm the emitted entries nest
        # correctly under ghcid_history in a real file before trusting this.
        new_history_items = f'''- ghcid: {new_ghcid}
valid_from: "{timestamp}"
valid_to: null
reason: "{reason}"
- ghcid: {old_ghcid}
valid_from: null
valid_to: "{timestamp}"
reason: "Previous GHCID with incorrect region code"
'''
        content = re.sub(
            r'(ghcid_history:\s*\n\s*)',
            r'\g<1>' + new_history_items,
            content
        )
    else:
        # Add ghcid_history after ghcid_current (which was already rewritten
        # to new_ghcid above, hence the match on new_ghcid here).
        # NOTE(review): same indentation caveat as above applies.
        history_entry = f'''
ghcid_history:
- ghcid: {new_ghcid}
valid_from: "{timestamp}"
valid_to: null
reason: "{reason}"
- ghcid: {old_ghcid}
valid_from: null
valid_to: "{timestamp}"
reason: "Original GHCID with incorrect region code"'''
        content = re.sub(
            r'(ghcid_current:\s*' + re.escape(new_ghcid) + r')',
            r'\g<1>' + history_entry,
            content
        )
    return content
def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a single custodian YAML file and return a result dict.

    The filename is expected to encode the GHCID as
    JP-<REGION>-<CITY>-<TYPE>-<ABBREV>.yaml. When REGION is a letter
    abbreviation it is translated to its ISO 3166-2:JP numeric code, the
    file content is rewritten and the file is renamed accordingly.

    Args:
        filepath: path to the candidate YAML file.
        dry_run: when True, report what would happen without touching disk.

    Returns:
        A dict whose 'status' key is one of 'skipped', 'error', 'collision',
        'would_fix' (dry run only) or 'fixed', plus context fields.
    """
    filename = filepath.name
    # Extract current GHCID from filename (e.g. JP-TO-ADA-L-AL.yaml).
    # fullmatch anchors both ends, so names with trailing junk (which an
    # unanchored re.match would have accepted) are rejected.
    match = re.fullmatch(r'JP-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filename)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}
    old_region, city, inst_type, abbrev = match.groups()
    # Get correct region code
    new_region, correction_type = get_correct_region_code(old_region)
    if correction_type == 'already_correct':
        return {'status': 'skipped', 'reason': 'already correct'}
    if correction_type == 'unknown':
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}
    old_ghcid = f"JP-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"JP-{new_region}-{city}-{inst_type}-{abbrev}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename
    # Check collisions BEFORE the dry-run exit so that a dry run predicts
    # exactly what a real run would refuse to do (previously dry runs
    # reported 'would_fix' even when the target file already existed).
    if new_filepath.exists() and filepath != new_filepath:
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': f'Target file {new_filename} already exists'
        }
    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'old_file': filename,
            'new_file': new_filename,
            'old_region': old_region,
            'new_region': new_region
        }
    # Read, rewrite, and write under the corrected name.
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)
    new_filepath.write_text(new_content, encoding='utf-8')
    # Remove old file if different name
    if filepath != new_filepath:
        filepath.unlink()
    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_file': filename,
        'new_file': new_filename
    }
def main():
    """CLI entry point.

    Scans --dir for JP-*.yaml custodian files, fixes letter-based region
    codes (rewriting content and renaming files), and prints a summary.
    Use --dry-run to preview changes and --limit to cap how many files
    are acted upon.
    """
    import argparse
    from collections import Counter

    parser = argparse.ArgumentParser(description='Fix Japanese region codes in custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without making changes')
    parser.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian', help='Directory containing custodian files')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0 = all)')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    # Find all JP-*.yaml files with letter codes
    jp_files = list(custodian_dir.glob('JP-*.yaml'))
    print(f"Found {len(jp_files)} Japanese files")

    results = {
        'fixed': [],
        'would_fix': [],
        'skipped': [],
        'errors': [],
        'collisions': []
    }
    processed = 0
    for filepath in sorted(jp_files):
        # --limit counts only files that needed action (fix/collision/error).
        if args.limit > 0 and processed >= args.limit:
            break
        result = process_file(filepath, dry_run=args.dry_run)
        status = result['status']
        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f" {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
            processed += 1
        elif status == 'collision':
            results['collisions'].append(result)
            print(f" COLLISION: {result['old_ghcid']} -> {result['new_ghcid']}: {result['reason']}")
            processed += 1
        elif status == 'error':
            results['errors'].append((filepath.name, result['reason']))
            print(f" ERROR: {filepath.name} - {result['reason']}")
            processed += 1
        else:
            # Record skips (already-correct or unparsable filenames) quietly
            # instead of discarding them; previously 'skipped' was a dead key.
            results['skipped'].append((filepath.name, result.get('reason', '')))

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
    print(f" Collisions: {len(results['collisions'])}")
    # NOTE: when --limit stops the loop early, unvisited files are also
    # counted here, so this is an upper bound on truly-correct files.
    print(f" Already correct: {len(jp_files) - processed}")
    print(f" Errors: {len(results['errors'])}")

    if results['errors']:
        print("\nUnknown region codes (need mapping):")
        # Count by the exact extracted code. The previous substring test
        # (`code in reason`) let e.g. 'TO' also absorb 'TO2'/'TOK' errors
        # and overcount.
        counts = Counter(
            reason.split(':', 1)[1].strip()
            for _, reason in results['errors']
            if 'unknown region code:' in reason
        )
        for code in sorted(counts):
            print(f" {code}: {counts[code]} files")


if __name__ == '__main__':
    main()