#!/usr/bin/env python3
"""
|
|
Fix Japanese region codes in custodian YAML files.
|
|
|
|
Problem:
|
|
- Some files use letter codes (TO, KA, AI, etc.) which are prefecture abbreviations
|
|
- ISO 3166-2:JP uses 2-digit numeric codes (01-47)
|
|
|
|
Mapping from letter abbreviations to ISO 3166-2:JP codes.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Mapping from prefecture abbreviations to ISO 3166-2:JP codes
|
|
# Based on Japanese prefecture order (sorted by JIS code)
|
|
LETTER_TO_ISO = {
|
|
'HO': '01', # Hokkaido
|
|
'AO': '02', # Aomori
|
|
'IW': '03', # Iwate
|
|
'MG': '04', # Miyagi
|
|
'AK': '05', # Akita
|
|
'YMG': '06', # Yamagata (using YMG to avoid conflict)
|
|
'FS': '07', # Fukushima (using FS to avoid conflict with Fukui/Fukuoka)
|
|
'FKS': '07', # Fukushima alternate
|
|
'IB': '08', # Ibaraki
|
|
'TC': '09', # Tochigi
|
|
'GNM': '10', # Gunma (using GNM to be specific)
|
|
'GU': '10', # Gunma
|
|
'SA': '11', # Saitama
|
|
'CH': '12', # Chiba
|
|
'TO': '13', # Tokyo
|
|
'TK': '13', # Tokyo alternate
|
|
'KA': '14', # Kanagawa
|
|
'NI': '15', # Niigata
|
|
'TO2': '16', # Toyama (conflict with Tokyo - rare)
|
|
'IS': '17', # Ishikawa
|
|
'FU': '18', # Fukui
|
|
'YA': '19', # Yamanashi
|
|
'NA': '20', # Nagano
|
|
'GI': '21', # Gifu
|
|
'SZO': '22', # Shizuoka (using SZO to be specific)
|
|
'AI': '23', # Aichi
|
|
'MIE': '24', # Mie
|
|
'SH': '25', # Shiga
|
|
'KY': '26', # Kyoto
|
|
'OS': '27', # Osaka
|
|
'HY': '28', # Hyogo
|
|
'NR': '29', # Nara
|
|
'WA': '30', # Wakayama
|
|
'TT': '31', # Tottori
|
|
'SM': '32', # Shimane
|
|
'OK': '33', # Okayama
|
|
'HI': '34', # Hiroshima
|
|
'YMC': '35', # Yamaguchi
|
|
'TKS': '36', # Tokushima
|
|
'KG': '37', # Kagawa
|
|
'EH': '38', # Ehime
|
|
'KC': '39', # Kochi
|
|
'FO': '40', # Fukuoka (using FO to avoid conflict)
|
|
'SG': '41', # Saga
|
|
'NS': '42', # Nagasaki
|
|
'KU': '43', # Kumamoto
|
|
'OI': '44', # Oita
|
|
'MI': '45', # Miyazaki
|
|
'KS': '46', # Kagoshima
|
|
'OO': '47', # Okinawa (using OO to be specific)
|
|
}
|
|
|
|
# Additional mappings that might appear in the data
|
|
ADDITIONAL_MAPPINGS = {
|
|
# Common variations
|
|
'TOK': '13', # Tokyo
|
|
'KAN': '14', # Kanagawa
|
|
'KN': '14', # Kanagawa (another abbreviation found in data)
|
|
'OSA': '27', # Osaka
|
|
'KYO': '26', # Kyoto
|
|
'HOK': '01', # Hokkaido
|
|
'SAI': '11', # Saitama
|
|
'CHI': '12', # Chiba
|
|
'NAG': '20', # Nagano (could also be Nagasaki - 42)
|
|
'HIR': '34', # Hiroshima
|
|
'OKI': '47', # Okinawa
|
|
'FUK': '40', # Fukuoka (most common)
|
|
'KO': '39', # Kochi (高知県) - found in data as Kochi Ken
|
|
}
|
|
|
|
# Merge mappings
|
|
LETTER_TO_ISO.update(ADDITIONAL_MAPPINGS)
|
|
|
|
# Prefecture names for documentation
|
|
PREFECTURE_NAMES = {
|
|
'01': 'Hokkaido', '02': 'Aomori', '03': 'Iwate', '04': 'Miyagi',
|
|
'05': 'Akita', '06': 'Yamagata', '07': 'Fukushima', '08': 'Ibaraki',
|
|
'09': 'Tochigi', '10': 'Gunma', '11': 'Saitama', '12': 'Chiba',
|
|
'13': 'Tokyo', '14': 'Kanagawa', '15': 'Niigata', '16': 'Toyama',
|
|
'17': 'Ishikawa', '18': 'Fukui', '19': 'Yamanashi', '20': 'Nagano',
|
|
'21': 'Gifu', '22': 'Shizuoka', '23': 'Aichi', '24': 'Mie',
|
|
'25': 'Shiga', '26': 'Kyoto', '27': 'Osaka', '28': 'Hyogo',
|
|
'29': 'Nara', '30': 'Wakayama', '31': 'Tottori', '32': 'Shimane',
|
|
'33': 'Okayama', '34': 'Hiroshima', '35': 'Yamaguchi', '36': 'Tokushima',
|
|
'37': 'Kagawa', '38': 'Ehime', '39': 'Kochi', '40': 'Fukuoka',
|
|
'41': 'Saga', '42': 'Nagasaki', '43': 'Kumamoto', '44': 'Oita',
|
|
'45': 'Miyazaki', '46': 'Kagoshima', '47': 'Okinawa',
|
|
}
|
|
|
|
|
|
def get_correct_region_code(old_code: str) -> tuple:
|
|
"""
|
|
Convert old region code to correct ISO 3166-2 code.
|
|
|
|
Returns: (correct_code, correction_type)
|
|
"""
|
|
# Already correct (2-digit numeric)
|
|
if re.match(r'^[0-4][0-9]$', old_code) and int(old_code) >= 1 and int(old_code) <= 47:
|
|
return old_code, 'already_correct'
|
|
|
|
# Check for letter code
|
|
if old_code in LETTER_TO_ISO:
|
|
return LETTER_TO_ISO[old_code], 'letter_code'
|
|
|
|
return None, 'unknown'
|
|
|
|
|
|
def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
|
|
old_region: str, new_region: str) -> str:
|
|
"""Fix the YAML content with new GHCID and region codes."""
|
|
|
|
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
|
prefecture_name = PREFECTURE_NAMES.get(new_region, 'Unknown')
|
|
|
|
reason = f"Corrected region code from JP-{old_region} (abbreviation) to JP-{new_region} ({prefecture_name}) per ISO 3166-2:JP"
|
|
|
|
# Replace GHCID in ghcid_current
|
|
content = re.sub(
|
|
r'(ghcid_current:\s*)' + re.escape(old_ghcid),
|
|
r'\g<1>' + new_ghcid,
|
|
content
|
|
)
|
|
|
|
# Replace GHCID in identifiers
|
|
content = re.sub(
|
|
r'(identifier_value:\s*)' + re.escape(old_ghcid),
|
|
r'\g<1>' + new_ghcid,
|
|
content
|
|
)
|
|
|
|
# Replace region_code in location_resolution (be careful with patterns)
|
|
content = re.sub(
|
|
r'(location_resolution:.*?region_code:\s*)' + re.escape(old_region),
|
|
r'\g<1>' + new_region,
|
|
content,
|
|
flags=re.DOTALL
|
|
)
|
|
|
|
# Replace region_code in location section
|
|
content = re.sub(
|
|
r'(location:.*?region_code:\s*)' + re.escape(old_region),
|
|
r'\g<1>' + new_region,
|
|
content,
|
|
flags=re.DOTALL
|
|
)
|
|
|
|
# Check if ghcid_history already exists
|
|
if 'ghcid_history:' in content:
|
|
# Insert new entry at the beginning of existing history
|
|
new_history_items = f'''- ghcid: {new_ghcid}
|
|
valid_from: "{timestamp}"
|
|
valid_to: null
|
|
reason: "{reason}"
|
|
- ghcid: {old_ghcid}
|
|
valid_from: null
|
|
valid_to: "{timestamp}"
|
|
reason: "Previous GHCID with incorrect region code"
|
|
'''
|
|
content = re.sub(
|
|
r'(ghcid_history:\s*\n\s*)',
|
|
r'\g<1>' + new_history_items,
|
|
content
|
|
)
|
|
else:
|
|
# Add ghcid_history after ghcid_current
|
|
history_entry = f'''
|
|
ghcid_history:
|
|
- ghcid: {new_ghcid}
|
|
valid_from: "{timestamp}"
|
|
valid_to: null
|
|
reason: "{reason}"
|
|
- ghcid: {old_ghcid}
|
|
valid_from: null
|
|
valid_to: "{timestamp}"
|
|
reason: "Original GHCID with incorrect region code"'''
|
|
content = re.sub(
|
|
r'(ghcid_current:\s*' + re.escape(new_ghcid) + r')',
|
|
r'\g<1>' + history_entry,
|
|
content
|
|
)
|
|
|
|
return content
|
|
|
|
|
|
def process_file(filepath: Path, dry_run: bool = False) -> dict:
|
|
"""Process a single YAML file and return results."""
|
|
|
|
filename = filepath.name
|
|
|
|
# Extract current GHCID from filename (e.g., JP-TO-ADA-L-AL.yaml)
|
|
match = re.match(r'JP-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filename)
|
|
if not match:
|
|
return {'status': 'skipped', 'reason': 'filename pattern mismatch'}
|
|
|
|
old_region = match.group(1)
|
|
city = match.group(2)
|
|
inst_type = match.group(3)
|
|
abbrev = match.group(4)
|
|
|
|
# Get correct region code
|
|
new_region, correction_type = get_correct_region_code(old_region)
|
|
|
|
if correction_type == 'already_correct':
|
|
return {'status': 'skipped', 'reason': 'already correct'}
|
|
|
|
if correction_type == 'unknown':
|
|
return {'status': 'error', 'reason': f'unknown region code: {old_region}'}
|
|
|
|
old_ghcid = f"JP-{old_region}-{city}-{inst_type}-{abbrev}"
|
|
new_ghcid = f"JP-{new_region}-{city}-{inst_type}-{abbrev}"
|
|
new_filename = f"{new_ghcid}.yaml"
|
|
new_filepath = filepath.parent / new_filename
|
|
|
|
if dry_run:
|
|
return {
|
|
'status': 'would_fix',
|
|
'old_ghcid': old_ghcid,
|
|
'new_ghcid': new_ghcid,
|
|
'old_file': filename,
|
|
'new_file': new_filename,
|
|
'old_region': old_region,
|
|
'new_region': new_region
|
|
}
|
|
|
|
# Check if target file already exists (collision)
|
|
if new_filepath.exists() and filepath != new_filepath:
|
|
return {
|
|
'status': 'collision',
|
|
'old_ghcid': old_ghcid,
|
|
'new_ghcid': new_ghcid,
|
|
'reason': f'Target file {new_filename} already exists'
|
|
}
|
|
|
|
# Read file content
|
|
content = filepath.read_text(encoding='utf-8')
|
|
|
|
# Fix content
|
|
new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)
|
|
|
|
# Write to new file
|
|
new_filepath.write_text(new_content, encoding='utf-8')
|
|
|
|
# Remove old file if different name
|
|
if filepath != new_filepath:
|
|
filepath.unlink()
|
|
|
|
return {
|
|
'status': 'fixed',
|
|
'old_ghcid': old_ghcid,
|
|
'new_ghcid': new_ghcid,
|
|
'old_file': filename,
|
|
'new_file': new_filename
|
|
}
|
|
|
|
|
|
def main():
|
|
import argparse
|
|
parser = argparse.ArgumentParser(description='Fix Japanese region codes in custodian files')
|
|
parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without making changes')
|
|
parser.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian', help='Directory containing custodian files')
|
|
parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0 = all)')
|
|
args = parser.parse_args()
|
|
|
|
custodian_dir = Path(args.dir)
|
|
|
|
# Find all JP-*.yaml files with letter codes
|
|
jp_files = list(custodian_dir.glob('JP-*.yaml'))
|
|
print(f"Found {len(jp_files)} Japanese files")
|
|
|
|
results = {
|
|
'fixed': [],
|
|
'would_fix': [],
|
|
'skipped': [],
|
|
'errors': [],
|
|
'collisions': []
|
|
}
|
|
|
|
processed = 0
|
|
for filepath in sorted(jp_files):
|
|
if args.limit > 0 and processed >= args.limit:
|
|
break
|
|
|
|
result = process_file(filepath, dry_run=args.dry_run)
|
|
status = result['status']
|
|
|
|
if status in ('fixed', 'would_fix'):
|
|
results[status].append(result)
|
|
action = 'Would fix' if args.dry_run else 'Fixed'
|
|
print(f" {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
|
|
processed += 1
|
|
elif status == 'collision':
|
|
results['collisions'].append(result)
|
|
print(f" COLLISION: {result['old_ghcid']} -> {result['new_ghcid']}: {result['reason']}")
|
|
processed += 1
|
|
elif status == 'error':
|
|
results['errors'].append((filepath.name, result['reason']))
|
|
print(f" ERROR: {filepath.name} - {result['reason']}")
|
|
processed += 1
|
|
# Skip already correct files silently
|
|
|
|
print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
|
|
print(f" Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
|
|
print(f" Collisions: {len(results['collisions'])}")
|
|
print(f" Already correct: {len(jp_files) - processed}")
|
|
print(f" Errors: {len(results['errors'])}")
|
|
|
|
if results['errors']:
|
|
print("\nUnknown region codes (need mapping):")
|
|
unknown_codes = set()
|
|
for filename, reason in results['errors']:
|
|
if 'unknown region code:' in reason:
|
|
code = reason.split(':')[1].strip()
|
|
unknown_codes.add(code)
|
|
for code in sorted(unknown_codes):
|
|
count = sum(1 for f, r in results['errors'] if code in r)
|
|
print(f" {code}: {count} files")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|