glam/scripts/fix_czech_region_codes.py
2025-12-10 13:01:13 +01:00

341 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix Czech region codes in custodian YAML files.
Problem:
- Some files use non-ISO codes:
- Letter codes (JI, HL, ST, VY, PL, OL, PA, KR, MO, US, ZL, LI, KA, PR, JM, JC)
- 3-digit district codes (317, 422, 521, 802, etc.)
- Invalid 2-digit codes (78, 79, 81-90 etc.)
ISO 3166-2:CZ uses 2-digit NUTS-based codes for 14 regions:
10 - Prague (Hlavní město Praha)
20 - Central Bohemian (Středočeský)
31 - South Bohemian (Jihočeský)
32 - Plzeň (Plzeňský)
41 - Karlovy Vary (Karlovarský)
42 - Ústí nad Labem (Ústecký)
51 - Liberec (Liberecký)
52 - Hradec Králové (Královéhradecký)
53 - Pardubice (Pardubický)
63 - Vysočina
64 - South Moravian (Jihomoravský)
71 - Olomouc (Olomoucký)
72 - Zlín (Zlínský)
80 - Moravian-Silesian (Moravskoslezský)
"""
import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
# Legacy two-letter region abbreviations and the ISO 3166-2:CZ region code
# each one stands for.
LETTER_TO_ISO = {
    'HL': '10',  # Hlavní město Praha (Prague)
    'PR': '10',  # Prague, alternate abbreviation
    'ST': '20',  # Středočeský (Central Bohemian)
    'JC': '31',  # Jihočeský (South Bohemian)
    'PL': '32',  # Plzeňský (Plzeň)
    'KA': '41',  # Karlovarský (Karlovy Vary)
    'US': '42',  # Ústecký (Ústí nad Labem)
    'LI': '51',  # Liberecký (Liberec)
    'KR': '52',  # Královéhradecký (Hradec Králové)
    'PA': '53',  # Pardubický (Pardubice)
    'VY': '63',  # Vysočina
    'JI': '64',  # Jihomoravský (South Moravian) - this IS the correct mapping
    'JM': '64',  # Jihomoravský, alternate abbreviation
    'OL': '71',  # Olomoucký (Olomouc)
    'ZL': '72',  # Zlínský (Zlín)
    'MO': '80',  # Moravskoslezský (Moravian-Silesian)
}

# Known 3-digit district codes. Their layout is "region code + one district
# digit", so the region is simply the first two characters.
DISTRICT_3DIGIT_TO_REGION = {
    district: district[:2]
    for district in (
        '317',  # South Bohemian district
        '422',  # Ústí nad Labem district
        '521',  # Hradec Králové district
        '802',  # Moravian-Silesian district
    )
}

# District-level 2-digit codes that are not valid region codes. They appear
# to be district sub-codes (ISO 3166-2:CZ districts are 3-char, e.g. 20A,
# 31B); every one of them is folded into region 80 (Moravian-Silesian).
DISTRICT_TO_REGION = dict.fromkeys(
    (
        '78',  # possibly an old code, needs investigation - likely Moravian-Silesian area
        '79',  # possibly an old code, needs investigation - likely Moravian-Silesian area
        '81',
        '82',  # e.g. Bruntál
        '83',
        '84',
        '85',  # e.g. Karviná
        '86',  # e.g. Nový Jičín
        '87',  # e.g. Opava
        '88',  # e.g. Ostrava-město
        '89',  # e.g. Frýdek-Místek
        '90',
    ),
    '80',
)

# Human-readable region names, keyed by ISO 3166-2:CZ region code; used when
# documenting a correction.
REGION_NAMES = {
    '10': 'Prague (Hlavní město Praha)',
    '20': 'Central Bohemian (Středočeský)',
    '31': 'South Bohemian (Jihočeský)',
    '32': 'Plzeň (Plzeňský)',
    '41': 'Karlovy Vary (Karlovarský)',
    '42': 'Ústí nad Labem (Ústecký)',
    '51': 'Liberec (Liberecký)',
    '52': 'Hradec Králové (Královéhradecký)',
    '53': 'Pardubice (Pardubický)',
    '63': 'Vysočina',
    '64': 'South Moravian (Jihomoravský)',
    '71': 'Olomouc (Olomoucký)',
    '72': 'Zlín (Zlínský)',
    '80': 'Moravian-Silesian (Moravskoslezský)',
}

# The set of valid ISO region codes is exactly the set of named regions.
VALID_ISO_CODES = set(REGION_NAMES)
def get_correct_region_code(old_code: str) -> tuple:
    """
    Translate a region code taken from a filename into its ISO 3166-2:CZ code.

    Returns a ``(correct_code, correction_type)`` pair, where
    ``correction_type`` is one of:

    - ``'already_correct'``  - ``old_code`` is a valid ISO code, returned as-is
    - ``'letter_code'``      - resolved through ``LETTER_TO_ISO``
    - ``'district_3digit'``  - resolved through ``DISTRICT_3DIGIT_TO_REGION``
    - ``'district_2digit'``  - resolved through ``DISTRICT_TO_REGION``
    - ``'unknown'``          - no table knows the code; ``correct_code`` is None
    """
    if old_code in VALID_ISO_CODES:
        return old_code, 'already_correct'

    # Tables are consulted in priority order; the first hit wins.
    lookup_order = (
        (LETTER_TO_ISO, 'letter_code'),
        (DISTRICT_3DIGIT_TO_REGION, 'district_3digit'),
        (DISTRICT_TO_REGION, 'district_2digit'),
    )
    for table, label in lookup_order:
        if old_code in table:
            return table[old_code], label

    return None, 'unknown'
def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Rewrite the GHCID and region code inside a custodian YAML document.

    The document is edited as text (not parsed), so untouched parts keep
    their exact formatting. The following edits are applied:

    - ``ghcid_current: <old_ghcid>``    -> ``<new_ghcid>``
    - ``identifier_value: <old_ghcid>`` -> ``<new_ghcid>``
    - the first ``region_code: <old_region>`` that follows each
      ``location_resolution:`` and each ``location:`` key -> ``<new_region>``
    - two ``ghcid_history`` entries are added (the new GHCID, valid from now,
      and the old GHCID, valid until now): prepended to an existing
      ``ghcid_history:`` list, or added as a new key right after
      ``ghcid_current`` when the document has no history yet.

    All substitutions use callable replacements so that the inserted text is
    taken literally; with a replacement *template* a backslash in a GHCID
    would be interpreted as an escape sequence by ``re.sub``.

    Args:
        content: Full text of the YAML file.
        old_ghcid: GHCID currently used in the file.
        new_ghcid: GHCID carrying the corrected region code.
        old_region: Region code to replace (e.g. ``JI``).
        new_region: ISO 3166-2:CZ region code to write (e.g. ``64``).

    Returns:
        The updated document text.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    region_name = REGION_NAMES.get(new_region, 'Unknown')
    reason = f"Corrected region code from CZ-{old_region} to CZ-{new_region} ({region_name}) per ISO 3166-2:CZ"

    def _history_entries(indent: str, old_reason: str) -> str:
        """Render the two history list items, every line prefixed by *indent*."""
        lines = (
            f'- ghcid: {new_ghcid}',
            f'  valid_from: "{timestamp}"',
            '  valid_to: null',
            f'  reason: "{reason}"',
            f'- ghcid: {old_ghcid}',
            '  valid_from: null',
            f'  valid_to: "{timestamp}"',
            f'  reason: "{old_reason}"',
        )
        return '\n'.join(indent + line for line in lines)

    def _keep_prefix_then(text: str):
        """Build a replacement callable: captured prefix + literal *text*."""
        return lambda match: match.group(1) + text

    # Replace GHCID in ghcid_current
    content = re.sub(
        r'(ghcid_current:\s*)' + re.escape(old_ghcid),
        _keep_prefix_then(new_ghcid),
        content
    )
    # Replace GHCID in identifiers
    content = re.sub(
        r'(identifier_value:\s*)' + re.escape(old_ghcid),
        _keep_prefix_then(new_ghcid),
        content
    )
    # Replace region_code in location_resolution. The lazy `.*?` with DOTALL
    # reaches forward to the nearest matching region_code after the key.
    content = re.sub(
        r'(location_resolution:.*?region_code:\s*)' + re.escape(old_region),
        _keep_prefix_then(new_region),
        content,
        flags=re.DOTALL
    )
    # Replace region_code in location section
    content = re.sub(
        r'(location:.*?region_code:\s*)' + re.escape(old_region),
        _keep_prefix_then(new_region),
        content,
        flags=re.DOTALL
    )

    if 'ghcid_history:' in content:
        # Prepend to the existing list. The indentation of the first existing
        # entry is captured and reused for the new entries, then re-emitted
        # in front of that entry so it stays aligned with the ones added.
        def _prepend_history(match):
            header, indent = match.group(1), match.group(2)
            entries = _history_entries(
                indent, 'Previous GHCID with incorrect region code')
            return f'{header}{entries}\n{indent}'

        content = re.sub(
            r'(ghcid_history:\s*\n)([ \t]*)',
            _prepend_history,
            content
        )
    else:
        # No history yet: add the key right after ghcid_current, as a sibling
        # at the same column. The list items are placed at the key's own
        # column, which is valid YAML for a sequence nested in a mapping.
        def _append_history(match):
            # Width of whatever precedes `ghcid_current` on its line; spaces
            # only, because YAML does not allow tabs for indentation.
            indent = ' ' * len(match.group(1))
            entries = _history_entries(
                indent, 'Original GHCID with incorrect region code')
            return f'{match.group(0)}\n{indent}ghcid_history:\n{entries}'

        content = re.sub(
            r'^(.*?)(ghcid_current:\s*' + re.escape(new_ghcid) + r')',
            _append_history,
            content,
            flags=re.MULTILINE
        )
    return content
def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Correct the region code of one custodian YAML file.

    The GHCID is taken from the filename (``CZ-<region>-<city>-<type>-<abbrev>.yaml``).
    When the region part is not a valid ISO 3166-2:CZ code but can be mapped
    to one, the file content is rewritten, saved under the corrected
    filename, and the old file is removed.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true, nothing is read, written or deleted; the result
            only describes what would happen.

    Returns:
        A dict whose ``'status'`` is one of:

        - ``'skipped'``   - filename does not match, or region already correct
        - ``'error'``     - region code has no known mapping
        - ``'collision'`` - the corrected filename already exists (reported
          in dry-run mode too, so a dry run predicts the real run)
        - ``'would_fix'`` - dry run: the file would be corrected
        - ``'fixed'``     - the file was corrected and renamed
    """
    filename = filepath.name
    # Extract current GHCID from filename (e.g., CZ-JI-BRN-M-MBM.yaml).
    # fullmatch: text after ".yaml" must make the name a mismatch instead of
    # being silently dropped from the reconstructed filename.
    match = re.fullmatch(r'CZ-([A-Z0-9]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filename)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}
    old_region, city, inst_type, abbrev = match.groups()

    # Get correct region code
    new_region, correction_type = get_correct_region_code(old_region)
    if correction_type == 'already_correct':
        return {'status': 'skipped', 'reason': 'already correct'}
    if correction_type == 'unknown':
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}

    old_ghcid = f"CZ-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"CZ-{new_region}-{city}-{inst_type}-{abbrev}"
    new_filename = f"{new_ghcid}.yaml"
    new_filepath = filepath.parent / new_filename
    renamed = filepath != new_filepath

    # Collision check comes BEFORE the dry-run return: a dry run that says
    # "would fix" for a file the real run refuses to touch is misleading.
    if renamed and new_filepath.exists():
        return {
            'status': 'collision',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'reason': f'Target file {new_filename} already exists'
        }

    if dry_run:
        return {
            'status': 'would_fix',
            'old_ghcid': old_ghcid,
            'new_ghcid': new_ghcid,
            'old_file': filename,
            'new_file': new_filename,
            'old_region': old_region,
            'new_region': new_region
        }

    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)

    # Write the corrected file first and delete the old one only afterwards,
    # so a failed write never leaves the record without any file.
    new_filepath.write_text(new_content, encoding='utf-8')
    if renamed:
        filepath.unlink()

    return {
        'status': 'fixed',
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_file': filename,
        'new_file': new_filename
    }
def main():
    """Command-line entry point: fix region codes for every ``CZ-*.yaml`` file.

    Options:
        --dry-run  Report what would change without touching any file.
        --dir      Directory containing the custodian files.
        --limit    Stop after this many files needed attention (0 = all).
                   Files that are skipped do not count towards the limit.

    Prints one line per fixed / colliding / failing file, then a summary.
    The summary counts "already correct" files from the actual skip results;
    files skipped for another reason and files never examined because
    ``--limit`` stopped the run are reported on their own lines instead of
    being lumped into "already correct".
    """
    import argparse
    from collections import Counter

    parser = argparse.ArgumentParser(description='Fix Czech region codes in custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without making changes')
    parser.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian', help='Directory containing custodian files')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0 = all)')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    # Find all CZ-*.yaml files
    cz_files = list(custodian_dir.glob('CZ-*.yaml'))
    print(f"Found {len(cz_files)} Czech files")

    results = {
        'fixed': [],
        'would_fix': [],
        'skipped': [],
        'errors': [],
        'collisions': []
    }
    processed = 0
    for filepath in sorted(cz_files):
        if args.limit > 0 and processed >= args.limit:
            break
        result = process_file(filepath, dry_run=args.dry_run)
        status = result['status']

        if status == 'skipped':
            # Recorded silently; only the summary mentions skipped files.
            results['skipped'].append((filepath.name, result['reason']))
            continue

        processed += 1
        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f" {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
        elif status == 'collision':
            results['collisions'].append(result)
            print(f" COLLISION: {result['old_ghcid']} -> {result['new_ghcid']}: {result['reason']}")
        elif status == 'error':
            results['errors'].append((filepath.name, result['reason']))
            print(f" ERROR: {filepath.name} - {result['reason']}")

    skip_reasons = Counter(reason for _, reason in results['skipped'])
    already_correct = skip_reasons.pop('already correct', 0)
    # Files the loop never reached because --limit ended it early.
    not_examined = len(cz_files) - processed - len(results['skipped'])

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
    print(f" Collisions: {len(results['collisions'])}")
    print(f" Already correct: {already_correct}")
    for skip_reason, count in sorted(skip_reasons.items()):
        print(f" Skipped ({skip_reason}): {count}")
    if not_examined:
        print(f" Not examined (--limit reached): {not_examined}")
    print(f" Errors: {len(results['errors'])}")

    if results['errors']:
        print("\nUnknown region codes (need mapping):")
        # Tally by the exact code; a substring test would count code "9"
        # for every reason that mentions "91", "90", ...
        unknown_codes = Counter(
            reason.split(':', 1)[1].strip()
            for _, reason in results['errors']
            if 'unknown region code:' in reason
        )
        for code in sorted(unknown_codes):
            print(f" {code}: {unknown_codes[code]} files")
# Run the command-line interface only when this file is executed as a
# script; importing the module must not start processing files.
if __name__ == '__main__':
    main()