glam/scripts/fix_belgium_region_codes.py
2025-12-10 13:01:13 +01:00

218 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
Fix Belgian region codes in custodian YAML files.
Valid ISO 3166-2:BE codes:
- Regions: BRU (Brussels), VLG (Flanders), WAL (Wallonia)
- Provinces: VAN (Antwerp), VBR (Flemish Brabant), VLI (Limburg), VOV (East Flanders),
VWV (West Flanders), WBR (Walloon Brabant), WHT (Hainaut), WLG (Liège),
WLX (Luxembourg), WNA (Namur)
Issues to fix:
- VL -> VLG (Flanders region shorthand)
- WA -> WAL (Wallonia shorthand)
- BW -> WBR (Walloon Brabant)
"""
import os
import re
from pathlib import Path
from datetime import datetime, timezone
# Human-readable names for every valid ISO 3166-2:BE subdivision code.
REGION_NAMES = {
    # Regions
    'BRU': 'Brussels Capital Region',
    'VLG': 'Flemish Region (Flanders)',
    'WAL': 'Walloon Region (Wallonia)',
    # Flemish provinces
    'VAN': 'Antwerp Province',
    'VBR': 'Flemish Brabant',
    'VLI': 'Limburg',
    'VOV': 'East Flanders',
    'VWV': 'West Flanders',
    # Walloon provinces
    'WBR': 'Walloon Brabant',
    'WHT': 'Hainaut',
    'WLG': 'Liège',
    'WLX': 'Luxembourg',
    'WNA': 'Namur',
}

# Codes that are already valid and need no change: exactly the codes named above.
VALID_CODES = set(REGION_NAMES)

# Non-standard codes found in the data, keyed to their ISO 3166-2:BE replacement.
CODE_FIXES = dict(
    VL='VLG',  # Flanders (Vlaams Gewest)
    WA='WAL',  # Wallonia (Waals Gewest)
    BW='WBR',  # Walloon Brabant
)
def _render_history_items(new_ghcid: str, old_ghcid: str, timestamp: str,
                          reason: str, old_reason: str, indent: str) -> str:
    """Render the two ``ghcid_history`` list items, each line prefixed by *indent*.

    The first item describes the corrected GHCID (open-ended validity), the
    second closes the validity of the superseded GHCID. The result ends with
    a newline.
    """
    lines = (
        f'- ghcid: {new_ghcid}',
        f'  valid_from: "{timestamp}"',
        '  valid_to: null',
        f'  reason: "{reason}"',
        f'- ghcid: {old_ghcid}',
        '  valid_from: null',
        f'  valid_to: "{timestamp}"',
        f'  reason: "{old_reason}"',
    )
    return ''.join(f'{indent}{line}\n' for line in lines)


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Return *content* with the GHCID and region code corrected.

    The YAML is edited as text (not parsed), so comments and layout survive.

    Args:
        content: Full text of the custodian YAML file.
        old_ghcid: GHCID containing the incorrect region code.
        new_ghcid: GHCID containing the corrected region code.
        old_region: Incorrect region code (without the ``BE-`` prefix).
        new_region: Replacement ISO 3166-2:BE code (without the ``BE-`` prefix).

    Returns:
        The updated text. ``ghcid_current`` and ``identifier_value`` entries
        starting with *old_ghcid* are rewritten, ``region_code`` values equal
        to *old_region* that follow a ``location_resolution:`` or
        ``location:`` key are replaced, and two ``ghcid_history`` items
        (stamped with the current UTC time) are added.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    region_name = REGION_NAMES.get(new_region, 'Unknown')
    reason = (
        f"Corrected region code from BE-{old_region} to BE-{new_region} "
        f"({region_name}) per ISO 3166-2:BE"
    )

    # Callable replacements are used throughout so inserted text is taken
    # literally and never interpreted as a regex replacement template.
    for key in ('ghcid_current', 'identifier_value'):
        content = re.sub(
            r'(' + key + r':\s*)' + re.escape(old_ghcid),
            lambda m: m.group(1) + new_ghcid,
            content,
        )

    # The negative lookahead stops a short code from matching the prefix of a
    # longer one: without it 'VL' matches inside 'VLG' or 'VLI', and the
    # second pass would turn the 'VLG' written by the first pass into 'VLGG'.
    old_region_token = re.escape(old_region) + r'(?![A-Za-z0-9])'
    for section in ('location_resolution', 'location'):
        content = re.sub(
            r'(' + section + r':.*?region_code:\s*)' + old_region_token,
            lambda m: m.group(1) + new_region,
            content,
            flags=re.DOTALL,
        )

    if 'ghcid_history:' in content:
        old_reason = "Previous GHCID with incorrect region code"

        def prepend_items(m):
            # Reuse the indentation of the existing first list item for the
            # new items, then restore it in front of that existing item.
            indent = m.group(2)
            items = _render_history_items(
                new_ghcid, old_ghcid, timestamp, reason, old_reason, indent)
            return m.group(1) + items + indent

        content, inserted = re.subn(
            r'^([ \t]*ghcid_history:[ \t]*\n(?:[ \t]*\n)*)([ \t]*)(?=-\s)',
            prepend_items,
            content,
            flags=re.MULTILINE,
        )

        if not inserted:
            # An empty flow-style list (``ghcid_history: []``) has no item to
            # align with; expand it into a block list at the key's own indent.
            def fill_empty_list(m):
                indent = m.group(1)
                items = _render_history_items(
                    new_ghcid, old_ghcid, timestamp, reason, old_reason, indent)
                return f'{indent}ghcid_history:\n{items}'.rstrip('\n')

            content = re.sub(
                r'^([ \t]*)ghcid_history:[ \t]*\[[ \t]*\][ \t]*$',
                fill_empty_list,
                content,
                flags=re.MULTILINE,
            )
    else:
        old_reason = "Original GHCID with incorrect region code"

        def append_history(m):
            # Add the new key as a sibling of ghcid_current: same indentation,
            # placed after the *whole* ghcid_current line.
            indent = m.group(1)
            items = _render_history_items(
                new_ghcid, old_ghcid, timestamp, reason, old_reason, indent)
            return f'{m.group(0)}\n{indent}ghcid_history:\n{items}'.rstrip('\n')

        content = re.sub(
            r'^([ \t]*)ghcid_current:[ \t]*' + re.escape(new_ghcid) + r'[^\n]*$',
            append_history,
            content,
            flags=re.MULTILINE,
        )

    return content
def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Fix the region code of a single custodian YAML file.

    The GHCID is taken from the file name, which must have the form
    ``BE-<REGION>-<CITY>-<T>-<ABBREV>.yaml``. When the region code is one of
    the known incorrect shorthands, the file content is rewritten and the
    file is renamed to the corrected GHCID.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true, nothing is read, written or deleted; the result
            only reports what would happen.

    Returns:
        A dict with a ``status`` key:

        * ``skipped`` (with ``reason``): name does not match, or the region
          code is already valid.
        * ``error`` (with ``reason``): region code is neither valid nor a
          known shorthand.
        * ``collision``: a file with the corrected name already exists.
        * ``would_fix`` / ``fixed``: the rename is possible / was performed.

        The last three also carry ``old_ghcid`` and ``new_ghcid``.
    """
    # fullmatch: the entire name must be a GHCID file name, so look-alikes
    # such as "BE-VL-ANT-M-ABC.yaml.bak" are not renamed or deleted.
    match = re.fullmatch(r'BE-([A-Z]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filepath.name)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}
    old_region, city, inst_type, abbrev = match.groups()

    if old_region in VALID_CODES:
        return {'status': 'skipped', 'reason': 'already correct'}
    if old_region not in CODE_FIXES:
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}

    new_region = CODE_FIXES[old_region]
    old_ghcid = f"BE-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"BE-{new_region}-{city}-{inst_type}-{abbrev}"
    new_filepath = filepath.with_name(f"{new_ghcid}.yaml")
    ghcids = {'old_ghcid': old_ghcid, 'new_ghcid': new_ghcid}

    # Check for a collision before the dry-run return, so a dry run reports
    # the same collisions the real run would hit.
    if new_filepath.exists() and filepath != new_filepath:
        return {'status': 'collision', **ghcids}

    if dry_run:
        return {'status': 'would_fix', **ghcids}

    # Write the corrected file first and remove the old one only afterwards,
    # so the original stays intact if reading, fixing or writing fails.
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid, old_region, new_region)
    new_filepath.write_text(new_content, encoding='utf-8')
    if filepath != new_filepath:
        filepath.unlink()

    return {'status': 'fixed', **ghcids}
def main():
    """Command-line entry point.

    Options:
        --dry-run  Report what would change without touching any file.
        --dir      Directory holding the custodian YAML files.

    Every ``BE-*.yaml`` file in the directory is passed to ``process_file``
    in sorted order; fixes, collisions and errors are printed as they occur,
    followed by a summary with the totals.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Fix Belgian region codes')
    cli.add_argument('--dry-run', action='store_true')
    cli.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian')
    options = cli.parse_args()

    candidates = sorted(Path(options.dir).glob('BE-*.yaml'))
    print(f"Found {len(candidates)} Belgian files")

    results = {'fixed': [], 'would_fix': [], 'errors': [], 'collisions': []}
    action = 'Would fix' if options.dry_run else 'Fixed'

    for path in candidates:
        outcome = process_file(path, dry_run=options.dry_run)
        status = outcome['status']

        if status == 'error':
            results['errors'].append(outcome)
            print(f" ERROR: {path.name} - {outcome['reason']}")
            continue

        if status == 'collision':
            results['collisions'].append(outcome)
            print(f" COLLISION: {outcome['old_ghcid']} -> {outcome['new_ghcid']}")
            continue

        if status in ('fixed', 'would_fix'):
            results[status].append(outcome)
            print(f" {action}: {outcome['old_ghcid']} -> {outcome['new_ghcid']}")
        # Any other status (e.g. skipped files) is deliberately not reported.

    banner = 'DRY RUN - ' if options.dry_run else ''
    changed = len(results['fixed']) + len(results['would_fix'])
    print(f"\n{banner}Summary:")
    print(f" Fixed/Would fix: {changed}")
    print(f" Collisions: {len(results['collisions'])}")
    print(f" Errors: {len(results['errors'])}")


if __name__ == '__main__':
    main()