#!/usr/bin/env python3
"""
Fix Belgian region codes in custodian YAML files.

Valid ISO 3166-2:BE codes:
- Regions: BRU (Brussels), VLG (Flanders), WAL (Wallonia)
- Provinces: VAN (Antwerp), VBR (Flemish Brabant), VLI (Limburg),
  VOV (East Flanders), VWV (West Flanders), WBR (Walloon Brabant),
  WHT (Hainaut), WLG (Liège), WLX (Luxembourg), WNA (Namur)

Issues to fix:
- VL -> VLG (Flanders region shorthand)
- WA -> WAL (Wallonia shorthand)
- BW -> WBR (Walloon Brabant)
"""

import os
import re
from pathlib import Path
from datetime import datetime, timezone

# Mapping from incorrect codes to ISO 3166-2:BE codes
CODE_FIXES = {
    'VL': 'VLG',  # Flanders (Vlaams Gewest)
    'WA': 'WAL',  # Wallonia (Waals Gewest)
    'BW': 'WBR',  # Walloon Brabant
}

# Valid ISO codes (no change needed)
VALID_CODES = {
    'BRU', 'VLG', 'WAL', 'VAN', 'VBR', 'VLI', 'VOV', 'VWV',
    'WBR', 'WHT', 'WLG', 'WLX', 'WNA',
}

REGION_NAMES = {
    'BRU': 'Brussels Capital Region',
    'VLG': 'Flemish Region (Flanders)',
    'WAL': 'Walloon Region (Wallonia)',
    'VAN': 'Antwerp Province',
    'VBR': 'Flemish Brabant',
    'VLI': 'Limburg',
    'VOV': 'East Flanders',
    'VWV': 'West Flanders',
    'WBR': 'Walloon Brabant',
    'WHT': 'Hainaut',
    'WLG': 'Liège',
    'WLX': 'Luxembourg',
    'WNA': 'Namur',
}


def _history_entries(new_ghcid: str, old_ghcid: str, timestamp: str,
                     reason: str, old_reason: str, indent: str) -> str:
    """Render the two ghcid_history list items, every line prefixed by *indent*.

    The first item describes the new GHCID (valid from *timestamp*), the
    second closes the old GHCID (valid to *timestamp*). The result ends with
    a newline.
    """
    lines = [
        f'- ghcid: {new_ghcid}',
        f'  valid_from: "{timestamp}"',
        '  valid_to: null',
        f'  reason: "{reason}"',
        f'- ghcid: {old_ghcid}',
        '  valid_from: null',
        f'  valid_to: "{timestamp}"',
        f'  reason: "{old_reason}"',
    ]
    return ''.join(f'{indent}{line}\n' for line in lines)


def _add_history(content: str, new_ghcid: str, old_ghcid: str,
                 timestamp: str, reason: str) -> str:
    """Record the GHCID change under ``ghcid_history`` in *content*.

    Indentation is taken from the surrounding YAML text instead of being
    hard-coded, so the inserted items line up with whatever the file uses.
    """
    if 'ghcid_history:' not in content:
        # No history yet: add the key as a sibling right below the
        # (already rewritten) ghcid_current line, reusing that line's indent.
        anchor = re.compile(
            r'^([ \t]*)ghcid_current:[ \t]*' + re.escape(new_ghcid) + r'[^\n]*',
            re.MULTILINE,
        )

        def add_key(m):
            indent = m.group(1)
            block = _history_entries(
                new_ghcid, old_ghcid, timestamp, reason,
                'Original GHCID with incorrect region code', indent + '  ',
            ).rstrip('\n')
            return f'{m.group(0)}\n{indent}ghcid_history:\n{block}'

        return anchor.sub(add_key, content, count=1)

    # History key followed by a block list: prepend the new items using the
    # indentation of the first existing item. Blank and comment-only lines
    # between the key and the first item are skipped over.
    existing = re.compile(
        r'(ghcid_history:[ \t]*(?:#[^\n]*)?\n(?:[ \t]*(?:#[^\n]*)?\n)*)'
        r'([ \t]*)(?=-[ \t])'
    )
    if existing.search(content):
        def prepend(m):
            indent = m.group(2)
            items = _history_entries(
                new_ghcid, old_ghcid, timestamp, reason,
                'Previous GHCID with incorrect region code', indent,
            )
            # Re-emit the indent so the first pre-existing item stays aligned.
            return m.group(1) + items + indent

        return existing.sub(prepend, content, count=1)

    # History key present but without block-list items (empty, null or []):
    # populate it in place rather than adding a duplicate key.
    empty = re.compile(
        r'^([ \t]*)ghcid_history:[ \t]*(?:\[[ \t]*\]|null|~)?[ \t]*$',
        re.MULTILINE,
    )

    def populate(m):
        indent = m.group(1)
        block = _history_entries(
            new_ghcid, old_ghcid, timestamp, reason,
            'Original GHCID with incorrect region code', indent + '  ',
        ).rstrip('\n')
        return f'{indent}ghcid_history:\n{block}'

    return empty.sub(populate, content, count=1)


def fix_yaml_content(content: str, old_ghcid: str, new_ghcid: str,
                     old_region: str, new_region: str) -> str:
    """Fix the YAML content with new GHCID and region codes.

    Args:
        content: Full text of the custodian YAML file.
        old_ghcid: GHCID carrying the incorrect region code.
        new_ghcid: GHCID carrying the corrected region code.
        old_region: Incorrect region code (e.g. ``VL``).
        new_region: Corrected ISO 3166-2:BE code (e.g. ``VLG``).

    Returns:
        The updated text: ``ghcid_current`` and ``identifier_value`` entries
        equal to *old_ghcid* are rewritten, ``region_code`` values under
        ``location_resolution`` / ``location`` are corrected, and two
        ``ghcid_history`` items documenting the change are added.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    region_name = REGION_NAMES.get(new_region, 'Unknown')
    reason = (
        f"Corrected region code from BE-{old_region} to BE-{new_region} "
        f"({region_name}) per ISO 3166-2:BE"
    )

    # Replace the GHCID in ghcid_current and in identifiers. A function
    # replacement is used so the new value is inserted literally and never
    # parsed as a regex replacement template.
    for key in ('ghcid_current', 'identifier_value'):
        content = re.sub(
            r'(' + key + r':\s*)' + re.escape(old_ghcid),
            lambda m: m.group(1) + new_ghcid,
            content,
        )

    # Replace region_code in the location_resolution and location sections.
    # The negative lookahead stops the short code from matching the prefix of
    # an already-correct one (VL inside VLG), which would yield "VLGG".
    for section in ('location_resolution', 'location'):
        content = re.sub(
            r'(' + section + r':.*?region_code:\s*)'
            + re.escape(old_region) + r'(?![A-Za-z0-9])',
            lambda m: m.group(1) + new_region,
            content,
            flags=re.DOTALL,
        )

    return _add_history(content, new_ghcid, old_ghcid, timestamp, reason)


def process_file(filepath: Path, dry_run: bool = False) -> dict:
    """Process a single YAML file.

    The region code is read from the file name
    (``BE-<REGION>-<CITY>-<TYPE>-<ABBREV>.yaml``). When it is one of the
    known incorrect codes, the content is fixed, written to a file named
    after the new GHCID, and the old file is removed.

    Args:
        filepath: Path of the custodian YAML file.
        dry_run: When true, report what would happen without touching disk.

    Returns:
        A dict with a ``status`` key: ``skipped`` or ``error`` (both with a
        ``reason``), or ``collision``, ``would_fix`` or ``fixed`` (each with
        ``old_ghcid`` and ``new_ghcid``).
    """
    filename = filepath.name

    # Extract current GHCID parts from the file name
    match = re.fullmatch(r'BE-([A-Z]+)-([A-Z]+)-([A-Z])-(.+)\.yaml', filename)
    if not match:
        return {'status': 'skipped', 'reason': 'filename pattern mismatch'}

    old_region, city, inst_type, abbrev = match.groups()

    # Check if code needs fixing
    if old_region in VALID_CODES:
        return {'status': 'skipped', 'reason': 'already correct'}
    if old_region not in CODE_FIXES:
        return {'status': 'error', 'reason': f'unknown region code: {old_region}'}

    new_region = CODE_FIXES[old_region]
    old_ghcid = f"BE-{old_region}-{city}-{inst_type}-{abbrev}"
    new_ghcid = f"BE-{new_region}-{city}-{inst_type}-{abbrev}"
    new_filepath = filepath.parent / f"{new_ghcid}.yaml"
    outcome = {'old_ghcid': old_ghcid, 'new_ghcid': new_ghcid}

    # Check for collision before the dry-run return, so a dry run reports
    # the same collisions a real run would hit.
    if new_filepath.exists() and filepath != new_filepath:
        return {'status': 'collision', **outcome}

    if dry_run:
        return {'status': 'would_fix', **outcome}

    # Read, fix, and write; the old file is removed only after the new one
    # has been written.
    content = filepath.read_text(encoding='utf-8')
    new_content = fix_yaml_content(content, old_ghcid, new_ghcid,
                                   old_region, new_region)
    new_filepath.write_text(new_content, encoding='utf-8')
    if filepath != new_filepath:
        filepath.unlink()

    return {'status': 'fixed', **outcome}


def main() -> None:
    """Command-line entry point: fix every ``BE-*.yaml`` file in ``--dir``."""
    import argparse

    parser = argparse.ArgumentParser(description='Fix Belgian region codes')
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--dir', default='/Users/kempersc/apps/glam/data/custodian')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    if not custodian_dir.is_dir():
        # Fail loudly instead of silently reporting zero files.
        parser.error(f"not a directory: {custodian_dir}")

    be_files = list(custodian_dir.glob('BE-*.yaml'))
    print(f"Found {len(be_files)} Belgian files")

    results = {'fixed': [], 'would_fix': [], 'errors': [], 'collisions': []}

    for filepath in sorted(be_files):
        result = process_file(filepath, dry_run=args.dry_run)
        status = result['status']

        if status in ('fixed', 'would_fix'):
            results[status].append(result)
            action = 'Would fix' if args.dry_run else 'Fixed'
            print(f"  {action}: {result['old_ghcid']} -> {result['new_ghcid']}")
        elif status == 'collision':
            results['collisions'].append(result)
            print(f"  COLLISION: {result['old_ghcid']} -> {result['new_ghcid']}")
        elif status == 'error':
            results['errors'].append(result)
            print(f"  ERROR: {filepath.name} - {result['reason']}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f"  Fixed/Would fix: {len(results['fixed']) + len(results['would_fix'])}")
    print(f"  Collisions: {len(results['collisions'])}")
    print(f"  Errors: {len(results['errors'])}")


if __name__ == '__main__':
    main()