#!/usr/bin/env python3
"""
Resolve XX region codes using city names extracted from institution names.

This script handles files without coordinates or Wikidata IDs by:
1. Extracting city names from institution names
2. Looking the city up in the hard-coded per-country tables in this module
3. Mapping to ISO 3166-2 region codes

Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
"""

import os
import sys
import sqlite3
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple

try:
    import yaml
except ImportError:
    # Keep the pure lookup helper importable without PyYAML; every code path
    # that actually reads or writes YAML checks for this and fails loudly.
    yaml = None

# Belgian city name patterns (lower-case city name -> ISO 3166-2:BE suffix)
BELGIAN_CITIES = {
    'brussel': 'BRU', 'bruxelles': 'BRU', 'brussels': 'BRU',
    'antwerpen': 'VAN', 'anvers': 'VAN', 'antwerp': 'VAN',
    'gent': 'VOV', 'ghent': 'VOV', 'gand': 'VOV',
    'brugge': 'VWV', 'bruges': 'VWV',
    'leuven': 'VBR', 'louvain': 'VBR',
    'mechelen': 'VAN', 'malines': 'VAN',
    'hasselt': 'VLI',
    'luik': 'WLG', 'liège': 'WLG', 'liege': 'WLG',
    'charleroi': 'WHT',
    'namur': 'WNA', 'namen': 'WNA',
    'mons': 'WHT', 'bergen': 'WHT',
    'tournai': 'WHT', 'doornik': 'WHT',
    'kortrijk': 'VWV', 'courtrai': 'VWV',
    'oostende': 'VWV', 'ostende': 'VWV',
    'aalst': 'VOV', 'alost': 'VOV',
    'sint-niklaas': 'VOV',
    'dendermonde': 'VOV',
    'genk': 'VLI',
    'roeselare': 'VWV',
    'mouscron': 'WHT', 'moeskroen': 'WHT',
    'tienen': 'VBR', 'tirlemont': 'VBR',
    'ieper': 'VWV', 'ypres': 'VWV',
    'turnhout': 'VAN',
    'waregem': 'VWV',
    'lokeren': 'VOV',
    'beveren': 'VOV',
    'vilvoorde': 'VBR',
    'dilbeek': 'VBR',
    'schoten': 'VAN',
    'brasschaat': 'VAN',
    'boom': 'VAN',
    'mortsel': 'VAN',
    'temse': 'VOV',
    'herzele': 'VOV',
    'brecht': 'VAN',
    'oudenaarde': 'VOV',
    'rotselaar': 'VBR',
    'niel': 'VAN',
    'lint': 'VAN',
    'ravels': 'VAN',
    'bree': 'VLI',
    'peer': 'VLI',
    'meeuwen': 'VLI',
    'gruitrode': 'VLI',
    'arlon': 'WLX', 'aarlen': 'WLX',
    'bastogne': 'WLX', 'bastenaken': 'WLX',
}

# Austrian state codes (state or capital name -> ISO 3166-2:AT suffix)
AUSTRIAN_STATES = {
    'wien': '9', 'vienna': '9',
    'salzburg': '5',
    'tirol': '7', 'tyrol': '7', 'innsbruck': '7',
    'vorarlberg': '8', 'bregenz': '8',
    'kärnten': '2', 'carinthia': '2', 'klagenfurt': '2',
    'steiermark': '6', 'styria': '6', 'graz': '6',
    'oberösterreich': '4', 'upper austria': '4', 'linz': '4',
    'niederösterreich': '3', 'lower austria': '3', 'st. pölten': '3',
    'burgenland': '1', 'eisenstadt': '1',
}

# Bulgarian province codes (city name -> ISO 3166-2:BG suffix)
BULGARIAN_PROVINCES = {
    # 'софия' is the Bulgarian spelling; 'софія' is kept so names that
    # matched before still match.
    'sofia': '22', 'софия': '22', 'софія': '22',
    'plovdiv': '16', 'пловдив': '16',
    'varna': '03', 'варна': '03',
    'burgas': '02', 'бургас': '02',
    'ruse': '18', 'русе': '18',
    'stara zagora': '24',
    'pleven': '15', 'плевен': '15',
}

# Swiss canton codes (abbreviated)
SWISS_CANTONS = {
    'zürich': 'ZH', 'zurich': 'ZH',
    'bern': 'BE', 'berne': 'BE',
    'luzern': 'LU', 'lucerne': 'LU',
    'genève': 'GE', 'geneva': 'GE', 'genf': 'GE',
    'basel': 'BS',
    'lausanne': 'VD',
    'winterthur': 'ZH',
    'st. gallen': 'SG', 'st gallen': 'SG',
    'lugano': 'TI',
    'biel': 'BE', 'bienne': 'BE',
    'thun': 'BE',
    'fribourg': 'FR', 'freiburg': 'FR',
    'schaffhausen': 'SH',
    'chur': 'GR',
    'neuchâtel': 'NE', 'neuchatel': 'NE',
    'sion': 'VS',
    'aarau': 'AG',
    'baden': 'AG',
}

# Country code -> lookup table used by extract_city_from_name().
_CITY_TABLES: Dict[str, Dict[str, str]] = {
    'BE': BELGIAN_CITIES,
    'AT': AUSTRIAN_STATES,
    'BG': BULGARIAN_PROVINCES,
    'CH': SWISS_CANTONS,
}


def _compile_city_patterns(table: Dict[str, str]) -> List[Tuple[Any, str, str]]:
    """Return (whole-word pattern, city, region) triples in table order."""
    # (?<!\w) / (?!\w) instead of \b so keys containing punctuation
    # ('st. pölten', 'sint-niklaas') are bounded correctly on both sides.
    return [
        (re.compile(rf'(?<!\w){re.escape(city)}(?!\w)'), city, region)
        for city, region in table.items()
    ]


# Patterns are compiled once at import time; the tables are constants.
_CITY_MATCHERS: Dict[str, List[Tuple[Any, str, str]]] = {
    country: _compile_city_patterns(table)
    for country, table in _CITY_TABLES.items()
}


def _require_yaml() -> None:
    """Raise ImportError with a clear message when PyYAML is unavailable."""
    if yaml is None:
        raise ImportError(
            "PyYAML is required to read and write custodian files "
            "(pip install pyyaml)"
        )


def extract_city_from_name(name: str, country: str) -> Optional[Tuple[str, str]]:
    """
    Extract city name from institution name.

    The name is lower-cased and searched for each known city of ``country``
    as a whole word, in table order; the first hit wins. Whole-word matching
    prevents short keys from firing inside unrelated words (e.g. 'gent' in
    "agentschap" or 'sion' in "commission"). Inflected or compound forms such
    as "Wiener" are therefore not matched and the file stays unresolved.

    Args:
        name: Institution name to search.
        country: ISO 3166-1 alpha-2 country code; only BE, AT, BG and CH
            have lookup tables.

    Returns:
        (city_name, region_code) with the city title-cased, or None when the
        name is empty / not a string, the country has no table, or no city
        matches.
    """
    if not name or not isinstance(name, str):
        return None

    matchers = _CITY_MATCHERS.get(country)
    if not matchers:
        return None

    name_lower = name.lower()
    for pattern, city, region in matchers:
        if pattern.search(name_lower):
            return (city.title(), region)
    return None


def update_file_with_region(filepath: Path, region_code: str, city_name: str,
                            dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """
    Update a custodian file with resolved region code.

    Sets the region fields under ``ghcid.location_resolution``, rewrites
    ``ghcid.ghcid_current`` (appending a ``ghcid_history`` entry when it
    changes), appends a provenance note and, unless ``dry_run`` is true,
    writes the file back and renames it so the filename carries the new
    region code.

    Args:
        filepath: Path of the custodian YAML file.
        region_code: Region code that replaces ``XX``.
        city_name: City the region was derived from (stored as region name).
        dry_run: When true, compute the outcome but touch nothing on disk.

    Returns:
        (True, new_path) when the file is (or would be) updated and renamed,
        (True, None) when it is updated without a filename change, and
        (False, None) when the file is unreadable, lacks the expected
        structure, already has a region, or the rename target already exists.

    Raises:
        ImportError: If PyYAML is not installed.
    """
    _require_yaml()

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False, None

    # safe_load returns None for an empty file and may return non-mappings.
    if not isinstance(data, dict):
        return False, None

    ghcid = data.get('ghcid')
    if not isinstance(ghcid, dict):
        return False, None

    loc_res = ghcid.get('location_resolution')
    if not isinstance(loc_res, dict):
        return False, None

    country_code = loc_res.get('country_code', '')
    if not country_code:
        return False, None

    old_region = loc_res.get('region_code', 'XX')
    if old_region != 'XX':
        return False, None

    # Determine new filename
    new_filename = filepath.name.replace(f'{country_code}-XX-',
                                         f'{country_code}-{region_code}-')
    new_filepath = filepath.parent / new_filename
    needs_rename = new_filepath != filepath

    # Refuse to proceed when the rename target is taken: writing the new
    # GHCID into a file that keeps its XX filename would leave the two out
    # of sync, and another record already owns the target name.
    if needs_rename and new_filepath.exists():
        print(f" Skipped: target {new_filepath.name} already exists")
        return False, None

    # One timestamp for every field written by this update.
    now = datetime.now(timezone.utc)
    now_iso = now.isoformat()

    # Update location resolution
    loc_res['region_code'] = region_code
    loc_res['region_name'] = city_name
    loc_res['method'] = 'NAME_LOOKUP'
    loc_res['resolution_timestamp'] = now_iso

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current') or ''
    new_ghcid = old_ghcid.replace(f'{country_code}-XX-',
                                  f'{country_code}-{region_code}-')
    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid
        if ghcid.get('ghcid_history') is None:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': now_iso,
            'reason': f"Region resolved via name lookup: XX->{region_code} (city: {city_name})"
        })

    # Add provenance note
    if data.get('provenance') is None:
        data['provenance'] = {}
    provenance = data['provenance']
    notes = provenance.get('notes')
    if notes is None:
        provenance['notes'] = []
    elif isinstance(notes, str):
        # Preserve an existing single-string note as the first list item.
        provenance['notes'] = [notes]
    provenance['notes'].append(
        f"Region resolved {now.strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"XX->{region_code} via name lookup (city: {city_name})"
    )

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False,
                      allow_unicode=True, sort_keys=False)
        if needs_rename:
            filepath.rename(new_filepath)

    return True, new_filepath if needs_rename else None


def _institution_name(data: Dict[str, Any]) -> Optional[Any]:
    """Return the institution name from a loaded custodian record, if any."""
    name = None
    custodian = data.get('custodian_name')
    if isinstance(custodian, dict):
        name = custodian.get('claim_value')
    if not name:
        original = data.get('original_entry')
        if isinstance(original, dict):
            name = original.get('name')
    return name or None


def main():
    """Main entry point: parse arguments, scan for XX files, resolve them."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Resolve XX region codes using city names from institution names'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')
    args = parser.parse_args()

    if yaml is None:
        print("Error: PyYAML is required (pip install pyyaml)")
        sys.exit(1)

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("REGION RESOLUTION VIA NAME LOOKUP")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Find files with XX region codes. Sorted so that --limit selects the
    # same files on every run (glob order is filesystem-dependent).
    files_to_process = sorted(custodian_dir.glob('*-XX-*.yaml'))
    print(f"Found {len(files_to_process)} files with XX region codes")

    # Load files and extract institution names. The limit counts files that
    # pass the country/name filters, so --country combined with --limit is
    # not starved by unrelated files at the front of the listing.
    file_data = []
    for filepath in files_to_process:
        if len(file_data) >= args.limit:
            break

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading {filepath}: {e}")
            continue

        if not isinstance(data, dict):
            continue

        # Get country code
        ghcid = data.get('ghcid')
        loc_res = ghcid.get('location_resolution') if isinstance(ghcid, dict) else None
        country = loc_res.get('country_code') if isinstance(loc_res, dict) else None
        if not country:
            continue
        if args.country and country != args.country:
            continue

        # Get institution name
        name = _institution_name(data)
        if not name:
            continue

        file_data.append({
            'filepath': filepath,
            'data': data,
            'country': country,
            'name': name
        })

    print(f"Processing {len(file_data)} files with institution names")
    print()

    # Process each file
    resolved = 0
    renamed = 0
    no_match = 0

    for entry in file_data:
        filepath = entry['filepath']
        name = entry['name']
        country = entry['country']

        # Try to extract city from name
        result = extract_city_from_name(name, country)
        if not result:
            no_match += 1
            continue

        city_name, region_code = result
        print(f"Processing {filepath.name}...")
        print(f" Name: {name}")
        print(f" City: {city_name} -> Region: {region_code}")

        # Update file
        success, new_path = update_file_with_region(filepath, region_code,
                                                    city_name, dry_run=dry_run)
        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f" {filepath.name} -> {new_path.name}")

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No city match: {no_match}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()