#!/usr/bin/env python3
"""Resolve XX region codes and XXX settlement codes using GeoNames reverse geocoding.

This script:
1. Finds files with XX region or XXX settlement codes
2. Extracts coordinates from the file or queries Wikidata P625
3. Uses GeoNames database for reverse geocoding
4. Updates files with resolved region and settlement codes
5. Renames files to match new GHCID

Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
- GHCID GeoNames rules: Filter by feature_code (exclude PPLX neighborhoods)
"""

import json
import math
import os
import re
import sqlite3
import sys
import time
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import yaml

# GeoNames admin1 code to ISO 3166-2 mapping
# Format: country_code: {geonames_admin1: iso_region_code}
ADMIN1_TO_ISO = {
    'FR': {
        '11': 'IDF',  # Île-de-France
        '24': 'CVL',  # Centre-Val de Loire
        '27': 'BFC',  # Bourgogne-Franche-Comté
        '28': 'NOR',  # Normandy
        '32': 'HDF',  # Hauts-de-France
        '44': 'GES',  # Grand Est
        '52': 'PDL',  # Pays de la Loire
        '53': 'BRE',  # Brittany
        '75': 'NAQ',  # Nouvelle-Aquitaine
        '76': 'OCC',  # Occitanie
        '84': 'ARA',  # Auvergne-Rhône-Alpes
        '93': 'PAC',  # Provence-Alpes-Côte d'Azur
        '94': 'COR',  # Corsica
    },
    'DE': {
        '01': 'BW',  # Baden-Württemberg
        '02': 'BY',  # Bavaria
        '03': 'BE',  # Berlin
        '04': 'BB',  # Brandenburg
        '05': 'HB',  # Bremen
        '06': 'HH',  # Hamburg
        '07': 'HE',  # Hesse
        '08': 'MV',  # Mecklenburg-Vorpommern
        '09': 'NI',  # Lower Saxony
        '10': 'NW',  # North Rhine-Westphalia
        '11': 'RP',  # Rhineland-Palatinate
        '12': 'SL',  # Saarland
        '13': 'SN',  # Saxony
        '14': 'ST',  # Saxony-Anhalt
        '15': 'SH',  # Schleswig-Holstein
        '16': 'TH',  # Thuringia
    },
    # US admin1 codes are already the postal/ISO codes; identity mapping.
    'US': {
        'AL': 'AL', 'AK': 'AK', 'AZ': 'AZ', 'AR': 'AR', 'CA': 'CA',
        'CO': 'CO', 'CT': 'CT', 'DE': 'DE', 'FL': 'FL', 'GA': 'GA',
        'HI': 'HI', 'ID': 'ID', 'IL': 'IL', 'IN': 'IN', 'IA': 'IA',
        'KS': 'KS', 'KY': 'KY', 'LA': 'LA', 'ME': 'ME', 'MD': 'MD',
        'MA': 'MA', 'MI': 'MI', 'MN': 'MN', 'MS': 'MS', 'MO': 'MO',
        'MT': 'MT', 'NE': 'NE', 'NV': 'NV', 'NH': 'NH', 'NJ': 'NJ',
        'NM': 'NM', 'NY': 'NY', 'NC': 'NC', 'ND': 'ND', 'OH': 'OH',
        'OK': 'OK', 'OR': 'OR', 'PA': 'PA', 'RI': 'RI', 'SC': 'SC',
        'SD': 'SD', 'TN': 'TN', 'TX': 'TX', 'UT': 'UT', 'VT': 'VT',
        'VA': 'VA', 'WA': 'WA', 'WV': 'WV', 'WI': 'WI', 'WY': 'WY',
        'DC': 'DC',
    },
    'GB': {
        'ENG': 'ENG',
        'NIR': 'NIR',
        'SCT': 'SCT',
        'WLS': 'WLS',
    },
    'AU': {
        '01': 'ACT',  # Australian Capital Territory
        '02': 'NSW',  # New South Wales
        '03': 'NT',   # Northern Territory
        '04': 'QLD',  # Queensland
        '05': 'SA',   # South Australia
        '06': 'TAS',  # Tasmania
        '07': 'VIC',  # Victoria
        '08': 'WA',   # Western Australia
    },
    'CA': {
        '01': 'AB',  # Alberta
        '02': 'BC',  # British Columbia
        '03': 'MB',  # Manitoba
        '04': 'NB',  # New Brunswick
        '05': 'NL',  # Newfoundland and Labrador
        '07': 'NS',  # Nova Scotia
        '08': 'ON',  # Ontario
        '09': 'PE',  # Prince Edward Island
        '10': 'QC',  # Quebec
        '11': 'SK',  # Saskatchewan
        '12': 'YT',  # Yukon
        '13': 'NT',  # Northwest Territories
        '14': 'NU',  # Nunavut
    },
    # NOTE(review): JP codes below are project-specific 3-letter codes, not
    # ISO 3166-2:JP (which is numeric). '14' Kanagawa and '37' Kagawa both
    # map to 'KGW' — ambiguous, but left unchanged because existing GHCIDs
    # may depend on it. TODO: confirm intended codes with the project owners.
    'JP': {
        '01': 'HKD',  # Hokkaido
        '02': 'AOM',  # Aomori
        '03': 'IWT',  # Iwate
        '04': 'MYG',  # Miyagi
        '05': 'AKT',  # Akita
        '06': 'YGT',  # Yamagata
        '07': 'FKS',  # Fukushima
        '08': 'IBR',  # Ibaraki
        '09': 'TCG',  # Tochigi
        '10': 'GNM',  # Gunma
        '11': 'SIT',  # Saitama
        '12': 'CHB',  # Chiba
        '13': 'TKY',  # Tokyo
        '14': 'KGW',  # Kanagawa
        '15': 'NGT',  # Niigata
        '16': 'TYM',  # Toyama
        '17': 'ISK',  # Ishikawa
        '18': 'FKI',  # Fukui
        '19': 'YMN',  # Yamanashi
        '20': 'NGN',  # Nagano
        '21': 'GFU',  # Gifu
        '22': 'SZO',  # Shizuoka
        '23': 'AIC',  # Aichi
        '24': 'MIE',  # Mie
        '25': 'SIG',  # Shiga
        '26': 'KYO',  # Kyoto
        '27': 'OSK',  # Osaka
        '28': 'HYG',  # Hyogo
        '29': 'NAR',  # Nara
        '30': 'WKY',  # Wakayama
        '31': 'TTR',  # Tottori
        '32': 'SMN',  # Shimane
        '33': 'OKY',  # Okayama
        '34': 'HIR',  # Hiroshima
        '35': 'YGC',  # Yamaguchi
        '36': 'TKS',  # Tokushima
        '37': 'KGW',  # Kagawa
        '38': 'EHM',  # Ehime
        '39': 'KOC',  # Kochi
        '40': 'FKO',  # Fukuoka
        '41': 'SAG',  # Saga
        '42': 'NGS',  # Nagasaki
        '43': 'KMM',  # Kumamoto
        '44': 'OIT',  # Oita
        '45': 'MYZ',  # Miyazaki
        '46': 'KGS',  # Kagoshima
        '47': 'OKN',  # Okinawa
    },
    'CN': {
        '01': 'AH',  # Anhui
        '02': 'ZJ',  # Zhejiang
        '03': 'JX',  # Jiangxi
        '04': 'JS',  # Jiangsu
        '05': 'JL',  # Jilin
        '06': 'QH',  # Qinghai
        '07': 'FJ',  # Fujian
        '08': 'HL',  # Heilongjiang
        # FIX: previously Henan→HN, Hebei→HB, Hunan→HN (duplicate!) and
        # Hubei→HA. Corrected to ISO 3166-2:CN: HA=Henan, HE=Hebei,
        # HN=Hunan, HB=Hubei.
        '09': 'HA',  # Henan
        '10': 'HE',  # Hebei
        '11': 'HN',  # Hunan
        '12': 'HB',  # Hubei
        '13': 'XZ',  # Tibet
        '14': 'XJ',  # Xinjiang
        '15': 'NX',  # Ningxia
        '16': 'NM',  # Inner Mongolia
        '18': 'SD',  # Shandong
        '19': 'SX',  # Shanxi
        '20': 'SN',  # Shaanxi
        '21': 'TJ',  # Tianjin
        '22': 'BJ',  # Beijing
        '23': 'SH',  # Shanghai
        '24': 'HI',  # Hainan
        '25': 'CQ',  # Chongqing
        '26': 'GS',  # Gansu
        '28': 'GX',  # Guangxi
        '29': 'SC',  # Sichuan
        '30': 'GD',  # Guangdong
        '31': 'YN',  # Yunnan
        '32': 'GZ',  # Guizhou
        '33': 'LN',  # Liaoning
    },
    'KR': {
        '01': 'SO',  # Seoul
        '02': 'BS',  # Busan
        '03': 'TG',  # Daegu
        '04': 'IN',  # Incheon
        '05': 'GJ',  # Gwangju
        '06': 'DJ',  # Daejeon
        '07': 'US',  # Ulsan
        '08': 'SJ',  # Sejong
        '10': 'KG',  # Gyeonggi
        '11': 'KW',  # Gangwon
        '12': 'CB',  # North Chungcheong
        '13': 'CN',  # South Chungcheong
        '14': 'JB',  # North Jeolla
        '15': 'JN',  # South Jeolla
        '16': 'KB',  # North Gyeongsang
        '17': 'KN',  # South Gyeongsang
        '18': 'JJ',  # Jeju
    },
    'BR': {
        '01': 'AC',  # Acre
        '02': 'AL',  # Alagoas
        '03': 'AP',  # Amapá
        '04': 'AM',  # Amazonas
        '05': 'BA',  # Bahia
        '06': 'CE',  # Ceará
        '07': 'DF',  # Distrito Federal
        '08': 'ES',  # Espírito Santo
        '11': 'MS',  # Mato Grosso do Sul
        '14': 'RN',  # Rio Grande do Norte
        '16': 'RS',  # Rio Grande do Sul
        '17': 'RJ',  # Rio de Janeiro
        '18': 'RO',  # Rondônia
        '19': 'RR',  # Roraima
        '20': 'SC',  # Santa Catarina
        '21': 'GO',  # Goiás
        '22': 'MA',  # Maranhão
        '23': 'MT',  # Mato Grosso
        '24': 'MG',  # Minas Gerais
        '25': 'PA',  # Pará
        '26': 'PB',  # Paraíba
        '27': 'SP',  # São Paulo
        '28': 'SE',  # Sergipe
        '29': 'TO',  # Tocantins
        '30': 'PE',  # Pernambuco
        '31': 'PI',  # Piauí
        '32': 'PR',  # Paraná
    },
    'AT': {
        '01': '1',  # Burgenland (AT-1)
        '02': '2',  # Carinthia (AT-2)
        '03': '3',  # Lower Austria (AT-3)
        '04': '4',  # Upper Austria (AT-4)
        '05': '5',  # Salzburg (AT-5)
        '06': '6',  # Styria (AT-6)
        '07': '7',  # Tyrol (AT-7)
        '08': '8',  # Vorarlberg (AT-8)
        '09': '9',  # Vienna (AT-9)
    },
    'AR': {
        '01': 'B',  # Buenos Aires Province
        '02': 'K',  # Catamarca
        '03': 'H',  # Chaco
        '04': 'U',  # Chubut
        '05': 'C',  # Ciudad Autónoma de Buenos Aires
        '06': 'X',  # Córdoba
        '07': 'W',  # Corrientes
        '08': 'E',  # Entre Ríos
        '09': 'P',  # Formosa
        '10': 'Y',  # Jujuy
        '11': 'L',  # La Pampa
        '12': 'F',  # La Rioja
        '13': 'M',  # Mendoza
        '14': 'N',  # Misiones
        '15': 'Q',  # Neuquén
        '16': 'R',  # Río Negro
        '17': 'A',  # Salta
        '18': 'J',  # San Juan
        '19': 'D',  # San Luis
        '20': 'Z',  # Santa Cruz
        '21': 'S',  # Santa Fe
        '22': 'G',  # Santiago del Estero
        '23': 'V',  # Tierra del Fuego
        '24': 'T',  # Tucumán
    },
    'NL': {
        '01': 'DR',  # Drenthe
        '02': 'FR',  # Friesland
        '03': 'GE',  # Gelderland
        '04': 'GR',  # Groningen
        '05': 'LI',  # Limburg
        '06': 'NB',  # Noord-Brabant
        '07': 'NH',  # Noord-Holland
        '09': 'UT',  # Utrecht
        '10': 'ZE',  # Zeeland
        '11': 'ZH',  # Zuid-Holland
        '15': 'OV',  # Overijssel
        '16': 'FL',  # Flevoland
    },
    'BE': {
        'BRU': 'BRU',  # Brussels-Capital Region
        'VAN': 'VAN',  # Antwerp
        'VBR': 'VBR',  # Flemish Brabant
        'VLI': 'VLI',  # Limburg (BE)
        'VOV': 'VOV',  # East Flanders
        'VWV': 'VWV',  # West Flanders
        'WAL': 'WAL',  # Wallonia (general)
        'WBR': 'WBR',  # Walloon Brabant
        'WHT': 'WHT',  # Hainaut
        'WLG': 'WLG',  # Liège
        'WLX': 'WLX',  # Luxembourg (BE)
        'WNA': 'WNA',  # Namur
    },
    'CH': {
        '01': 'AG',  # Aargau
        '02': 'AI',  # Appenzell Innerrhoden
        '03': 'AR',  # Appenzell Ausserrhoden
        '04': 'BE',  # Bern
        '05': 'BL',  # Basel-Landschaft
        '06': 'BS',  # Basel-Stadt
        '07': 'FR',  # Fribourg
        '08': 'GE',  # Geneva
        '09': 'GL',  # Glarus
        '10': 'GR',  # Graubünden
        '11': 'JU',  # Jura
        '12': 'LU',  # Lucerne
        '13': 'NE',  # Neuchâtel
        '14': 'NW',  # Nidwalden
        '15': 'OW',  # Obwalden
        '16': 'SG',  # St. Gallen
        '17': 'SH',  # Schaffhausen
        '18': 'SO',  # Solothurn
        '19': 'SZ',  # Schwyz
        '20': 'TG',  # Thurgau
        '21': 'TI',  # Ticino
        '22': 'UR',  # Uri
        '23': 'VD',  # Vaud
        '24': 'VS',  # Valais
        '25': 'ZG',  # Zug
        '26': 'ZH',  # Zürich
    },
    'ES': {
        '51': 'AN',  # Andalusia
        '52': 'AR',  # Aragon
        '53': 'AS',  # Asturias
        '54': 'CN',  # Canary Islands
        '55': 'CB',  # Cantabria
        '56': 'CM',  # Castile-La Mancha
        '57': 'CL',  # Castile and León
        '58': 'CT',  # Catalonia
        '59': 'EX',  # Extremadura
        '60': 'GA',  # Galicia
        '29': 'IB',  # Balearic Islands
        '31': 'RI',  # La Rioja
        '32': 'MD',  # Madrid
        '33': 'MC',  # Murcia
        '34': 'NC',  # Navarre
        '35': 'PV',  # Basque Country
        '36': 'VC',  # Valencia
        '37': 'CE',  # Ceuta
        '38': 'ML',  # Melilla
    },
    'IT': {
        '01': 'PIE',  # Piedmont
        '02': 'VDA',  # Aosta Valley
        '03': 'LOM',  # Lombardy
        '04': 'TAA',  # Trentino-Alto Adige
        '05': 'VEN',  # Veneto
        '06': 'FVG',  # Friuli-Venezia Giulia
        '07': 'LIG',  # Liguria
        '08': 'EMR',  # Emilia-Romagna
        '09': 'TOS',  # Tuscany
        '10': 'UMB',  # Umbria
        '11': 'MAR',  # Marche
        '12': 'LAZ',  # Lazio
        '13': 'ABR',  # Abruzzo
        '14': 'MOL',  # Molise
        '15': 'CAM',  # Campania
        '16': 'PUG',  # Apulia
        '17': 'BAS',  # Basilicata
        '18': 'CAL',  # Calabria
        '19': 'SIC',  # Sicily
        '20': 'SAR',  # Sardinia
    },
    # City-states (use country code as region)
    'SG': {},  # Singapore → fallback to 'SG'
    'MC': {},  # Monaco → fallback to 'MC'
    'VA': {},  # Vatican → fallback to 'VA'
    'LU': {},  # Luxembourg → fallback to 'LU' (sometimes)
}

# Valid feature codes for settlements (exclude PPLX neighborhoods).
# Single source of truth: the SQL filter in reverse_geocode() is built
# from this set.
VALID_FEATURE_CODES = {'PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC',
                       'PPLS', 'PPLG'}


def get_geonames_connection(db_path: str = 'data/reference/geonames.db') -> sqlite3.Connection:
    """Get a connection to the GeoNames database."""
    return sqlite3.connect(db_path)


def reverse_geocode(lat: float, lon: float, country_code: str,
                    conn: sqlite3.Connection) -> Optional[Dict[str, Any]]:
    """Reverse geocode coordinates to find nearest city using GeoNames.

    Following AGENTS.md: Filter by feature_code to exclude PPLX (neighborhoods).

    Returns a dict with GeoNames metadata and an approximate distance_km,
    or None when the country has no city with a valid feature code.
    """
    # A degree of longitude shrinks by cos(latitude); weight the longitude
    # delta accordingly so the nearest-neighbour metric is roughly isotropic.
    # (The previous unweighted metric over-penalized east/west distance at
    # high latitudes and could pick the wrong city.)
    cos_lat = math.cos(math.radians(lat))

    # Build the feature-code filter from VALID_FEATURE_CODES (sorted for a
    # deterministic query) instead of duplicating the list inline.
    feature_codes = tuple(sorted(VALID_FEATURE_CODES))
    placeholders = ', '.join('?' * len(feature_codes))
    query = f"""
        SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) +
                (longitude - ?) * (longitude - ?) * ?) as distance_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ({placeholders})
        ORDER BY distance_sq
        LIMIT 1
    """
    params = (lat, lat, lon, lon, cos_lat * cos_lat, country_code) + feature_codes
    row = conn.execute(query, params).fetchone()
    if not row:
        return None

    # Rough conversion: 1 degree of latitude ≈ 111 km (longitude already
    # scaled by cos(lat) inside distance_sq).
    distance_km = math.sqrt(row[9]) * 111

    return {
        'geonames_id': row[0],
        'name': row[1],
        'ascii_name': row[2],
        'admin1_code': row[3],
        'admin1_name': row[4],
        'latitude': row[5],
        'longitude': row[6],
        'population': row[7],
        'feature_code': row[8],
        'distance_km': distance_km,
    }


def generate_city_code(name: str) -> str:
    """Generate 3-letter city code from name.

    Rules from AGENTS.md:
    - Single word: First 3 letters
    - Dutch article: Article initial + 2 from main word
    - Multi-word: Initials (up to 3)

    Returns 'XXX' for an empty/whitespace-only name (previously raised
    IndexError).
    """
    # Normalize
    name = name.strip()

    # Dutch/French articles to check
    articles = ["'s", "de", "den", "het", "la", "le", "les", "l'", "the"]

    words = name.split()
    if not words:
        # Empty name: keep the unresolved placeholder.
        return 'XXX'

    if len(words) == 1:
        # Single word - first 3 letters
        return name[:3].upper()

    # Check for leading article
    if words[0].lower() in articles:
        if len(words) >= 2:
            # Article initial + 2 from main word
            return (words[0][0] + words[1][:2]).upper()

    # Multi-word: initials
    initials = ''.join(w[0] for w in words if w[0].isalpha())[:3]
    return initials.upper()


def get_iso_region_code(country_code: str, admin1_code: str) -> str:
    """Map GeoNames admin1_code to ISO 3166-2 region code."""
    # City-states: use country code as region code
    city_states = {'SG', 'MC', 'VA', 'AD', 'SM', 'LI', 'MT', 'BH', 'QA', 'KW'}
    if country_code in city_states:
        return country_code

    if country_code in ADMIN1_TO_ISO:
        mapping = ADMIN1_TO_ISO[country_code]
        if admin1_code in mapping:
            return mapping[admin1_code]

    # Fallback: use admin1_code directly (works for many countries)
    return admin1_code if admin1_code else 'XX'


def extract_coordinates(data: Dict[str, Any]) -> Optional[Tuple[float, float]]:
    """Extract (latitude, longitude) from custodian data, or None."""
    # Try locations array
    if 'locations' in data and data['locations']:
        loc = data['locations'][0]
        if 'latitude' in loc and 'longitude' in loc:
            return (float(loc['latitude']), float(loc['longitude']))

    # Try ghcid.location_resolution.source_coordinates
    if 'ghcid' in data:
        loc_res = data['ghcid'].get('location_resolution', {})
        src_coords = loc_res.get('source_coordinates', {})
        if 'latitude' in src_coords and 'longitude' in src_coords:
            return (float(src_coords['latitude']), float(src_coords['longitude']))

    # Try wikidata_enrichment coordinates
    if 'wikidata_enrichment' in data:
        wd = data['wikidata_enrichment']
        if 'coordinates' in wd:
            coords = wd['coordinates']
            if 'latitude' in coords and 'longitude' in coords:
                return (float(coords['latitude']), float(coords['longitude']))

    return None


def get_wikidata_id(data: Dict[str, Any]) -> Optional[str]:
    """Extract Wikidata entity ID from custodian data."""
    if 'wikidata_enrichment' in data:
        wd_id = data['wikidata_enrichment'].get('wikidata_entity_id')
        if wd_id:
            return wd_id
    if 'original_entry' in data:
        wd_id = data['original_entry'].get('wikidata_id')
        if wd_id:
            return wd_id
    return None


def query_wikidata_coordinates(wikidata_ids: List[str]) -> Dict[str, Tuple[float, float]]:
    """Query Wikidata for P625 coordinates in batch.

    Returns a map of QID -> (lat, lon). Network or parse failures are
    reported and yield an empty map (best-effort).
    """
    if not wikidata_ids:
        return {}

    values = ' '.join([f'wd:{qid}' for qid in wikidata_ids])
    query = f"""
    SELECT ?item ?coords WHERE {{
      VALUES ?item {{ {values} }}
      ?item wdt:P625 ?coords.
    }}
    """

    url = "https://query.wikidata.org/sparql"
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)'
    }
    data = urllib.parse.urlencode({'query': query}).encode('utf-8')

    try:
        request = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(request, timeout=60) as response:
            result = json.loads(response.read().decode('utf-8'))
        bindings = result.get('results', {}).get('bindings', [])
    except Exception as e:
        # Best-effort: report and continue without coordinates.
        print(f"  Wikidata SPARQL error: {e}")
        return {}

    coords_map = {}
    for row in bindings:
        item_uri = row.get('item', {}).get('value', '')
        coords_str = row.get('coords', {}).get('value', '')
        if item_uri and coords_str:
            qid = item_uri.split('/')[-1]
            # Parse WKT "Point(lon lat)" format — longitude comes FIRST.
            match = re.search(r'Point\(([^\s]+)\s+([^\)]+)\)', coords_str)
            if match:
                lon = float(match.group(1))
                lat = float(match.group(2))
                coords_map[qid] = (lat, lon)

    return coords_map


def update_custodian_file(filepath: Path, geo_result: Dict[str, Any],
                          country_code: str,
                          dry_run: bool = True) -> Tuple[bool, Optional[Path]]:
    """Update a custodian file with resolved location data.

    Returns (changed, new_path): new_path is set only when the file was
    (or would be) renamed. In dry-run mode nothing is written.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f"  Error reading {filepath}: {e}")
        return False, None

    # Guard: empty/invalid YAML yields None (previously crashed with
    # TypeError on the membership test).
    if not isinstance(data, dict) or 'ghcid' not in data:
        return False, None

    ghcid = data['ghcid']
    if 'location_resolution' not in ghcid:
        ghcid['location_resolution'] = {}
    loc_res = ghcid['location_resolution']

    # Get ISO region code
    region_code = get_iso_region_code(country_code, geo_result.get('admin1_code', ''))
    city_code = generate_city_code(geo_result['ascii_name'])

    # Update location_resolution (additive: only fill unresolved codes)
    old_region = loc_res.get('region_code', 'XX')
    old_city = loc_res.get('city_code', 'XXX')

    changes = []
    if old_region == 'XX' and region_code != 'XX':
        loc_res['region_code'] = region_code
        loc_res['region_name'] = geo_result.get('admin1_name', '')
        changes.append(f"region XX→{region_code}")
    if old_city == 'XXX':
        loc_res['city_code'] = city_code
        loc_res['city_name'] = geo_result['name']
        changes.append(f"city XXX→{city_code}")

    if not changes:
        return False, None

    # Update GeoNames metadata
    loc_res['method'] = 'REVERSE_GEOCODE'
    loc_res['geonames_id'] = geo_result['geonames_id']
    loc_res['geonames_name'] = geo_result['name']
    loc_res['feature_code'] = geo_result['feature_code']
    loc_res['population'] = geo_result.get('population')
    loc_res['distance_km'] = round(geo_result['distance_km'], 2)
    loc_res['resolution_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Update GHCID string
    old_ghcid = ghcid.get('ghcid_current', '')
    new_ghcid = old_ghcid
    if old_region == 'XX' and region_code != 'XX':
        new_ghcid = new_ghcid.replace(f'{country_code}-XX-',
                                      f'{country_code}-{region_code}-')
    if old_city == 'XXX':
        # Find and replace XXX in GHCID
        new_ghcid = new_ghcid.replace('-XXX-', f'-{city_code}-')

    if new_ghcid != old_ghcid:
        ghcid['ghcid_current'] = new_ghcid
        # Add to history (Rule 5: additive only)
        if 'ghcid_history' not in ghcid:
            ghcid['ghcid_history'] = []
        ghcid['ghcid_history'].append({
            'ghcid': new_ghcid,
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'reason': f"Location resolved via GeoNames reverse geocoding: {', '.join(changes)}"
        })

    # Add provenance note (normalize a scalar note into a list first)
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'notes' not in data['provenance']:
        data['provenance']['notes'] = []
    elif isinstance(data['provenance']['notes'], str):
        data['provenance']['notes'] = [data['provenance']['notes']]
    data['provenance']['notes'].append(
        f"Location resolved {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}: "
        f"{', '.join(changes)} via GeoNames ({geo_result['name']}, {geo_result.get('admin1_name', '')})"
    )

    # Determine new filename (mirror the GHCID substitutions)
    new_filename = filepath.name
    if old_region == 'XX' and region_code != 'XX':
        new_filename = new_filename.replace(f'{country_code}-XX-',
                                            f'{country_code}-{region_code}-')
    if old_city == 'XXX':
        new_filename = new_filename.replace('-XXX-', f'-{city_code}-')
    new_filepath = filepath.parent / new_filename

    if not dry_run:
        # Write updated file
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                      sort_keys=False)
        # Rename if needed — never clobber an existing file
        if new_filepath != filepath and not new_filepath.exists():
            filepath.rename(new_filepath)

    return True, new_filepath if new_filepath != filepath else None


def main():
    """Main entry point."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Resolve XX/XXX location codes using GeoNames reverse geocoding'
    )
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the fixes (default: dry run)')
    parser.add_argument('--path', type=str, default='data/custodian',
                        help='Path to custodian files directory')
    parser.add_argument('--db', type=str, default='data/reference/geonames.db',
                        help='Path to GeoNames database')
    parser.add_argument('--limit', type=int, default=100,
                        help='Limit number of files to process')
    parser.add_argument('--country', type=str,
                        help='Only process files for a specific country')
    args = parser.parse_args()

    custodian_dir = Path(args.path)
    if not custodian_dir.exists():
        print(f"Error: Directory {custodian_dir} does not exist")
        sys.exit(1)

    dry_run = not args.apply

    print("=" * 70)
    print("LOCATION RESOLUTION VIA GEONAMES REVERSE GEOCODING")
    print("=" * 70)
    print(f"Mode: {'DRY RUN' if dry_run else 'APPLYING CHANGES'}")
    print()

    # Connect to GeoNames DB
    conn = get_geonames_connection(args.db)

    # Find files with XX region or XXX city codes.
    # Note: fnmatch '*-XX-*' does NOT match '-XXX-' (no '-XX-' substring),
    # so the two globs are distinct; dedupe handles files with both codes.
    files_to_process = []
    for filepath in custodian_dir.glob('*-XX-*.yaml'):
        files_to_process.append(filepath)
    for filepath in custodian_dir.glob('*-XXX-*.yaml'):
        if filepath not in files_to_process:
            files_to_process.append(filepath)

    print(f"Found {len(files_to_process)} files with XX or XXX codes")

    # Load files and extract info
    file_data = []
    for filepath in files_to_process[:args.limit]:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Guard against empty/scalar YAML documents
            if not isinstance(data, dict):
                continue

            # Get country code
            country = None
            if 'ghcid' in data and 'location_resolution' in data['ghcid']:
                country = data['ghcid']['location_resolution'].get('country_code')
            if not country:
                continue
            if args.country and country != args.country:
                continue

            # Get coordinates
            coords = extract_coordinates(data)
            wikidata_id = get_wikidata_id(data)

            file_data.append({
                'filepath': filepath,
                'data': data,
                'country': country,
                'coords': coords,
                'wikidata_id': wikidata_id
            })
        except Exception as e:
            print(f"Error loading {filepath}: {e}")

    print(f"Processing {len(file_data)} files")

    # Separate files with and without coordinates
    with_coords = [f for f in file_data if f['coords']]
    without_coords = [f for f in file_data if not f['coords'] and f['wikidata_id']]
    no_location = [f for f in file_data if not f['coords'] and not f['wikidata_id']]

    print(f"  With coordinates: {len(with_coords)}")
    print(f"  Need Wikidata lookup: {len(without_coords)}")
    print(f"  No location data: {len(no_location)}")
    print()

    # Query Wikidata for missing coordinates
    if without_coords:
        print("Querying Wikidata for coordinates...")
        wikidata_ids = [f['wikidata_id'] for f in without_coords]

        # Batch in groups of 50
        all_coords = {}
        for i in range(0, len(wikidata_ids), 50):
            batch = wikidata_ids[i:i+50]
            coords = query_wikidata_coordinates(batch)
            all_coords.update(coords)
            if i + 50 < len(wikidata_ids):
                time.sleep(1)  # Rate limiting

        print(f"  Retrieved coordinates for {len(all_coords)} entities")

        # Update file_data with Wikidata coordinates
        for f in without_coords:
            if f['wikidata_id'] in all_coords:
                f['coords'] = all_coords[f['wikidata_id']]
                with_coords.append(f)

    print()
    print(f"Files with resolvable coordinates: {len(with_coords)}")
    print()

    # Process files with coordinates
    resolved = 0
    renamed = 0

    for f in with_coords:
        filepath = f['filepath']
        country = f['country']
        lat, lon = f['coords']

        # Reverse geocode
        geo_result = reverse_geocode(lat, lon, country, conn)
        if not geo_result:
            print(f"  No GeoNames match: {filepath.name}")
            continue

        # Update file
        success, new_path = update_custodian_file(filepath, geo_result, country,
                                                  dry_run=dry_run)
        if success:
            resolved += 1
            if new_path:
                renamed += 1
                print(f"  {filepath.name} → {new_path.name}")
            else:
                print(f"  Updated: {filepath.name}")

    conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(file_data)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
    print(f"No location data: {len(no_location)}")

    if dry_run:
        print()
        print("This was a DRY RUN. Use --apply to make changes.")


if __name__ == '__main__':
    main()