#!/usr/bin/env python3
"""
Geocode remaining 26 custodian files using GHCID city codes and GeoNames.

For every YAML file listed in ``FILES`` the script reads the current GHCID
(``ghcid.ghcid_current``), takes its country code (first dash-separated part)
and city code (third part), resolves them to coordinates and writes the
result back into the file's ``location`` mapping.

Resolution order:

1. ``CITY_COORDS`` - hand-entered coordinates keyed by (country, city code).
2. For ``CZ`` only: ``CZ_CITY_MAP`` gives a city name that is looked up in the
   local GeoNames SQLite database.

Paths are relative, so the script is expected to be run from the directory
that contains ``data/``.
"""

import sqlite3
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Files to process
FILES = [
    "data/custodian/CZ-80-BOH-A-ASZG.yaml",
    "data/custodian/CZ-63-KNL-M-MVMKNL.yaml",
    "data/custodian/CZ-52-DKR-M-MMDKNL.yaml",
    "data/custodian/BE-VLG-ONZ-L-SUIS.yaml",
    "data/custodian/AR-C-BUE-A-APDH.yaml",
    "data/custodian/CZ-71-VLO-M-RPVLSMP.yaml",
    "data/custodian/CZ-64-BLA-M-MBP.yaml",
    "data/custodian/CZ-63-PEL-M-MVPPO.yaml",
    "data/custodian/CZ-80-BRU-M-MVBPO.yaml",
    "data/custodian/CZ-71-JES-M-VMJPO.yaml",
    "data/custodian/CZ-80-BNB-M-MMBVBNB.yaml",
    "data/custodian/CZ-72-ROZ-M-VMVPNKP.yaml",
    "data/custodian/CZ-64-STZ-A-MSVS.yaml",
    "data/custodian/LB-BA-BEI-A-NA.yaml",
    "data/custodian/ML-6-TIM-L-MHCL.yaml",
    "data/custodian/CZ-80-KOP-M-RMKPSTM.yaml",
    "data/custodian/CZ-63-PRB-A-KZMPPO.yaml",
    "data/custodian/CZ-63-POL-M-MMPPO.yaml",
    "data/custodian/BE-VLG-OST-L-VIZV.yaml",
    "data/custodian/CZ-80-RYM-M-MMVRPO.yaml",
    "data/custodian/BE-VLG-ANT-A-MPMP.yaml",
    "data/custodian/BE-VLG-ANT-A-UABWIUC.yaml",
    "data/custodian/CZ-53-UNO-A-AUMAVESUNO.yaml",
    "data/custodian/CZ-63-ZNS-A-AUMAVESZNS.yaml",
    "data/custodian/BE-BRU-WOL-L-LL.yaml",
    "data/custodian/CZ-72-UBR-M-MJAKVUB.yaml",
]

# Manual coordinates for known cities.
# Key: (country code, GHCID city code); value: ((latitude, longitude), city name).
CITY_COORDS = {
    # Argentina
    ("AR", "BUE"): ((-34.6037, -58.3816), "Buenos Aires"),
    # Lebanon
    ("LB", "BEI"): ((33.8938, 35.5018), "Beirut"),
    # Mali
    ("ML", "TIM"): ((16.7666, -3.0026), "Timbuktu"),
    # Belgium
    ("BE", "ONZ"): ((50.9667, 3.8167), "Onze-Lieve-Vrouw-Waver"),  # Approximation
    ("BE", "OST"): ((51.2194, 2.9264), "Ostend"),
    ("BE", "ANT"): ((51.2194, 4.4025), "Antwerp"),
    ("BE", "WOL"): ((50.8503, 4.3517), "Woluwe-Saint-Lambert (Brussels)"),
    # Czech Republic - will query GeoNames
}

# Czech city codes to city names (for GeoNames lookup)
CZ_CITY_MAP = {
    "BOH": "Bohumín",
    "KNL": "Kralovice",  # Unclear - will research
    "DKR": "Dvůr Králové nad Labem",
    "VLO": "Valašské Meziříčí",  # VLO region
    "BLA": "Blansko",
    "PEL": "Pelhřimov",
    "BRU": "Bruntál",
    "JES": "Jeseník",
    "BNB": "Bohumín",  # BNB unclear - check file
    "ROZ": "Rožnov pod Radhoštěm",
    "STZ": "Strakonice",  # STZ unclear
    "KOP": "Kopřivnice",
    "PRB": "Příbram",
    "POL": "Polička",
    "RYM": "Rýmařov",
    "UNO": "Ústí nad Orlicí",
    "ZNS": "Znojmo",  # ZNS unclear
    "UBR": "Uherský Brod",
}

# Default location of the local GeoNames SQLite database.
GEONAMES_DB = Path("data/reference/geonames.db")

# One query serves both lookup passes; only the LIKE pattern differs.
# Matches are restricted to the listed feature codes and the most populous
# matching row wins.
_GEONAMES_QUERY = """
    SELECT latitude, longitude, name
    FROM cities
    WHERE country_code = ?
      AND (name LIKE ? OR ascii_name LIKE ?)
      AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC')
    ORDER BY population DESC
    LIMIT 1
"""


def get_geonames_coords(
    city_name: str,
    country_code: str,
    db_path: Path | str = GEONAMES_DB,
) -> tuple | None:
    """Query GeoNames database for city coordinates.

    The lookup runs in two passes: first the city name as-is (a LIKE pattern
    without wildcards, i.e. an exact match), then a substring match
    (``%name%``) as a fallback. Within a pass the most populous row wins.

    Args:
        city_name: City name to look for in the ``name`` / ``ascii_name``
            columns of the ``cities`` table.
        country_code: Value compared against the ``country_code`` column.
        db_path: Location of the GeoNames SQLite file; defaults to
            ``data/reference/geonames.db``.

    Returns:
        ``((latitude, longitude), name)`` for the best match, or ``None`` when
        the database file is missing, ``city_name`` is empty, or nothing
        matches.
    """
    db_path = Path(db_path)
    if not db_path.exists():
        print(f" GeoNames DB not found at {db_path}")
        return None

    # An empty name would turn the fallback pattern into '%%', which matches
    # every row and would return an arbitrary large city.
    if not city_name:
        return None

    # closing() guarantees the connection is released even if a query raises;
    # sqlite3's own context manager only manages transactions.
    with closing(sqlite3.connect(db_path)) as conn:
        for pattern in (city_name, f"%{city_name}%"):
            row = conn.execute(
                _GEONAMES_QUERY, (country_code, pattern, pattern)
            ).fetchone()
            if row:
                return ((row[0], row[1]), row[2])

    return None


def process_file(filepath: str) -> bool:
    """Add coordinates to a single custodian file.

    Reads the YAML record, derives country and city code from
    ``ghcid.ghcid_current``, resolves coordinates (manual table first, then
    GeoNames for Czech codes) and writes ``latitude``, ``longitude``,
    ``city``, ``geocoding_method`` and ``geocoding_timestamp`` into the
    record's ``location`` mapping.

    Args:
        filepath: Path of the custodian YAML file.

    Returns:
        ``True`` if the file was updated; ``False`` if it was skipped (file
        missing, unusable structure, coordinates already present, GHCID not
        parseable, or no coordinates found). The reason is printed.
    """
    path = Path(filepath)
    if not path.exists():
        print(f" File not found: {filepath}")
        return False

    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty document and may yield a list or
    # scalar; only a mapping can carry the keys used below.
    if not isinstance(data, dict):
        print(f" Unexpected YAML structure (expected a mapping): {filepath}")
        return False

    # 'location: null' is treated like a missing key; any other non-mapping
    # value is left alone rather than overwritten.
    location = data.get('location')
    if location is None:
        location = {}
    elif not isinstance(location, dict):
        print(f" Unexpected 'location' value (expected a mapping): {filepath}")
        return False

    # Check if already has coordinates ('is not None' so a stored 0.0 counts).
    if location.get('latitude') is not None:
        print(f" Already has coordinates: {filepath}")
        return False

    # Extract country and city code from GHCID
    ghcid_info = data.get('ghcid')
    ghcid = ''
    if isinstance(ghcid_info, dict):
        ghcid = str(ghcid_info.get('ghcid_current') or '')
    parts = ghcid.split('-')
    if len(parts) < 4:
        print(f" Invalid GHCID format: {ghcid}")
        return False

    country_code = parts[0]
    city_code = parts[2]

    coords = None
    city_name = None
    method = "CITY_CODE_LOOKUP"

    # Try manual mapping first
    key = (country_code, city_code)
    if key in CITY_COORDS:
        coords, city_name = CITY_COORDS[key]
    elif country_code == "CZ" and city_code in CZ_CITY_MAP:
        # Query GeoNames for Czech cities
        cz_city = CZ_CITY_MAP[city_code]
        result = get_geonames_coords(cz_city, "CZ")
        if not result:
            print(f" No GeoNames match for: {cz_city}")
            return False
        coords, city_name = result
        method = "GEONAMES_LOOKUP"
    else:
        print(f" Unknown city code: {country_code}-{city_code}")
        return False

    if not coords:
        print(f" No coordinates found for {filepath}")
        return False

    # Update location
    location['latitude'] = coords[0]
    location['longitude'] = coords[1]
    location['city'] = city_name
    location['geocoding_method'] = method
    location['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
    data['location'] = location

    # Serialize before touching the file: opening in 'w' mode truncates it,
    # so a failure inside yaml.dump would otherwise destroy the record.
    serialized = yaml.dump(
        data,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False,
    )
    path.write_text(serialized, encoding='utf-8')

    print(f" ✓ {filepath}: {city_name} ({coords[0]:.4f}, {coords[1]:.4f})")
    return True


def main():
    """Geocode every file in ``FILES`` and print how many were updated."""
    print(f"Geocoding {len(FILES)} remaining files...")

    updated = 0
    for filepath in FILES:
        if process_file(filepath):
            updated += 1

    print(f"\nUpdated {updated}/{len(FILES)} files")


if __name__ == "__main__":
    main()