#!/usr/bin/env python3
"""
Add lat/lon coordinates to Egyptian custodian files based on city codes.

City code mapping for Egypt (EG_CITY_MAPPING below is the authoritative table):
- CAI = Cairo
- ALX = Alexandria
- ASS = Assiut
- NIL = Nile (various locations, use Cairo as proxy)
- GIZ = Giza
- LUX = Luxor
- ASW = Aswan
- POR = Port Said
- SUE = Suez
- MAN = Mansoura
- TAN = Tanta
- ISM = Ismailia
"""

import argparse
import sqlite3
from contextlib import closing
from pathlib import Path
from datetime import datetime, timezone

from ruamel.yaml import YAML

GEONAMES_DB = Path("data/reference/geonames.db")
CUSTODIAN_DIR = Path("data/custodian")

# Round-trip YAML handler: keeps quoting as found and avoids re-wrapping
# long lines when a file is written back.
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 4096

# Egypt city code mappings (city_code -> GeoNames city name)
EG_CITY_MAPPING = {
    'CAI': 'Cairo',
    'ALX': 'Alexandria',
    'ASS': 'Assiut',  # Assiut city
    'NIL': 'Cairo',  # Default to Cairo for Nile-related
    'GIZ': 'Giza',
    'LUX': 'Luxor',
    'ASW': 'Aswan',
    'POR': 'Port Said',
    'SUE': 'Suez',
    'MAN': 'Mansoura',
    'TAN': 'Tanta',
    'ISM': 'Ismailia',
}


def get_coords_for_city(conn: sqlite3.Connection, city_name: str, country_code: str = 'EG') -> tuple[float, float, int] | None:
    """Get lat/lon and geonames_id for a city.

    Queries the ``cities`` table for a populated place whose ``name`` or
    ``ascii_name`` equals *city_name* within *country_code*; when several
    rows match, the one with the largest population wins.

    Returns:
        ``(latitude, longitude, geonames_id)`` or ``None`` if no row matches.
    """
    cursor = conn.execute(
        """SELECT latitude, longitude, geonames_id
           FROM cities
           WHERE country_code = ?
             AND (name = ? OR ascii_name = ?)
             AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLC')
           ORDER BY population DESC
           LIMIT 1""",
        (country_code, city_name, city_name)
    )
    row = cursor.fetchone()
    if row:
        return row[0], row[1], row[2]
    return None


def _load_yaml(filepath: Path):
    """Parse *filepath* with the module-level round-trip YAML handler."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.load(f)


def _has_coordinates(data) -> bool:
    """Return True when ``data['location']`` carries both latitude and longitude.

    A value counts as present unless it is ``None`` or an empty string, so a
    legitimate ``0.0`` coordinate is not mistaken for a missing one.
    """
    # `or {}` (rather than a .get default) also covers `location: null`.
    location = data.get('location') or {}
    return all(
        location.get(key) not in (None, '')
        for key in ('latitude', 'longitude')
    )


def _resolve_city_code(data):
    """Return the city code recorded in *data*, or ``None`` if there is none.

    ``ghcid.location_resolution.city_code`` is preferred; otherwise the third
    dash-separated component of ``ghcid.ghcid_current`` is used, provided the
    identifier starts with ``EG`` (e.g. ``EG-C-CAI-L-...`` -> ``CAI``).
    """
    ghcid = data.get('ghcid') or {}
    loc_res = ghcid.get('location_resolution') or {}
    if loc_res.get('city_code'):
        return loc_res['city_code']

    ghcid_current = ghcid.get('ghcid_current')
    if ghcid_current:
        parts = ghcid_current.split('-')
        if len(parts) >= 3 and parts[0] == 'EG' and parts[2]:
            return parts[2]
    return None


def _plan_geocoding(data, conn: sqlite3.Connection):
    """Resolve *data* as far as possible toward coordinates.

    Returns:
        ``(city_code, city_name, coords)``. Each element is ``None`` from the
        first step that could not be resolved onward; ``coords`` is the tuple
        returned by :func:`get_coords_for_city`.
    """
    city_code = _resolve_city_code(data)
    city_name = EG_CITY_MAPPING.get(city_code) if city_code else None
    coords = get_coords_for_city(conn, city_name) if city_name else None
    return city_code, city_name, coords


def _geocode_and_write(filepath: Path, data, conn: sqlite3.Connection) -> bool:
    """Geocode the already-loaded *data* and write it back to *filepath*.

    Prints the reason and returns False when the city code, its mapping, or
    the GeoNames row cannot be resolved; returns True after a successful write.
    """
    city_code, city_name, coords = _plan_geocoding(data, conn)

    if not city_code:
        print(f" No city code found: {filepath.name}")
        return False
    if not city_name:
        print(f" Unknown city code {city_code}: {filepath.name}")
        return False
    if not coords:
        print(f" City not found in GeoNames: {city_name} ({city_code}): {filepath.name}")
        return False

    lat, lon, geonames_id = coords

    # Create the block when the key is absent *or* explicitly null.
    if data.get('location') is None:
        data['location'] = {}
    location = data['location']
    location['city'] = city_name
    location['latitude'] = lat
    location['longitude'] = lon
    location['geonames_id'] = geonames_id
    location['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
    location['geocoding_method'] = 'EG_CITY_CODE_LOOKUP'

    # Write back
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f)
    return True


def process_file(filepath: Path, conn: sqlite3.Connection) -> bool:
    """Process a single custodian file. Returns True if updated.

    Returns False (without writing) when the file is empty, already has both
    coordinates, or its city cannot be resolved to a GeoNames row.
    """
    data = _load_yaml(filepath)
    if not data:
        return False

    # Check if already has coordinates
    if _has_coordinates(data):
        return False

    return _geocode_and_write(filepath, data, conn)


def main():
    """Geocode every ``EG-*.yaml`` file under CUSTODIAN_DIR and print a summary.

    With ``--dry-run`` nothing is written; the would-be updates are printed
    and counted instead.
    """
    parser = argparse.ArgumentParser(description='Geocode Egyptian institutions by city code')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()

    # sqlite3.connect() would silently create an empty database here, after
    # which every lookup fails with "no such table"; fail fast instead.
    if not GEONAMES_DB.is_file():
        raise SystemExit(f"GeoNames database not found: {GEONAMES_DB}")

    # Find EG files (sorted so output order is reproducible between runs)
    files = sorted(CUSTODIAN_DIR.glob("EG-*.yaml"))
    print(f"Found {len(files)} EG files")

    updated = 0
    skipped = 0

    # closing() guarantees the connection is released even if the loop aborts.
    with closing(sqlite3.connect(GEONAMES_DB)) as conn:
        for filepath in files:
            if not filepath.is_file():
                continue

            # Per-file boundary: one bad file is reported and must not stop
            # the rest of the batch.
            try:
                data = _load_yaml(filepath)
                if not data:
                    continue

                if _has_coordinates(data):
                    skipped += 1
                    continue

                if args.dry_run:
                    # Check what would happen
                    city_code, city_name, coords = _plan_geocoding(data, conn)
                    if not city_name:
                        print(f" Unknown/no city code: {filepath.name} ({city_code})")
                    elif not coords:
                        print(f" City not in GeoNames: {city_name}")
                    else:
                        print(f"Would update: {filepath.name} -> {city_name} ({coords[0]}, {coords[1]})")
                        updated += 1
                elif _geocode_and_write(filepath, data, conn):
                    print(f"Updated: {filepath.name}")
                    updated += 1
            except Exception as e:
                print(f"Error: {filepath.name}: {e}")

    print("\nSummary:")
    print(f" Updated: {updated}")
    print(f" Skipped (already has coords): {skipped}")


if __name__ == "__main__":
    main()