#!/usr/bin/env python3
"""
Enrich custodian files with city/region data using Google Places API.

This is a generic script that works for any country's XXX files.

Usage:
    python scripts/enrich_cities_google.py --country KR [--dry-run] [--limit N]
    python scripts/enrich_cities_google.py --country AR [--dry-run] [--limit N]
    python scripts/enrich_cities_google.py --all [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
"""

import os
import sys
import time
import math
import sqlite3
import re
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

import yaml
import httpx
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Google Places API
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
REQUEST_DELAY = 0.3  # seconds slept after every Places request

# Country name mapping for search queries
COUNTRY_NAMES = {
    'KR': 'South Korea',
    'AR': 'Argentina',
    'US': 'United States',
    'IN': 'India',
    'JM': 'Jamaica',
    'UZ': 'Uzbekistan',
    'UA': 'Ukraine',
    'TJ': 'Tajikistan',
    'OM': 'Oman',
    'NL': 'Netherlands',
    'NA': 'Namibia',
    'ML': 'Mali',
    'LK': 'Sri Lanka',
    'LB': 'Lebanon',
    'IT': 'Italy',
    'IR': 'Iran',
    'EC': 'Ecuador',
    'DK': 'Denmark',
    'CU': 'Cuba',
    'CO': 'Colombia',
    'BR': 'Brazil',
    'MX': 'Mexico',
    'JP': 'Japan',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'FR': 'France',
    'GB': 'United Kingdom',
}

# Fields copied out of a GeoNames row, in SELECT order.
_GEONAMES_FIELDS = (
    'name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
    'latitude', 'longitude', 'population', 'feature_code',
)


def get_city_code(city_name: str) -> str:
    """Generate a (up to) 3-letter upper-case city code from a city name.

    One word uses its first three characters, two words use the first
    letter of the first word plus the first two of the second, and three
    or more words use the initials of the first three words.

    Args:
        city_name: City name; common suffixes such as " City" are removed.

    Returns:
        The upper-cased code; empty when the name has no words.
    """
    name = city_name.strip()

    # Remove common suffixes
    for suffix in (' City', ' Town', '-shi', '-ku', '-gun', '-cho', ' District'):
        if name.endswith(suffix):
            name = name[:-len(suffix)]

    # NOTE(review): names containing '-' or punctuation (e.g. "Al-Ain")
    # leak those characters into the code, which then ends up inside a
    # '-'-delimited GHCID. Confirm the project's convention before changing.
    words = name.split()
    if len(words) == 1:
        return name[:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()


def search_google_places(query: str, api_key: str) -> Optional[dict]:
    """Search Google Places API for a location.

    Args:
        query: Free-text query sent as ``textQuery``.
        api_key: Value for the ``X-Goog-Api-Key`` header.

    Returns:
        The first entry of the response's ``places`` list, or ``None`` when
        the request fails, the body is not valid JSON, or nothing matched.
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
    }
    payload = {
        "textQuery": query,
        "languageCode": "en"
    }

    try:
        response = httpx.post(TEXT_SEARCH_URL, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (httpx.HTTPError, ValueError) as e:
        # Transport/status failures and undecodable JSON are reported and
        # treated as "no result" so one bad lookup does not stop the run.
        print(f" Error searching Google Places: {e}")
        return None

    places = data.get("places") if isinstance(data, dict) else None
    return places[0] if places else None


def extract_location_from_google(place: dict) -> dict:
    """Extract location information from Google Places result.

    Args:
        place: A place object as returned by ``search_google_places``.

    Returns:
        A dict with keys ``city``, ``region``, ``latitude``, ``longitude``,
        ``formatted_address``, ``place_id`` and ``website``; every value is
        ``None`` when the corresponding data is absent.
    """
    result = {
        'city': None,
        'region': None,
        'latitude': None,
        'longitude': None,
        'formatted_address': None,
        'place_id': None,
        'website': None,
    }

    if not place:
        return result

    result['place_id'] = place.get('id')
    result['formatted_address'] = place.get('formattedAddress')
    result['website'] = place.get('websiteUri')

    location = place.get('location') or {}
    result['latitude'] = location.get('latitude')
    result['longitude'] = location.get('longitude')

    for comp in place.get('addressComponents') or []:
        types = comp.get('types') or []
        long_name = comp.get('longText', '')

        if 'locality' in types:
            result['city'] = long_name
        elif 'administrative_area_level_1' in types:
            result['region'] = long_name
        elif 'sublocality_level_1' in types and not result['city']:
            # Only a fallback: a locality always takes precedence.
            result['city'] = long_name

    return result


def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, country_code: str) -> Optional[dict]:
    """Reverse geocode coordinates to find nearest city in GeoNames.

    Candidates are rows of the ``cities`` table for ``country_code`` whose
    feature code is a populated place. They are ranked by squared distance
    in degrees, with the longitude term scaled by cos^2(lat) so east-west
    offsets are not over-weighted away from the equator.

    Args:
        conn: Open connection to the GeoNames SQLite database.
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.
        country_code: Country code to restrict the search to.

    Returns:
        A dict of the nearest row's fields, or ``None`` if no row matched.
    """
    lon_scale = math.cos(math.radians(lat)) ** 2

    cursor = conn.cursor()
    cursor.execute("""
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?)
                + (longitude - ?) * (longitude - ?) * ?) as dist_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """, (lat, lat, lon, lon, lon_scale, country_code))

    row = cursor.fetchone()
    if row is None:
        return None
    # zip stops at the shorter sequence, dropping the trailing dist_sq column.
    return dict(zip(_GEONAMES_FIELDS, row))


def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str:
    """Get ISO-style region code from GeoNames admin1_code.

    Args:
        admin1_code: GeoNames admin1 code; ``'XX'`` is returned when empty.
        country_code: Unused; kept for interface compatibility.
        admin1_name: Region name, used to abbreviate codes longer than 3 chars.

    Returns:
        An upper-case region code.
    """
    if not admin1_code:
        return 'XX'

    # For most countries, use first 2-3 characters of admin1_code or name
    if len(admin1_code) <= 3:
        return admin1_code.upper()

    # Use abbreviation from name
    if admin1_name:
        words = admin1_name.split()
        if len(words) == 1:
            return admin1_name[:2].upper()
        return ''.join(w[0] for w in words[:2]).upper()

    return admin1_code[:2].upper()


def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
                 country_code: str, country_name: str, dry_run: bool = False) -> dict:
    """Process a single custodian file.

    Loads the YAML, resolves the institution's location via Google Places
    and GeoNames, builds a new GHCID and (unless ``dry_run``) rewrites the
    file and renames it to ``<new_ghcid>.yaml``.

    Args:
        filepath: Path of the custodian YAML file.
        conn: Open connection to the GeoNames SQLite database.
        api_key: Google Places API key.
        country_code: Country code the file is expected to start with.
        country_name: Country name appended to the search query.
        dry_run: When true, nothing is written or renamed.

    Returns:
        A dict with keys ``file``, ``status``, ``old_ghcid``, ``new_ghcid``,
        ``city``, ``region``, ``error`` and, after a rename, ``new_file``.
        ``status`` is one of ``skipped``, ``error``, ``collision``,
        ``would_update`` or ``updated``.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'region': None,
        'error': None,
    }

    def fail(status: str, message: str) -> dict:
        # Record a terminal status and message on the shared result dict.
        result['status'] = status
        result['error'] = message
        return result

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        return fail('error', f'Failed to load YAML: {e}')

    if not data:
        return fail('error', 'Empty YAML file')
    if not isinstance(data, dict):
        return fail('error', 'YAML root is not a mapping')

    ghcid_data = data.get('ghcid')
    if not isinstance(ghcid_data, dict):
        ghcid_data = {}
    old_ghcid = ghcid_data.get('ghcid_current') or ''
    result['old_ghcid'] = old_ghcid

    # Match both patterns:
    # 1. {country}-XX-XXX-... (no region, no city)
    # 2. {country}-{region}-XXX-... (has region, no city)
    # NOTE(review): only two-letter regions match; get_region_code can emit
    # other shapes (e.g. up to 3 characters) -- confirm this is intended.
    xxx_pattern = re.compile(rf'^{re.escape(country_code)}-[A-Z]{{2}}-XXX-')
    if not xxx_pattern.match(old_ghcid):
        return fail('skipped', f'Not a {country_code}-*-XXX file')

    # Get institution name
    name = (data.get('custodian_name') or {}).get('claim_value') or ''
    if not name:
        name = (data.get('original_entry') or {}).get('name') or ''
    if not name:
        return fail('error', 'No institution name found')

    # Search Google Places
    search_query = f"{name} {country_name}"
    print(f" Searching: {name[:50]}...")
    place = search_google_places(search_query, api_key)
    time.sleep(REQUEST_DELAY)

    if not place:
        return fail('error', 'Not found in Google Places')

    location_info = extract_location_from_google(place)
    latitude = location_info['latitude']
    longitude = location_info['longitude']
    # Compare against None: 0.0 is a valid coordinate.
    if latitude is None or longitude is None:
        return fail('error', 'No coordinates from Google')

    # Lookup in GeoNames
    city_info = lookup_city_geonames(conn, latitude, longitude, country_code)
    if not city_info:
        return fail('error', 'City not found in GeoNames')

    region_code = get_region_code(city_info['admin1_code'], country_code, city_info['admin1_name'])
    city_code = get_city_code(city_info['ascii_name'])

    result['city'] = city_info['ascii_name']
    result['region'] = city_info['admin1_name']

    # Build new GHCID
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return fail('error', f'Invalid GHCID format: {old_ghcid}')
    inst_type = parts[3]
    abbreviation = '-'.join(parts[4:])

    new_ghcid = f'{country_code}-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    # Detect a filename collision before touching the file, so a colliding
    # record is never left rewritten-but-not-renamed; dry runs report it too.
    new_filepath = filepath.parent / f'{new_ghcid}.yaml'
    if new_filepath != filepath and new_filepath.exists():
        return fail('collision', f'Target file exists: {new_filepath.name}')

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Update the data
    timestamp = datetime.now(timezone.utc).isoformat()

    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': country_code,
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': latitude,
        'longitude': longitude,
        'resolution_date': timestamp,
    }

    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': latitude,
        'longitude': longitude,
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    # Update GHCID history: close the open entry for the old GHCID, then
    # append one for the new GHCID.
    history = ghcid_data.get('ghcid_history')
    if not isinstance(history, list):
        history = []
        ghcid_data['ghcid_history'] = history

    for entry in history:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp

    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    for identifier in data.get('identifiers') or []:
        if identifier.get('identifier_scheme') == 'GHCID':
            identifier['identifier_value'] = new_ghcid

    # Write and rename
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if new_filepath != filepath:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)

    result['status'] = 'updated'
    return result


def main():
    """Parse CLI arguments and enrich the matching custodian files.

    Exits with status 1 when the API key is missing, the GeoNames database
    does not exist, or neither ``--country`` nor ``--all`` is given.
    """
    parser = argparse.ArgumentParser(description='Enrich custodian files with Google Places data')
    parser.add_argument('--country', type=str, help='Country code (e.g., KR, AR, US)')
    parser.add_argument('--all', action='store_true', help='Process all countries with XXX files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files per country')
    args = parser.parse_args()

    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        sys.exit(1)

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Determine which countries to process
    if args.all:
        # Find all countries with XXX files (either XX-XXX or {region}-XXX)
        countries = sorted({
            f.name[:2]
            for f in CUSTODIAN_DIR.glob('*-*-XXX-*.yaml')
            if f.name[:2] in COUNTRY_NAMES
        })
    elif args.country:
        countries = [args.country.upper()]
    else:
        print("ERROR: Specify --country CODE or --all")
        sys.exit(1)

    total_stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}

    conn = sqlite3.connect(str(GEONAMES_DB))
    try:
        for country_code in countries:
            country_name = COUNTRY_NAMES.get(country_code, country_code)

            files = sorted(CUSTODIAN_DIR.glob(f'{country_code}-*-XXX-*.yaml'))
            if args.limit:
                files = files[:args.limit]

            if not files:
                continue

            print(f"\n{'='*60}")
            print(f"Processing {country_code} ({country_name}): {len(files)} files")
            print('='*60)

            stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}

            for filepath in files:
                print(f"Processing: {filepath.name}")
                result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN,
                                      country_code, country_name, dry_run=args.dry_run)
                status = result['status']
                stats[status] = stats.get(status, 0) + 1

                if status in ('updated', 'would_update'):
                    print(f" ✓ {result['city']} ({result['region']}): {result['old_ghcid']} → {result['new_ghcid']}")
                elif status == 'error':
                    print(f" ✗ {result['error']}")
                elif status == 'collision':
                    print(f" ⚠ {result['error']}")

            print(f"\n{country_code} Summary: Updated={stats.get('updated', 0)}, "
                  f"Would update={stats.get('would_update', 0)}, "
                  f"Errors={stats.get('error', 0)}")

            for k, v in stats.items():
                total_stats[k] = total_stats.get(k, 0) + v
    finally:
        # Release the database even when a file aborts the run.
        conn.close()

    print()
    print('='*60)
    print('TOTAL Summary:')
    print(f" Updated: {total_stats.get('updated', 0)}")
    print(f" Would update: {total_stats.get('would_update', 0)}")
    print(f" Errors: {total_stats.get('error', 0)}")
    print(f" Collisions: {total_stats.get('collision', 0)}")
    print(f" Skipped: {total_stats.get('skipped', 0)}")


if __name__ == '__main__':
    main()