#!/usr/bin/env python3
"""
Geocode Global Heritage Institutions

This script geocodes heritage institutions from the global dataset using the
Nominatim API.

Features:
- Persistent SQLite cache (preserves geocoding across runs)
- Rate limiting (1 request/second for Nominatim)
- Progress tracking and resume capability
- Country-specific query optimization
- Error handling and retry logic
- Detailed logging and statistics

Usage:
    python scripts/geocode_global_institutions.py [--dry-run] [--limit N] [--country CODE]

Options:
    --dry-run       Show what would be geocoded without making API calls
    --limit N       Only geocode first N institutions (for testing)
    --country CODE  Only geocode institutions from specific country (e.g., JP, NL, MX)
    --force         Re-geocode institutions that already have coordinates
    --verbose       Show detailed progress for each institution
"""

import argparse
import sqlite3
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
import yaml


class GeocodingCache:
    """Persistent SQLite cache for geocoding results.

    Successful lookups store coordinates; failures are stored too (success=0)
    so that known-bad queries are not retried on subsequent runs.
    """

    def __init__(self, cache_file: Path):
        """Open (or create) the SQLite cache at *cache_file*."""
        self.cache_file = cache_file
        self.conn = sqlite3.connect(cache_file)
        self._initialize_cache()

    def _initialize_cache(self):
        """Create cache table if it doesn't exist."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS geocoding_cache (
                query TEXT PRIMARY KEY,
                latitude REAL,
                longitude REAL,
                geonames_id INTEGER,
                display_name TEXT,
                timestamp TEXT,
                success INTEGER
            )
        """)
        self.conn.commit()

    def lookup(self, query: str) -> Tuple[bool, Optional[Dict]]:
        """Retrieve a cached geocoding result, distinguishing miss from failure.

        Returns:
            (found, result): found is True when *query* is in the cache at all
            (even as a recorded failure); result is the geocoding dict on
            success, or None for a cached failure / cache miss.

        NOTE: this exists because get() alone cannot distinguish "never tried"
        from "tried and failed", which previously caused cached failures to be
        re-queried against the API on every run.
        """
        cursor = self.conn.execute(
            "SELECT latitude, longitude, geonames_id, display_name, success FROM geocoding_cache WHERE query = ?",
            (query,)
        )
        row = cursor.fetchone()
        if row is None:
            return False, None  # Not in cache
        if row[4]:  # success flag
            return True, {
                'latitude': row[0],
                'longitude': row[1],
                'geonames_id': row[2],
                'display_name': row[3]
            }
        return True, None  # Cached failure

    def get(self, query: str) -> Optional[Dict]:
        """Retrieve cached geocoding result.

        Kept for backward compatibility; returns None for both a cache miss
        and a cached failure. Prefer lookup() when the distinction matters.
        """
        return self.lookup(query)[1]

    def put(self, query: str, result: Optional[Dict]):
        """Store geocoding result in cache.

        Args:
            query: The geocoding query string (primary key).
            result: Geocoding dict on success, or None to record a failure
                so the query is not retried later.
        """
        if result:
            self.conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                    (query, latitude, longitude, geonames_id, display_name, timestamp, success)
                VALUES (?, ?, ?, ?, ?, ?, 1)
            """, (
                query,
                result.get('latitude'),
                result.get('longitude'),
                result.get('geonames_id'),
                result.get('display_name'),
                datetime.now(timezone.utc).isoformat()
            ))
        else:
            # Cache failure to avoid retrying
            self.conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                    (query, latitude, longitude, geonames_id, display_name, timestamp, success)
                VALUES (?, NULL, NULL, NULL, NULL, ?, 0)
            """, (query, datetime.now(timezone.utc).isoformat()))
        self.conn.commit()

    def stats(self) -> Dict[str, int]:
        """Get cache statistics (total, successful, failed query counts)."""
        cursor = self.conn.execute("SELECT COUNT(*), SUM(success) FROM geocoding_cache")
        total, successful = cursor.fetchone()
        # SUM() is NULL on an empty table, hence the "or 0" guards.
        return {
            'total_queries': total or 0,
            'successful': successful or 0,
            'failed': (total or 0) - (successful or 0)
        }

    def close(self):
        """Close database connection."""
        self.conn.close()


class GlobalGeocoder:
    """Geocode heritage institutions with caching and rate limiting."""

    NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
    USER_AGENT = "GLAM-Heritage-Data-Extraction/1.0 (https://github.com/cultural-heritage/glam-extractor)"

    def __init__(self, cache_file: Path, rate_limit: float = 1.0):
        """
        Initialize geocoder.

        Args:
            cache_file: Path to SQLite cache database
            rate_limit: Minimum seconds between API calls
        """
        self.cache = GeocodingCache(cache_file)
        self.rate_limit = rate_limit
        self.last_request_time = 0.0
        self.stats = {
            'total': 0,
            'already_geocoded': 0,
            'cache_hits': 0,
            'api_calls': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0
        }

    def _wait_for_rate_limit(self):
        """Enforce rate limiting by sleeping out the remainder of the window."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.rate_limit:
            time.sleep(self.rate_limit - elapsed)
        self.last_request_time = time.time()

    def _build_query(self, location: Dict, country: str) -> str:
        """
        Build geocoding query optimized for each country.

        Args:
            location: Location dict with city, region, country, etc.
            country: ISO 3166-1 alpha-2 country code (fallback when the
                location dict has no 'country' entry)

        Returns:
            Query string for geocoding API ('' when no usable parts exist)
        """
        city = location.get('city', '').strip()
        region = location.get('region', '').strip()
        country_code = location.get('country', country).strip()

        # Country-specific query optimization
        if country_code == 'JP':
            # Japanese cities often include administrative level (e.g., "SAPPORO SHI KITA KU")
            # Remove " SHI" suffix for better matching
            if city:
                city_clean = (
                    city.replace(' SHI', '')
                        .replace(' KU', '')
                        .replace(' CHO', '')
                        .replace(' MURA', '')
                )
                if region:
                    # Region is prefecture (e.g., "HOKKAIDO")
                    return f"{city_clean}, {region}, Japan"
                else:
                    return f"{city_clean}, Japan"
        elif country_code == 'NL':
            # Dutch addresses: prioritize postal code + city
            postal_code = location.get('postal_code', '').strip()
            if postal_code and city:
                return f"{postal_code}, {city}, Netherlands"
            elif city:
                return f"{city}, Netherlands"
        elif country_code == 'MX':
            # Mexican locations: city + state + country
            if city and region:
                return f"{city}, {region}, Mexico"
            elif city:
                return f"{city}, Mexico"
        elif country_code == 'BR':
            # Brazilian locations: city + state abbreviation + country
            if city and region:
                return f"{city}, {region}, Brazil"
            elif city:
                return f"{city}, Brazil"
        elif country_code == 'CL':
            # Chilean locations: city + region + country
            if city and region:
                return f"{city}, {region}, Chile"
            elif city:
                return f"{city}, Chile"

        # Generic fallback: whatever parts we have, comma-joined
        parts = []
        if city:
            parts.append(city)
        if region:
            parts.append(region)
        if country_code:
            parts.append(country_code)
        return ', '.join(parts) if parts else ''

    def geocode_location(self, location: Dict, country: str, dry_run: bool = False) -> Optional[Dict]:
        """
        Geocode a single location.

        Args:
            location: Location dict from institution record
            country: ISO country code
            dry_run: If True, don't make API calls

        Returns:
            Dict with latitude, longitude, geonames_id (if available),
            or None on failure / skip.
        """
        query = self._build_query(location, country)
        if not query:
            self.stats['skipped'] += 1
            return None

        # Check cache first. lookup() distinguishes a cached FAILURE (found,
        # result=None) from a true miss, so known-bad queries are never
        # re-sent to the API.
        found, cached_result = self.cache.lookup(query)
        if found:
            self.stats['cache_hits'] += 1
            return cached_result

        if dry_run:
            print(f" [DRY RUN] Would geocode: {query}")
            return None

        # Make API call
        self._wait_for_rate_limit()
        self.stats['api_calls'] += 1
        try:
            params = {
                'q': query,
                'format': 'json',
                'limit': 1,
                'addressdetails': 1,
                'extratags': 1
            }
            headers = {
                'User-Agent': self.USER_AGENT
            }
            response = requests.get(
                self.NOMINATIM_URL,
                params=params,
                headers=headers,
                timeout=10
            )
            response.raise_for_status()
            results = response.json()

            if results and len(results) > 0:
                location_result = results[0]
                result = {
                    'latitude': float(location_result['lat']),
                    'longitude': float(location_result['lon']),
                    'display_name': location_result.get('display_name', '')
                }
                # Extract geonames_id if available
                extratags = location_result.get('extratags')
                if extratags and isinstance(extratags, dict) and 'geonames:id' in extratags:
                    result['geonames_id'] = int(extratags['geonames:id'])
                self.cache.put(query, result)
                self.stats['successful'] += 1
                return result
            else:
                # No result found - cache failure
                self.cache.put(query, None)
                self.stats['failed'] += 1
                return None
        except (requests.RequestException, ValueError, KeyError) as e:
            print(f" āš ļø Geocoding error for '{query}': {e}")
            # Don't cache errors - allow retry later
            self.stats['failed'] += 1
            return None

    def geocode_institution(self, institution: Dict, dry_run: bool = False,
                            force: bool = False, verbose: bool = False) -> bool:
        """
        Geocode all locations for an institution.

        Args:
            institution: Institution record
            dry_run: If True, don't make API calls or modify data
            force: If True, re-geocode institutions that already have coordinates
            verbose: If True, print detailed progress

        Returns:
            True if any location was updated
        """
        self.stats['total'] += 1
        locations = institution.get('locations', [])
        if not locations:
            self.stats['skipped'] += 1
            return False

        country = locations[0].get('country', 'Unknown')
        updated = False

        for i, location in enumerate(locations):
            # Skip if already geocoded (unless --force). Compare against None
            # explicitly: 0.0 is a valid latitude/longitude (equator / prime
            # meridian) and must not be treated as "missing".
            if (not force
                    and location.get('latitude') is not None
                    and location.get('longitude') is not None):
                self.stats['already_geocoded'] += 1
                if verbose:
                    print(f" āœ“ Location {i+1} already geocoded")
                continue

            if verbose:
                query = self._build_query(location, country)
                print(f" šŸŒ Geocoding location {i+1}: {query}")

            result = self.geocode_location(location, country, dry_run=dry_run)
            if result and not dry_run:
                location['latitude'] = result['latitude']
                location['longitude'] = result['longitude']
                if 'geonames_id' in result:
                    location['geonames_id'] = result['geonames_id']
                updated = True
                if verbose:
                    print(f" āœ“ ({result['latitude']:.4f}, {result['longitude']:.4f})")
            elif verbose and not dry_run:
                print(f" āœ— Geocoding failed")

        return updated

    def print_stats(self):
        """Print geocoding statistics."""
        cache_stats = self.cache.stats()
        print("\n" + "=" * 80)
        print("GEOCODING STATISTICS")
        print("=" * 80)
        print(f"Institutions processed: {self.stats['total']:,}")
        print(f"Already geocoded (skipped): {self.stats['already_geocoded']:,}")
        print(f"Cache hits: {self.stats['cache_hits']:,}")
        print(f"API calls: {self.stats['api_calls']:,}")
        print(f"Successful geocoding: {self.stats['successful']:,}")
        print(f"Failed geocoding: {self.stats['failed']:,}")
        print(f"Skipped (no location): {self.stats['skipped']:,}")
        print(f"\nCache Statistics:")
        print(f"Total cached queries: {cache_stats['total_queries']:,}")
        print(f"Successful: {cache_stats['successful']:,}")
        print(f"Failed: {cache_stats['failed']:,}")
        print("=" * 80)

    def close(self):
        """Close resources."""
        self.cache.close()


def main():
    parser = argparse.ArgumentParser(
        description="Geocode global heritage institutions using Nominatim API"
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help="Show what would be geocoded without making API calls"
    )
    parser.add_argument(
        '--limit', type=int,
        help="Only geocode first N institutions (for testing)"
    )
    parser.add_argument(
        '--country', type=str,
        help="Only geocode institutions from specific country (e.g., JP, NL, MX)"
    )
    parser.add_argument(
        '--force', action='store_true',
        help="Re-geocode institutions that already have coordinates"
    )
    parser.add_argument(
        '--verbose', action='store_true',
        help="Show detailed progress for each institution"
    )
    args = parser.parse_args()

    # Paths
    base_dir = Path(__file__).parent.parent
    global_file = base_dir / 'data' / 'instances' / 'global' / 'global_heritage_institutions.yaml'
    cache_file = base_dir / 'data' / 'cache' / 'geocoding_cache.db'

    # Create cache directory if needed
    cache_file.parent.mkdir(parents=True, exist_ok=True)

    # Load global dataset
    print(f"Loading global dataset from {global_file}")
    with open(global_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"Loaded {len(institutions):,} institutions")

    # Filter by country if specified
    if args.country:
        institutions = [
            inst for inst in institutions
            if inst.get('locations') and inst['locations'][0].get('country') == args.country
        ]
        print(f"Filtered to {len(institutions):,} institutions in {args.country}")

    # Limit if specified (the FULL dataset is always what gets saved)
    institutions_to_process = institutions
    if args.limit:
        institutions_to_process = institutions[:args.limit]
        print(f"āš ļø Processing only first {len(institutions_to_process):,} institutions (--limit flag)")
        print(f" Full dataset of {len(institutions):,} institutions will be preserved")

    if args.dry_run:
        print("\nāš ļø DRY RUN MODE - No API calls or file modifications will be made")

    # Initialize geocoder
    print(f"Initializing geocoder with cache at {cache_file}")
    geocoder = GlobalGeocoder(cache_file)

    # Show cache stats
    cache_stats = geocoder.cache.stats()
    print(f"Cache contains {cache_stats['total_queries']:,} queries ({cache_stats['successful']:,} successful)")

    # Geocode institutions
    print(f"\nGeocoding institutions...")
    print("=" * 80)

    updated_count = 0
    start_time = time.time()

    # try/finally ensures the SQLite cache connection is closed even if an
    # exception escapes the processing loop or the save steps.
    try:
        for i, institution in enumerate(institutions_to_process, 1):
            name = institution.get('name', 'Unknown')
            ghcid = institution.get('ghcid', 'Unknown')
            country = institution.get('locations', [{}])[0].get('country', '??') if institution.get('locations') else '??'

            if args.verbose or i % 100 == 0 or i <= 10:
                print(f"\n[{i}/{len(institutions_to_process)}] {country} - {name}")
                print(f" GHCID: {ghcid}")

            updated = geocoder.geocode_institution(
                institution,
                dry_run=args.dry_run,
                force=args.force,
                verbose=args.verbose
            )
            if updated:
                updated_count += 1

            # Save progress every 100 institutions
            if not args.dry_run and i % 100 == 0 and updated_count > 0:
                print(f"\nšŸ’¾ Saving progress at {i}/{len(institutions_to_process)} institutions...")
                with open(global_file, 'w', encoding='utf-8') as f:
                    yaml.dump(
                        institutions,  # Save FULL dataset, not just processed subset
                        f,
                        default_flow_style=False,
                        allow_unicode=True,
                        sort_keys=False,
                        width=120
                    )
                print(f"āœ… Progress saved ({updated_count:,} institutions updated so far)")

            # Progress indicator (without newline)
            if not args.verbose and i % 10 == 0:
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                remaining = (len(institutions_to_process) - i) / rate if rate > 0 else 0
                print(f"\rProgress: {i}/{len(institutions_to_process)} ({i/len(institutions_to_process)*100:.1f}%) | "
                      f"Rate: {rate:.1f}/sec | ETA: {remaining/60:.1f} min", end='', flush=True)

        print()  # Newline after progress

        # Print statistics
        geocoder.print_stats()

        # Save updated dataset (final save)
        if not args.dry_run and updated_count > 0:
            output_file = global_file
            print(f"\nšŸ’¾ Saving final results to {output_file}")
            with open(output_file, 'w', encoding='utf-8') as f:
                yaml.dump(
                    institutions,
                    f,
                    default_flow_style=False,
                    allow_unicode=True,
                    sort_keys=False,
                    width=120
                )
            print(f"āœ… Saved {len(institutions):,} institutions ({updated_count:,} updated with coordinates)")
        elif args.dry_run:
            print(f"\nāš ļø DRY RUN - No files were modified")
            print(f"Would have updated {updated_count:,} institutions")
        else:
            print(f"\nāœ“ No institutions needed updating")
    finally:
        # Close geocoder
        geocoder.close()

    # Final summary
    elapsed = time.time() - start_time
    print(f"\nTotal execution time: {elapsed/60:.1f} minutes")
    if geocoder.stats['api_calls'] > 0 and elapsed > 0:
        print(f"Average API call rate: {geocoder.stats['api_calls']/elapsed:.2f} requests/second")


if __name__ == '__main__':
    main()