#!/usr/bin/env python3
"""
Geocode Canadian heritage institutions using GeoNames database.

Adds latitude, longitude, and GeoNames IDs to location records.
Uses offline GeoNames database for fast, reliable lookups.
"""

import json
import sys
import time
from pathlib import Path
from typing import List, Dict, Any, Optional

import requests

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.geocoding import GeoNamesDB

# Canadian amalgamated municipality mappings
# Cities that merged into larger municipalities but still appear in historical records
CANADIAN_CITY_ALIASES = {
    # Ontario amalgamations (1998 Toronto megacity merger)
    "North York": "Toronto",
    "Scarborough": "Toronto",
    "East York": "Toronto",
    "Etobicoke": "Toronto",
    "York": "Toronto",
    # Quebec amalgamations
    "Ste-Foy": "Quebec",
    "Sainte-Foy": "Quebec",
    "Sillery": "Quebec",
    "Cap-Rouge": "Quebec",
    # Ontario - Greater Sudbury (2001)
    "Sudbury": "Greater Sudbury",
    # Other known amalgamations
    # The two St. Catharines entries point at each other on purpose: the
    # lookup tries the alias first and then the name as recorded, so either
    # spelling is found regardless of which one the database stores.
    "St. Catharines": "St Catharines",  # Punctuation normalization
    "St Catharines": "St. Catharines",
}


def geocode_with_nominatim(city: str, region: str, country: str) -> Optional[Dict[str, Any]]:
    """
    Geocode using Nominatim API (fallback for small communities not in GeoNames).

    Rate limit: 1 request per second per Nominatim usage policy. Every call
    sleeps before issuing its request, so callers need no throttling of
    their own.

    Args:
        city: City name
        region: Province/state name
        country: Country code (e.g., 'CA')

    Returns:
        Dict with latitude, longitude, display_name if found, None otherwise
        (including when the request or response parsing fails).
    """
    # Respect Nominatim rate limit
    time.sleep(1.1)  # 1 second + buffer

    base_url = "https://nominatim.openstreetmap.org/search"

    # Build query: "city, region, country"
    query = f"{city}, {region}, {country}"

    params: Dict[str, Any] = {
        'q': query,
        'format': 'json',
        'limit': 1,
        'addressdetails': 1
    }
    # A trailing "CA" in free text is ambiguous (Canada vs. California), so
    # also pass the country as a hard filter when it looks like an ISO
    # 3166-1 alpha-2 code.
    if country and len(country) == 2 and country.isalpha():
        params['countrycodes'] = country.lower()

    headers = {
        'User-Agent': 'GLAM-Heritage-Project/1.0 (https://github.com/example/glam-heritage)'
    }

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()

        results = response.json()
        if results:
            result = results[0]
            return {
                'latitude': float(result['lat']),
                'longitude': float(result['lon']),
                'display_name': result['display_name']
            }
    except Exception as e:
        # Deliberately broad: this is a best-effort fallback inside a long
        # batch run, so any network or parsing failure is logged and treated
        # as "not found" instead of aborting the whole job.
        print(f" Nominatim error for {query}: {e}")

    return None


def geocode_institution(institution: Dict[str, Any], geonames_db: GeoNamesDB, use_nominatim: bool = False) -> bool:
    """
    Geocode a single institution by looking up its city.

    Only the first entry of ``institution['locations']`` is considered, and
    it is updated in place with ``latitude``/``longitude`` (plus
    ``geonames_id`` for GeoNames hits, or ``geocoding_source`` for Nominatim
    hits).

    Args:
        institution: Institution record from Canadian dataset
        geonames_db: GeoNames database instance
        use_nominatim: If True, query the Nominatim API when the GeoNames
            lookup fails and the location has a region

    Returns:
        True if geocoded successfully, False otherwise
    """
    if not institution.get('locations'):
        return False

    location = institution['locations'][0]  # Canadian institutions have single location
    city = location.get('city')
    region = location.get('region')
    country = location.get('country', 'CA')

    if not city:
        return False

    # Apply amalgamation mapping if needed
    original_city = city
    if city in CANADIAN_CITY_ALIASES:
        city = CANADIAN_CITY_ALIASES[city]
        print(f" Mapping {original_city} → {city}")

    # Look up city in GeoNames
    city_info = geonames_db.lookup_city(city, country)
    if not city_info and city != original_city:
        # The alias is not in the database; retry with the name as recorded.
        # This is what makes the two-way punctuation aliases work.
        city_info = geonames_db.lookup_city(original_city, country)

    if city_info:
        # Add geocoding data
        location['latitude'] = city_info.latitude
        location['longitude'] = city_info.longitude
        location['geonames_id'] = str(city_info.geonames_id)

        # Verify admin1 (province) matches if available
        if city_info.admin1_name and region:
            if city_info.admin1_name.lower() != region.lower():
                # Province mismatch - log but still use coordinates
                print(f" Warning: Province mismatch for {city}: "
                      f"expected {region}, GeoNames has {city_info.admin1_name}")

        return True

    # GeoNames failed - try Nominatim fallback if enabled
    if use_nominatim and region:
        print(f" GeoNames failed for {city}, {region} - trying Nominatim...")
        nominatim_result = geocode_with_nominatim(city, region, country)

        if nominatim_result:
            location['latitude'] = nominatim_result['latitude']
            location['longitude'] = nominatim_result['longitude']
            location['geocoding_source'] = 'Nominatim'
            print(f" ✓ Nominatim success: {city}, {region}")
            return True
        else:
            print(f" ✗ Nominatim also failed: {city}, {region}")
    else:
        print(f" Geocoding failed: {city}, {country} not found in GeoNames")

    return False


def geocode_canadian_institutions(
    input_file: Path,
    output_file: Path,
    geonames_db_path: Optional[Path] = None,
    use_nominatim: bool = False
) -> Dict[str, Any]:
    """
    Geocode all Canadian heritage institutions.

    Loads the institution list from ``input_file``, geocodes every record
    that has a location but no coordinates yet, writes the full list to
    ``output_file`` and prints a summary. Exits the process with status 1 if
    the GeoNames database cannot be opened.

    Args:
        input_file: Path to canadian_heritage_custodians.json
        output_file: Path to write geocoded output
        geonames_db_path: Optional path to GeoNames database
        use_nominatim: If True, fall back to the Nominatim API for cities
            that are not found in GeoNames (rate limited to ~1 request/sec)

    Returns:
        Statistics dictionary with counts
    """
    print(f"Loading Canadian institutions from {input_file}")

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)

    print(f"Loaded {len(institutions):,} institutions")
    print("Initializing GeoNames database...")

    # Initialize GeoNames database
    try:
        geonames_db = GeoNamesDB(db_path=geonames_db_path)
        stats = geonames_db.get_stats()
        print(f" GeoNames DB: {stats['total_cities']:,} cities, "
              f"{stats['total_countries']} countries")
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Run scripts/build_geonames_db.py first to create the database")
        sys.exit(1)

    print("\nGeocoding institutions...")
    if use_nominatim:
        print(" Nominatim fallback: ENABLED (rate limit: 1 req/sec)")
    else:
        print(" Nominatim fallback: DISABLED")

    geocoded_count = 0
    failed_count = 0
    no_location_count = 0
    nominatim_count = 0

    for i, institution in enumerate(institutions, 1):
        if i % 1000 == 0:
            print(f" Progress: {i:,}/{len(institutions):,} "
                  f"({geocoded_count:,} geocoded, {failed_count} failed, {nominatim_count} via Nominatim)")

        if not institution.get('locations'):
            no_location_count += 1
            continue

        # Check if already geocoded. Truthiness is intentional: missing,
        # null and zero/empty placeholder coordinates are all re-geocoded.
        location = institution['locations'][0]
        if location.get('latitude') and location.get('longitude'):
            geocoded_count += 1
            # Count Nominatim geocoded institutions
            if location.get('geocoding_source') == 'Nominatim':
                nominatim_count += 1
            continue

        # Geocode. Remember a stale 'Nominatim' marker (one present without
        # coordinates) so a GeoNames hit is not miscounted as a Nominatim one.
        was_nominatim_before = location.get('geocoding_source') == 'Nominatim'
        if geocode_institution(institution, geonames_db, use_nominatim):
            geocoded_count += 1
            if location.get('geocoding_source') == 'Nominatim' and not was_nominatim_before:
                nominatim_count += 1
        else:
            failed_count += 1

    # Save geocoded data
    print(f"\nSaving geocoded data to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, indent=2, ensure_ascii=False)

    # Statistics
    stats = {
        'total_institutions': len(institutions),
        'geocoded': geocoded_count,
        'failed': failed_count,
        'no_location': no_location_count,
        'nominatim_geocoded': nominatim_count,
        'success_rate': geocoded_count / len(institutions) if institutions else 0
    }

    print("\n" + "="*60)
    print("GEOCODING COMPLETE")
    print("="*60)
    print(f"Total institutions: {stats['total_institutions']:>8,}")
    print(f"Successfully geocoded: {stats['geocoded']:>8,} ({stats['success_rate']:.1%})")
    print(f" - GeoNames: {stats['geocoded'] - stats['nominatim_geocoded']:>8,}")
    print(f" - Nominatim: {stats['nominatim_geocoded']:>8,}")
    print(f"Failed to geocode: {stats['failed']:>8}")
    print(f"No location data: {stats['no_location']:>8}")
    print("="*60)

    return stats


def main():
    """Main entry point: parse CLI flags, check inputs exist, run geocoding."""
    import argparse

    parser = argparse.ArgumentParser(description='Geocode Canadian heritage institutions')
    parser.add_argument('--nominatim', action='store_true',
                        help='Enable Nominatim API fallback for failed GeoNames lookups')
    args = parser.parse_args()

    project_root = Path(__file__).parent.parent
    input_file = project_root / "data/instances/canada/canadian_heritage_custodians.json"
    output_file = project_root / "data/instances/canada/canadian_heritage_custodians_geocoded.json"
    geonames_db = project_root / "data/reference/geonames.db"

    if not input_file.exists():
        print(f"Error: Input file not found: {input_file}")
        sys.exit(1)

    if not geonames_db.exists():
        print(f"Error: GeoNames database not found: {geonames_db}")
        print("Run scripts/build_geonames_db.py first to create the database")
        sys.exit(1)

    # Create output directory if needed
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Run geocoding
    stats = geocode_canadian_institutions(input_file, output_file, geonames_db, use_nominatim=args.nominatim)

    print(f"\nOutput written to: {output_file}")
    print(f"File size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")

    if not args.nominatim and stats['failed'] > 0:
        print(f"\nTip: {stats['failed']} institutions still need geocoding.")
        print("Run with --nominatim flag to use Nominatim API fallback for small communities.")


if __name__ == '__main__':
    main()