glam/scripts/geocode_canadian_institutions.py
2025-11-19 23:25:22 +01:00

307 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Geocode Canadian heritage institutions using GeoNames database.
Adds latitude, longitude, and GeoNames IDs to location records.
Uses offline GeoNames database for fast, reliable lookups.
"""
import json
import sys
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
import requests
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.geocoding import GeoNamesDB
# Alternate / historical Canadian place names, keyed by the name found in the
# source records and mapped to the name that is looked up instead. Most entries
# are former municipalities that merged into a larger city but still appear in
# older records.
CANADIAN_CITY_ALIASES = {
    # Ontario: 1998 Toronto "megacity" merger
    **dict.fromkeys(
        ("North York", "Scarborough", "East York", "Etobicoke", "York"),
        "Toronto",
    ),
    # Quebec amalgamations
    **dict.fromkeys(
        ("Ste-Foy", "Sainte-Foy", "Sillery", "Cap-Rouge"),
        "Quebec",
    ),
    # Ontario: Greater Sudbury (2001)
    "Sudbury": "Greater Sudbury",
    # Punctuation variants (not amalgamations).
    # NOTE(review): these two entries map each spelling to the *other* one, so
    # a record that already uses the spelling the lookup database expects is
    # rewritten to the spelling it does not expect - confirm which direction
    # is actually wanted.
    "St. Catharines": "St Catharines",
    "St Catharines": "St. Catharines",
}
def geocode_with_nominatim(city: str, region: str, country: str) -> Optional[Dict[str, Any]]:
    """
    Geocode using Nominatim API (fallback for small communities not in GeoNames).

    Rate limit: 1 request per second per Nominatim usage policy. This function
    sleeps before every request, so each call blocks for at least ~1.1 seconds.

    Args:
        city: City name
        region: Province/state name
        country: Country code (e.g., 'CA')

    Returns:
        Dict with 'latitude', 'longitude' (floats) and 'display_name' for the
        top hit, or None when nothing matched, the request failed, or the
        response could not be parsed. Failures are printed, never raised.
    """
    # Respect Nominatim rate limit
    time.sleep(1.1)  # 1 second + buffer

    base_url = "https://nominatim.openstreetmap.org/search"

    # Build query: "city, region, country"
    query = f"{city}, {region}, {country}"
    params = {
        'q': query,
        'format': 'json',
        'limit': 1,
        'addressdetails': 1,
    }
    # A bare two-letter code inside a free-form query is ambiguous (it can
    # also be read as a state/province abbreviation), so additionally restrict
    # the search to that country via the dedicated filter parameter.
    if isinstance(country, str) and len(country) == 2 and country.isalpha():
        params['countrycodes'] = country.lower()

    headers = {
        'User-Agent': 'GLAM-Heritage-Project/1.0 (https://github.com/example/glam-heritage)'
    }

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        results = response.json()
        if not results:
            return None
        top_hit = results[0]
        return {
            'latitude': float(top_hit['lat']),
            'longitude': float(top_hit['lon']),
            'display_name': top_hit['display_name'],
        }
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        # Network/HTTP problems, invalid JSON, or an unexpected payload shape:
        # this is a best-effort fallback, so report and carry on.
        print(f" Nominatim error for {query}: {e}")
        return None
def geocode_institution(institution: Dict[str, Any], geonames_db: GeoNamesDB, use_nominatim: bool = False) -> bool:
    """
    Geocode a single institution by looking up its city.

    Only the first entry of ``institution['locations']`` is considered; on
    success that location dict is updated in place with 'latitude' and
    'longitude' (plus 'geonames_id' for a GeoNames hit, or
    'geocoding_source' = 'Nominatim' for a Nominatim hit).

    Args:
        institution: Institution record from Canadian dataset
        geonames_db: GeoNames database instance
        use_nominatim: When True and the location has a region, fall back to
            the Nominatim API if the GeoNames lookup finds nothing.

    Returns:
        True if geocoded successfully, False otherwise
    """
    if not institution.get('locations'):
        return False

    location = institution['locations'][0]  # Canadian institutions have single location
    city = location.get('city')
    region = location.get('region')
    country = location.get('country', 'CA')

    if not city:
        return False

    # Apply amalgamation mapping if needed
    original_city = city
    if city in CANADIAN_CITY_ALIASES:
        city = CANADIAN_CITY_ALIASES[city]
        print(f" Mapping {original_city} → {city}")

    # Look up city in GeoNames
    city_info = geonames_db.lookup_city(city, country)
    if not city_info and city != original_city:
        # The alias may point at a spelling the database does not carry
        # (e.g. punctuation variants); retry with the name as recorded before
        # giving up on GeoNames.
        city_info = geonames_db.lookup_city(original_city, country)
        if city_info:
            city = original_city

    if city_info:
        # Add geocoding data
        location['latitude'] = city_info.latitude
        location['longitude'] = city_info.longitude
        location['geonames_id'] = str(city_info.geonames_id)

        # Verify admin1 (province) matches if available
        if city_info.admin1_name and region:
            if city_info.admin1_name.lower() != region.lower():
                # Province mismatch - log but still use coordinates
                print(f" Warning: Province mismatch for {city}: "
                      f"expected {region}, GeoNames has {city_info.admin1_name}")
        return True

    # GeoNames failed - try Nominatim fallback if enabled
    if use_nominatim and region:
        print(f" GeoNames failed for {city}, {region} - trying Nominatim...")
        nominatim_result = geocode_with_nominatim(city, region, country)
        if nominatim_result:
            location['latitude'] = nominatim_result['latitude']
            location['longitude'] = nominatim_result['longitude']
            location['geocoding_source'] = 'Nominatim'
            print(f" ✓ Nominatim success: {city}, {region}")
            return True
        else:
            print(f" ✗ Nominatim also failed: {city}, {region}")
    else:
        print(f" Geocoding failed: {city}, {country} not found in GeoNames")

    return False
def geocode_canadian_institutions(
    input_file: Path,
    output_file: Path,
    geonames_db_path: Optional[Path] = None,
    use_nominatim: bool = False
) -> Dict[str, Any]:
    """
    Geocode every institution in a JSON file and write the result back out.

    Records whose first location already has truthy 'latitude' and 'longitude'
    are counted as geocoded and left alone; records without any location are
    counted separately and skipped. Exits the process with status 1 if the
    GeoNames database file cannot be found.

    Args:
        input_file: Path to canadian_heritage_custodians.json
        output_file: Path to write geocoded output
        geonames_db_path: Optional path to GeoNames database
        use_nominatim: Passed through to geocode_institution to enable the
            Nominatim fallback

    Returns:
        Statistics dictionary with counts ('total_institutions', 'geocoded',
        'failed', 'no_location', 'nominatim_geocoded') and 'success_rate'
    """
    print(f"Loading Canadian institutions from {input_file}")
    with open(input_file, 'r', encoding='utf-8') as source:
        institutions = json.load(source)
    total = len(institutions)
    print(f"Loaded {total:,} institutions")

    print("Initializing GeoNames database...")
    try:
        geonames_db = GeoNamesDB(db_path=geonames_db_path)
        db_stats = geonames_db.get_stats()
        print(f" GeoNames DB: {db_stats['total_cities']:,} cities, "
              f"{db_stats['total_countries']} countries")
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Run scripts/build_geonames_db.py first to create the database")
        sys.exit(1)

    print("\nGeocoding institutions...")
    fallback_state = "ENABLED (rate limit: 1 req/sec)" if use_nominatim else "DISABLED"
    print(f" Nominatim fallback: {fallback_state}")

    done = 0            # records that end up with coordinates
    failures = 0        # records with a location that could not be resolved
    without_location = 0
    from_nominatim = 0  # subset of `done` whose source is Nominatim

    for position, record in enumerate(institutions, 1):
        # Periodic progress line (emitted before the current record is handled)
        if position % 1000 == 0:
            print(f" Progress: {position:,}/{total:,} "
                  f"({done:,} geocoded, {failures} failed, {from_nominatim} via Nominatim)")

        if not record.get('locations'):
            without_location += 1
            continue

        primary = record['locations'][0]
        tagged_nominatim = primary.get('geocoding_source') == 'Nominatim'

        # Already has coordinates: count it and move on
        if primary.get('latitude') and primary.get('longitude'):
            done += 1
            if tagged_nominatim:
                from_nominatim += 1
            continue

        if not geocode_institution(record, geonames_db, use_nominatim):
            failures += 1
            continue

        done += 1
        # Only count a Nominatim hit if the tag was added by this run
        if not tagged_nominatim and primary.get('geocoding_source') == 'Nominatim':
            from_nominatim += 1

    # Persist the (in-place updated) records
    print(f"\nSaving geocoded data to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(institutions, sink, indent=2, ensure_ascii=False)

    summary = {
        'total_institutions': total,
        'geocoded': done,
        'failed': failures,
        'no_location': without_location,
        'nominatim_geocoded': from_nominatim,
        'success_rate': done / total if institutions else 0
    }

    rule = "=" * 60
    print("\n" + rule)
    print("GEOCODING COMPLETE")
    print(rule)
    print(f"Total institutions: {summary['total_institutions']:>8,}")
    print(f"Successfully geocoded: {summary['geocoded']:>8,} ({summary['success_rate']:.1%})")
    print(f" - GeoNames: {summary['geocoded'] - summary['nominatim_geocoded']:>8,}")
    print(f" - Nominatim: {summary['nominatim_geocoded']:>8,}")
    print(f"Failed to geocode: {summary['failed']:>8}")
    print(f"No location data: {summary['no_location']:>8}")
    print(rule)

    return summary
def main():
    """Command-line entry point: resolve project paths, run geocoding, report."""
    import argparse

    cli = argparse.ArgumentParser(description='Geocode Canadian heritage institutions')
    cli.add_argument(
        '--nominatim',
        action='store_true',
        help='Enable Nominatim API fallback for failed GeoNames lookups',
    )
    options = cli.parse_args()

    # All paths are resolved relative to the directory above this script's folder
    root = Path(__file__).parent.parent
    source_path = root / "data/instances/canada/canadian_heritage_custodians.json"
    target_path = root / "data/instances/canada/canadian_heritage_custodians_geocoded.json"
    db_path = root / "data/reference/geonames.db"

    # Bail out early when a required input is missing
    if not source_path.exists():
        print(f"Error: Input file not found: {source_path}")
        sys.exit(1)

    if not db_path.exists():
        print(f"Error: GeoNames database not found: {db_path}")
        print("Run scripts/build_geonames_db.py first to create the database")
        sys.exit(1)

    # Create output directory if needed
    target_path.parent.mkdir(parents=True, exist_ok=True)

    summary = geocode_canadian_institutions(
        source_path,
        target_path,
        db_path,
        use_nominatim=options.nominatim,
    )

    size_mb = target_path.stat().st_size / 1024 / 1024
    print(f"\nOutput written to: {target_path}")
    print(f"File size: {size_mb:.1f} MB")

    # Suggest the online fallback only when it was off and something failed
    if summary['failed'] > 0 and not options.nominatim:
        print(f"\nTip: {summary['failed']} institutions still need geocoding.")
        print("Run with --nominatim flag to use Nominatim API fallback for small communities.")


if __name__ == '__main__':
    main()