307 lines
10 KiB
Python
307 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Geocode Canadian heritage institutions using the GeoNames database.

Adds latitude, longitude, and GeoNames IDs to location records.
Uses an offline GeoNames database for fast, reliable lookups; an optional
Nominatim API fallback (--nominatim) covers cities GeoNames does not find.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any, Optional
|
|
import requests
|
|
|
|
# Add src to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.geocoding import GeoNamesDB
|
|
|
|
|
|
# Alias table applied to a record's city name before the GeoNames lookup.
# Most entries are former municipalities that were merged into a larger city
# but still appear under their old name in historical records.
CANADIAN_CITY_ALIASES = {
    # Ontario: absorbed by the 1998 Toronto megacity merger
    **dict.fromkeys(
        ("North York", "Scarborough", "East York", "Etobicoke", "York"),
        "Toronto",
    ),
    # Quebec amalgamations
    **dict.fromkeys(
        ("Ste-Foy", "Sainte-Foy", "Sillery", "Cap-Rouge"),
        "Quebec",
    ),
    # Ontario: Greater Sudbury (2001)
    "Sudbury": "Greater Sudbury",
    # Punctuation normalization rather than an amalgamation: each spelling
    # maps to the other, so a record is looked up under the alternate form.
    "St. Catharines": "St Catharines",
    "St Catharines": "St. Catharines",
}
|
|
|
|
|
|
def geocode_with_nominatim(city: str, region: str, country: str) -> Optional[Dict[str, Any]]:
    """
    Resolve a place to coordinates through the Nominatim search API.

    Meant as a fallback for small communities that are not in GeoNames.
    Every call first sleeps 1.1 seconds so consecutive calls stay within the
    Nominatim usage policy of 1 request per second.

    Args:
        city: City name
        region: Province/state name
        country: Country code (e.g., 'CA')

    Returns:
        Dict with 'latitude' and 'longitude' (floats) and 'display_name'
        taken from the first search result, or None when there is no result
        or any error occurs (the error is printed, not raised).
    """
    # Throttle before the request: 1 second + buffer
    time.sleep(1.1)

    # Free-text query of the form "city, region, country"
    query = f"{city}, {region}, {country}"

    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={
                'q': query,
                'format': 'json',
                'limit': 1,
                'addressdetails': 1
            },
            headers={
                'User-Agent': 'GLAM-Heritage-Project/1.0 (https://github.com/example/glam-heritage)'
            },
            timeout=10,
        )
        response.raise_for_status()

        matches = response.json()
        if not matches or len(matches) == 0:
            return None

        top_hit = matches[0]
        return {
            'latitude': float(top_hit['lat']),
            'longitude': float(top_hit['lon']),
            'display_name': top_hit['display_name']
        }
    except Exception as e:
        # Best-effort fallback: report the problem and signal "not found"
        print(f" Nominatim error for {query}: {e}")

    return None
|
|
|
|
|
|
def geocode_institution(institution: Dict[str, Any], geonames_db: GeoNamesDB, use_nominatim: bool = False) -> bool:
    """
    Geocode a single institution by looking up its city.

    Only the first entry of ``institution['locations']`` is examined, and it
    is updated in place. The city name is first translated through
    CANADIAN_CITY_ALIASES and looked up in GeoNames; if the aliased name is
    not found, the name as written in the record is tried as well, so an
    alias can never hide a city that GeoNames knows under its original
    spelling.

    Args:
        institution: Institution record from Canadian dataset
        geonames_db: GeoNames database instance
        use_nominatim: When True and the location has a region, query the
            Nominatim API if GeoNames has no match

    Returns:
        True if geocoded successfully, False otherwise
    """
    locations = institution.get('locations')
    if not locations:
        return False

    # Only the first location is geocoded
    # (presumably Canadian records carry a single location; verify against the dataset)
    location = locations[0]

    city = location.get('city')
    region = location.get('region')
    country = location.get('country', 'CA')

    if not city:
        return False

    # Apply amalgamation mapping if needed
    original_city = city
    if city in CANADIAN_CITY_ALIASES:
        city = CANADIAN_CITY_ALIASES[city]
        print(f" Mapping {original_city} → {city}")

    # Look up city in GeoNames, preferring the aliased name
    city_info = geonames_db.lookup_city(city, country)

    if not city_info and city != original_city:
        # The alias missed; retry with the name exactly as recorded. This
        # matters for the spelling aliases, which map each variant to the
        # other and would otherwise replace a spelling GeoNames does know.
        city_info = geonames_db.lookup_city(original_city, country)
        if city_info:
            city = original_city

    if city_info:
        # Add geocoding data
        location['latitude'] = city_info.latitude
        location['longitude'] = city_info.longitude
        location['geonames_id'] = str(city_info.geonames_id)

        # Verify admin1 (province) matches if available
        if city_info.admin1_name and region:
            if city_info.admin1_name.lower() != region.lower():
                # Province mismatch - log but still use coordinates
                print(f" Warning: Province mismatch for {city}: "
                      f"expected {region}, GeoNames has {city_info.admin1_name}")

        return True

    # GeoNames failed - try Nominatim fallback if enabled
    if use_nominatim and region:
        print(f" GeoNames failed for {city}, {region} - trying Nominatim...")
        nominatim_result = geocode_with_nominatim(city, region, country)

        if nominatim_result:
            location['latitude'] = nominatim_result['latitude']
            location['longitude'] = nominatim_result['longitude']
            # Callers use this marker to count Nominatim-sourced coordinates
            location['geocoding_source'] = 'Nominatim'
            print(f" ✓ Nominatim success: {city}, {region}")
            return True
        else:
            print(f" ✗ Nominatim also failed: {city}, {region}")
    else:
        print(f" Geocoding failed: {city}, {country} not found in GeoNames")

    return False
|
|
|
|
|
|
def geocode_canadian_institutions(
    input_file: Path,
    output_file: Path,
    geonames_db_path: Optional[Path] = None,
    use_nominatim: bool = False
) -> Dict[str, Any]:
    """
    Geocode every institution in a JSON file and write the result.

    Records with no locations are counted and skipped. Records whose first
    location already has latitude and longitude are counted as geocoded and
    left alone. All others go through geocode_institution. The full list is
    then written to output_file and a summary is printed.

    Args:
        input_file: Path to canadian_heritage_custodians.json
        output_file: Path to write geocoded output
        geonames_db_path: Optional path to GeoNames database
        use_nominatim: Passed to geocode_institution to enable the Nominatim
            fallback for cities GeoNames does not find

    Returns:
        Statistics dictionary with the keys 'total_institutions', 'geocoded',
        'failed', 'no_location', 'nominatim_geocoded' and 'success_rate'.
        Exits the process with status 1 if GeoNamesDB raises FileNotFoundError.
    """
    print(f"Loading Canadian institutions from {input_file}")

    with open(input_file, 'r', encoding='utf-8') as source:
        institutions = json.load(source)

    total = len(institutions)
    print(f"Loaded {total:,} institutions")
    print("Initializing GeoNames database...")

    # Open the GeoNames database; a missing file is fatal
    try:
        geonames_db = GeoNamesDB(db_path=geonames_db_path)
        db_stats = geonames_db.get_stats()
        print(f" GeoNames DB: {db_stats['total_cities']:,} cities, "
              f"{db_stats['total_countries']} countries")
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Run scripts/build_geonames_db.py first to create the database")
        sys.exit(1)

    print("\nGeocoding institutions...")
    print(
        " Nominatim fallback: ENABLED (rate limit: 1 req/sec)"
        if use_nominatim
        else " Nominatim fallback: DISABLED"
    )

    geocoded_count = failed_count = no_location_count = nominatim_count = 0

    for position, institution in enumerate(institutions, 1):
        # Progress line every 1000 records, printed before the record is handled
        if position % 1000 == 0:
            print(f" Progress: {position:,}/{total:,} "
                  f"({geocoded_count:,} geocoded, {failed_count} failed, {nominatim_count} via Nominatim)")

        locations = institution.get('locations')
        if not locations:
            no_location_count += 1
            continue

        location = locations[0]

        # Already geocoded (e.g. by an earlier run): count it and move on
        if location.get('latitude') and location.get('longitude'):
            geocoded_count += 1
            if location.get('geocoding_source') == 'Nominatim':
                nominatim_count += 1
            continue

        # Remember the marker so only a fresh Nominatim hit is counted below
        had_nominatim_marker = location.get('geocoding_source') == 'Nominatim'

        if not geocode_institution(institution, geonames_db, use_nominatim):
            failed_count += 1
            continue

        geocoded_count += 1
        if location.get('geocoding_source') == 'Nominatim' and not had_nominatim_marker:
            nominatim_count += 1

    # Persist the (in-place updated) records
    print(f"\nSaving geocoded data to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(institutions, sink, indent=2, ensure_ascii=False)

    # Statistics
    stats = {
        'total_institutions': total,
        'geocoded': geocoded_count,
        'failed': failed_count,
        'no_location': no_location_count,
        'nominatim_geocoded': nominatim_count,
        'success_rate': geocoded_count / total if institutions else 0
    }

    rule = "=" * 60
    print("\n" + rule)
    print("GEOCODING COMPLETE")
    print(rule)
    print(f"Total institutions: {stats['total_institutions']:>8,}")
    print(f"Successfully geocoded: {stats['geocoded']:>8,} ({stats['success_rate']:.1%})")
    print(f" - GeoNames: {stats['geocoded'] - stats['nominatim_geocoded']:>8,}")
    print(f" - Nominatim: {stats['nominatim_geocoded']:>8,}")
    print(f"Failed to geocode: {stats['failed']:>8}")
    print(f"No location data: {stats['no_location']:>8}")
    print(rule)

    return stats
|
|
|
|
|
|
def main():
    """Command-line entry point: check inputs, run the geocoder, report the output."""
    import argparse

    parser = argparse.ArgumentParser(description='Geocode Canadian heritage institutions')
    parser.add_argument(
        '--nominatim',
        action='store_true',
        help='Enable Nominatim API fallback for failed GeoNames lookups',
    )
    args = parser.parse_args()

    # All paths are resolved relative to the directory above this script's folder
    project_root = Path(__file__).parent.parent

    input_file = project_root / "data/instances/canada/canadian_heritage_custodians.json"
    output_file = project_root / "data/instances/canada/canadian_heritage_custodians_geocoded.json"
    geonames_db = project_root / "data/reference/geonames.db"

    # Required inputs, checked in order; the first missing one aborts the run
    required_inputs = (
        (input_file, (f"Error: Input file not found: {input_file}",)),
        (
            geonames_db,
            (
                f"Error: GeoNames database not found: {geonames_db}",
                "Run scripts/build_geonames_db.py first to create the database",
            ),
        ),
    )
    for required_path, messages in required_inputs:
        if required_path.exists():
            continue
        for message in messages:
            print(message)
        sys.exit(1)

    # Create output directory if needed
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Run geocoding
    stats = geocode_canadian_institutions(input_file, output_file, geonames_db, use_nominatim=args.nominatim)

    size_mb = output_file.stat().st_size / 1024 / 1024
    print(f"\nOutput written to: {output_file}")
    print(f"File size: {size_mb:.1f} MB")

    # Suggest the online fallback only when it was off and something failed
    if not args.nominatim and stats['failed'] > 0:
        print(f"\nTip: {stats['failed']} institutions still need geocoding.")
        print("Run with --nominatim flag to use Nominatim API fallback for small communities.")


if __name__ == '__main__':
    main()
|