glam/scripts/enrich_cities_google.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

459 lines
15 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich custodian files with city/region data using Google Places API.
This is a generic script that works for any country's XXX files.
Usage:
python scripts/enrich_cities_google.py --country KR [--dry-run] [--limit N]
python scripts/enrich_cities_google.py --country AR [--dry-run] [--limit N]
python scripts/enrich_cities_google.py --all [--dry-run] [--limit N]
Environment Variables:
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
"""
import os
import sys
import time
import sqlite3
import re
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import yaml
import httpx
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Google Places API
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
REQUEST_DELAY = 0.3
# Country name mapping for search queries
COUNTRY_NAMES = {
'KR': 'South Korea',
'AR': 'Argentina',
'US': 'United States',
'IN': 'India',
'JM': 'Jamaica',
'UZ': 'Uzbekistan',
'UA': 'Ukraine',
'TJ': 'Tajikistan',
'OM': 'Oman',
'NL': 'Netherlands',
'NA': 'Namibia',
'ML': 'Mali',
'LK': 'Sri Lanka',
'LB': 'Lebanon',
'IT': 'Italy',
'IR': 'Iran',
'EC': 'Ecuador',
'DK': 'Denmark',
'CU': 'Cuba',
'CO': 'Colombia',
'BR': 'Brazil',
'MX': 'Mexico',
'JP': 'Japan',
'CZ': 'Czech Republic',
'DE': 'Germany',
'FR': 'France',
'GB': 'United Kingdom',
}
def get_city_code(city_name: str) -> str:
    """Generate a short (up to 3-letter, uppercase) city code from a city name.

    Common administrative suffixes (e.g. Japanese '-shi', English ' City')
    are stripped first, then the code is derived:
      - one word:   first three letters        ("Tokyo"          -> "TOK")
      - two words:  initial + two letters      ("Buenos Aires"   -> "BAI")
      - 3+ words:   initials of first three    ("Rio de Janeiro" -> "RDJ")

    Returns 'XXX' (the unresolved-city placeholder used in GHCIDs) when the
    name is empty or consists only of whitespace/suffixes — the original
    implementation returned an empty string in that case, which would have
    produced a malformed GHCID.
    """
    name = city_name.strip()
    # Remove common suffixes (each checked once, in order).
    for suffix in (' City', ' Town', '-shi', '-ku', '-gun', '-cho', ' District'):
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    words = name.split()
    if not words:
        # Defensive fallback: nothing left to derive a code from.
        return 'XXX'
    if len(words) == 1:
        return name[:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
def search_google_places(query: str, api_key: str) -> Optional[dict]:
    """Run a Google Places text search and return the top-ranked result.

    Returns the first place dict from the response, or None when nothing
    matched or the request failed. Failures are printed and swallowed so a
    single bad lookup does not abort a batch run.
    """
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
    }
    body = {
        "textQuery": query,
        "languageCode": "en"
    }
    try:
        resp = httpx.post(TEXT_SEARCH_URL, json=body, headers=request_headers, timeout=30)
        resp.raise_for_status()
        candidates = resp.json().get("places", [])
        return candidates[0] if candidates else None
    except Exception as e:
        # Best-effort: any failure (transport, HTTP status, bad JSON) is
        # reported and treated as "not found".
        print(f" Error searching Google Places: {e}")
        return None
def extract_location_from_google(place: dict) -> dict:
    """Flatten a Google Places result into a simple location dict.

    Pulls out coordinates, formatted address, place id and website, and scans
    the address components for a city (``locality``, falling back to
    ``sublocality_level_1``) and a region (``administrative_area_level_1``).
    Fields absent from the result stay None; a falsy *place* yields the
    all-None skeleton.
    """
    info = {
        'city': None,
        'region': None,
        'latitude': None,
        'longitude': None,
        'formatted_address': None,
        'place_id': None,
        'website': None,
    }
    if not place:
        return info

    info['place_id'] = place.get('id')
    info['formatted_address'] = place.get('formattedAddress')
    info['website'] = place.get('websiteUri')

    coords = place.get('location', {})
    info['latitude'] = coords.get('latitude')
    info['longitude'] = coords.get('longitude')

    for component in place.get('addressComponents', []):
        component_types = component.get('types', [])
        text = component.get('longText', '')
        if 'locality' in component_types:
            info['city'] = text
        elif 'administrative_area_level_1' in component_types:
            info['region'] = text
        elif 'sublocality_level_1' in component_types and not info['city']:
            # Sublocality only fills in when no locality has been seen yet.
            info['city'] = text
    return info
def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, country_code: str) -> Optional[dict]:
    """Reverse geocode coordinates to the nearest populated place in GeoNames.

    Nearest is measured by flat squared degree distance (no great-circle
    correction), which is good enough for picking the closest city inside a
    single country. Only populated-place feature codes are considered.
    Returns a dict of the city's attributes, or None if the country has no
    matching rows.
    """
    query = """
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as dist_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """
    row = conn.execute(query, (lat, lat, lon, lon, country_code)).fetchone()
    if row is None:
        return None
    # Map the first nine selected columns to names (dist_sq is dropped).
    keys = ('name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
            'latitude', 'longitude', 'population', 'feature_code')
    return dict(zip(keys, row))
def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str:
    """Derive a short (2-3 character, uppercase) region code from GeoNames admin1 data.

    Preference order:
      1. ``admin1_code`` itself when it is already short (<= 3 chars),
         e.g. ISO-style numeric or letter codes.
      2. An abbreviation built from ``admin1_name``: first two letters for a
         single word, initials of the first two words otherwise.
      3. The first two characters of a long ``admin1_code``.
    Returns 'XX' (unknown region) when no admin1_code is available.

    ``country_code`` is accepted for interface stability but is not used in
    the derivation.

    Fix vs. the original: a whitespace-only ``admin1_name`` previously fell
    into the multi-word branch and returned an empty string; it now falls
    back to the admin1_code prefix.
    """
    if not admin1_code:
        return 'XX'
    # For most countries the GeoNames admin1_code is already a usable code.
    if len(admin1_code) <= 3:
        return admin1_code.upper()
    if admin1_name:
        words = admin1_name.split()
        if len(words) == 1:
            return admin1_name[:2].upper()
        if len(words) >= 2:
            return ''.join(w[0] for w in words[:2]).upper()
        # words == []: admin1_name was only whitespace; fall through.
    return admin1_code[:2].upper()
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
                 country_code: str, country_name: str, dry_run: bool = False) -> dict:
    """Resolve city/region for one custodian YAML file and rewrite it.

    Pipeline: load YAML -> confirm the GHCID still carries the unresolved
    city placeholder ('XXX') -> geocode the institution name via Google
    Places -> reverse geocode the coordinates against GeoNames -> build a
    new GHCID, record the resolution/enrichment metadata, rewrite the file,
    and rename it to match the new GHCID.

    Returns a result dict whose 'status' is one of: 'updated',
    'would_update' (dry run), 'skipped', 'error', 'collision'; other keys
    carry diagnostics (old/new GHCID, city, region, error message).
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'region': None,
        'error': None,
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result
    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result
    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid
    # Match both patterns:
    # 1. {country}-XX-XXX-... (no region, no city)
    # 2. {country}-{region}-XXX-... (has region, no city)
    # Any two uppercase letters in the region slot are accepted; only the
    # city slot must literally be 'XXX'.
    xxx_pattern = re.compile(rf'^{country_code}-[A-Z]{{2}}-XXX-')
    if not xxx_pattern.match(old_ghcid):
        result['status'] = 'skipped'
        result['error'] = f'Not a {country_code}-*-XXX file'
        return result
    # Get institution name: preferred claim value, falling back to the
    # originally imported entry.
    name = data.get('custodian_name', {}).get('claim_value', '')
    if not name:
        name = data.get('original_entry', {}).get('name', '')
    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result
    # Search Google Places, appending the country name to disambiguate.
    search_query = f"{name} {country_name}"
    print(f" Searching: {name[:50]}...")
    place = search_google_places(search_query, api_key)
    time.sleep(REQUEST_DELAY)  # throttle to stay under API rate limits
    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result
    location_info = extract_location_from_google(place)
    # NOTE(review): truthiness treats a latitude/longitude of exactly 0.0 as
    # missing; harmless in practice, but `is None` would be more precise.
    if not location_info['latitude'] or not location_info['longitude']:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result
    # Reverse geocode the Google coordinates to the nearest GeoNames city.
    city_info = lookup_city_geonames(conn, location_info['latitude'],
                                     location_info['longitude'], country_code)
    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result
    region_code = get_region_code(city_info['admin1_code'], country_code, city_info['admin1_name'])
    city_code = get_city_code(city_info['ascii_name'])
    result['city'] = city_info['ascii_name']
    result['region'] = city_info['admin1_name']
    # Build new GHCID. Layout: {CC}-{region}-{city}-{inst_type}-{abbreviation},
    # where the abbreviation may itself contain hyphens (hence the re-join).
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result
    new_ghcid = f'{country_code}-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid
    if dry_run:
        result['status'] = 'would_update'
        return result
    # Update the data: current GHCID plus provenance blocks for the
    # resolution method and the raw Google enrichment.
    timestamp = datetime.now(timezone.utc).isoformat()
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': country_code,
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }
    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }
    # Update GHCID history: close out any still-open entry for the old
    # GHCID, then append the new one.
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp
    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })
    # Keep any GHCID identifier claims in sync with the new value.
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid
    # Write and rename.
    # NOTE(review): the file is rewritten *before* the collision check below,
    # so on a 'collision' outcome the old-named file already contains the new
    # GHCID — confirm this is intended.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename
    if filepath != new_filepath and not new_filepath.exists():
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)
    elif new_filepath.exists() and filepath != new_filepath:
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result
    result['status'] = 'updated'
    return result
def main():
    """CLI entry point: enrich *-XXX-* custodian files for one or all countries.

    Validates the API token and GeoNames DB, selects the target country
    list from --country/--all, then runs process_file over each matching
    YAML file, printing per-country and overall summaries.
    """
    parser = argparse.ArgumentParser(description='Enrich custodian files with Google Places data')
    parser.add_argument('--country', type=str, help='Country code (e.g., KR, AR, US)')
    parser.add_argument('--all', action='store_true', help='Process all countries with XXX files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files per country')
    args = parser.parse_args()
    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        sys.exit(1)
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)
    # Determine which countries to process
    if args.all:
        # Find all countries with XXX files (either XX-XXX or {region}-XXX);
        # only countries with a known English name are included, since the
        # name is part of the Google search query.
        countries = set()
        for f in CUSTODIAN_DIR.glob('*-*-XXX-*.yaml'):
            cc = f.name[:2]
            if cc in COUNTRY_NAMES:
                countries.add(cc)
        countries = sorted(countries)
    elif args.country:
        countries = [args.country.upper()]
    else:
        print("ERROR: Specify --country CODE or --all")
        sys.exit(1)
    # One shared read-only connection for all countries.
    conn = sqlite3.connect(str(GEONAMES_DB))
    total_stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    for country_code in countries:
        country_name = COUNTRY_NAMES.get(country_code, country_code)
        files = sorted(CUSTODIAN_DIR.glob(f'{country_code}-*-XXX-*.yaml'))
        if args.limit:
            files = files[:args.limit]
        if not files:
            continue
        print(f"\n{'='*60}")
        print(f"Processing {country_code} ({country_name}): {len(files)} files")
        print('='*60)
        stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
        for filepath in files:
            print(f"Processing: {filepath.name}")
            result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN,
                                  country_code, country_name, dry_run=args.dry_run)
            stats[result['status']] = stats.get(result['status'], 0) + 1
            if result['status'] in ('updated', 'would_update'):
                # NOTE(review): there appears to be no separator (e.g. ' -> ')
                # between the old and new GHCID here — possibly a character
                # lost in transit; confirm against the original file.
                print(f"{result['city']} ({result['region']}): {result['old_ghcid']}{result['new_ghcid']}")
            elif result['status'] == 'error':
                print(f"{result['error']}")
            elif result['status'] == 'collision':
                print(f"{result['error']}")
        print(f"\n{country_code} Summary: Updated={stats.get('updated', 0)}, "
              f"Would update={stats.get('would_update', 0)}, "
              f"Errors={stats.get('error', 0)}")
        for k, v in stats.items():
            total_stats[k] = total_stats.get(k, 0) + v
    conn.close()
    print()
    print('='*60)
    print('TOTAL Summary:')
    print(f" Updated: {total_stats.get('updated', 0)}")
    print(f" Would update: {total_stats.get('would_update', 0)}")
    print(f" Errors: {total_stats.get('error', 0)}")
    print(f" Collisions: {total_stats.get('collision', 0)}")
    print(f" Skipped: {total_stats.get('skipped', 0)}")