Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
459 lines
15 KiB
Python
Executable file
459 lines
15 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Enrich custodian files with city/region data using Google Places API.

This is a generic script that works for any country's XXX files.

Usage:
    python scripts/enrich_cities_google.py --country KR [--dry-run] [--limit N]
    python scripts/enrich_cities_google.py --country AR [--dry-run] [--limit N]
    python scripts/enrich_cities_google.py --all [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
"""

import os
import sys
import time
import sqlite3
import re
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

import yaml
import httpx
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Configuration
# API key read from the environment (loaded from .env above); checked in main().
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
# Local SQLite database queried by lookup_city_geonames() — NOTE(review):
# hard-coded absolute paths; presumably a single-developer setup.
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
# Directory holding the per-institution custodian YAML files.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Google Places API
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
# Seconds slept after each Places request (simple client-side rate limiting).
REQUEST_DELAY = 0.3

# Country name mapping for search queries
# Only countries listed here are picked up by --all; --country accepts any
# code and falls back to using the bare code in the search query.
COUNTRY_NAMES = {
    'KR': 'South Korea',
    'AR': 'Argentina',
    'US': 'United States',
    'IN': 'India',
    'JM': 'Jamaica',
    'UZ': 'Uzbekistan',
    'UA': 'Ukraine',
    'TJ': 'Tajikistan',
    'OM': 'Oman',
    'NL': 'Netherlands',
    'NA': 'Namibia',
    'ML': 'Mali',
    'LK': 'Sri Lanka',
    'LB': 'Lebanon',
    'IT': 'Italy',
    'IR': 'Iran',
    'EC': 'Ecuador',
    'DK': 'Denmark',
    'CU': 'Cuba',
    'CO': 'Colombia',
    'BR': 'Brazil',
    'MX': 'Mexico',
    'JP': 'Japan',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'FR': 'France',
    'GB': 'United Kingdom',
}
|
|
|
|
|
|
def get_city_code(city_name: str) -> str:
    """Generate 3-letter city code from city name."""
    stripped = city_name.strip()
    # Drop common administrative suffixes. Checks run in sequence over the
    # same string, so a name carrying several of them loses each in turn.
    for tail in (' City', ' Town', '-shi', '-ku', '-gun', '-cho', ' District'):
        if stripped.endswith(tail):
            stripped = stripped[:-len(tail)]

    parts = stripped.split()

    if len(parts) == 2:
        # Two words: first initial plus first two letters of the second word.
        return (parts[0][0] + parts[1][:2]).upper()
    if len(parts) == 1:
        # Single word: its first three letters.
        return stripped[:3].upper()
    # Three or more words: initials of the first three.
    return ''.join(word[0] for word in parts[:3]).upper()
|
|
|
|
|
|
def search_google_places(query: str, api_key: str) -> Optional[dict]:
    """Query the Google Places text-search endpoint and return the top hit.

    Returns the first place dict from the response, or None when nothing
    matched or the request failed (failures are printed, never raised).
    """
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        # Field mask limits the response to the attributes we actually read.
        "X-Goog-FieldMask": (
            "places.displayName,places.formattedAddress,places.location,"
            "places.addressComponents,places.types,places.id,places.websiteUri"
        ),
    }
    body = {"textQuery": query, "languageCode": "en"}

    try:
        resp = httpx.post(TEXT_SEARCH_URL, json=body, headers=request_headers, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        if "places" in data and len(data["places"]) > 0:
            return data["places"][0]
        return None
    except Exception as e:
        # Best-effort: report and let the caller treat it as "not found".
        print(f" Error searching Google Places: {e}")
        return None
|
|
|
|
|
|
def extract_location_from_google(place: dict) -> dict:
    """Extract location information from a Google Places result.

    Always returns a dict with the full key set; values remain None when
    *place* is falsy or the corresponding field is absent.
    """
    info = dict.fromkeys(
        ('city', 'region', 'latitude', 'longitude',
         'formatted_address', 'place_id', 'website'))

    if not place:
        return info

    info['place_id'] = place.get('id')
    info['formatted_address'] = place.get('formattedAddress')
    info['website'] = place.get('websiteUri')

    coords = place.get('location', {})
    info['latitude'] = coords.get('latitude')
    info['longitude'] = coords.get('longitude')

    # Walk the address components: 'locality' supplies the city, the first
    # admin level supplies the region, and 'sublocality_level_1' only fills
    # the city slot when no locality has been seen yet.
    for component in place.get('addressComponents', []):
        component_types = component.get('types', [])
        text = component.get('longText', '')

        if 'locality' in component_types:
            info['city'] = text
        elif 'administrative_area_level_1' in component_types:
            info['region'] = text
        elif 'sublocality_level_1' in component_types and not info['city']:
            info['city'] = text

    return info
|
|
|
|
|
|
def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, country_code: str) -> Optional[dict]:
    """Reverse geocode coordinates to find nearest city in GeoNames.

    Nearest is ranked by squared degree distance (no longitude scaling),
    restricted to populated-place feature codes within the given country.
    Returns a dict of the city row, or None when the country has no match.
    """
    sql = """
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as dist_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """
    row = conn.execute(sql, (lat, lat, lon, lon, country_code)).fetchone()
    if row is None:
        return None

    # zip() pairs the first nine columns with their names and silently
    # drops the trailing dist_sq ranking column.
    columns = ('name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
               'latitude', 'longitude', 'population', 'feature_code')
    return dict(zip(columns, row))
|
|
|
|
|
|
def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str:
    """Get ISO-style region code from GeoNames admin1_code.

    Falls back to 'XX' when no code exists; short codes pass through
    unchanged, long ones are abbreviated from the region name (or from
    the code itself when no name is available).
    """
    if not admin1_code:
        return 'XX'

    # Short GeoNames codes (e.g. '11', 'TX') are already usable.
    if len(admin1_code) <= 3:
        return admin1_code.upper()

    if admin1_name:
        name_words = admin1_name.split()
        if len(name_words) == 1:
            # Single-word name: its first two letters.
            return admin1_name[:2].upper()
        # Multi-word name: initials of the first two words.
        return ''.join(word[0] for word in name_words[:2]).upper()

    return admin1_code[:2].upper()
|
|
|
|
|
|
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
                 country_code: str, country_name: str, dry_run: bool = False) -> dict:
    """Process a single custodian file.

    Resolves the institution's city/region via Google Places + GeoNames,
    rewrites the GHCID (replacing the XXX city placeholder), records the
    resolution metadata and history, and renames the file to the new GHCID.

    Args:
        filepath: Path to the custodian YAML file.
        conn: Open connection to the GeoNames SQLite database.
        api_key: Google Places API key.
        country_code: Two-letter country code (e.g. 'KR').
        country_name: Human-readable country name appended to the search query.
        dry_run: When True, report what would change without writing anything.

    Returns:
        A result dict whose 'status' is one of 'updated', 'would_update',
        'skipped', 'error' or 'collision', plus old/new GHCIDs, resolved
        city/region, and an error message when applicable.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'region': None,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result

    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result

    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid

    # Only files whose GHCID still carries the XXX city placeholder need
    # resolution. The [A-Z]{2} region slot matches both a real region code
    # and the XX placeholder, so both patterns are covered:
    #   1. {country}-XX-XXX-...      (no region, no city)
    #   2. {country}-{region}-XXX-... (has region, no city)
    xxx_pattern = re.compile(rf'^{country_code}-[A-Z]{{2}}-XXX-')
    if not xxx_pattern.match(old_ghcid):
        result['status'] = 'skipped'
        result['error'] = f'Not a {country_code}-*-XXX file'
        return result

    # Institution name: prefer the curated claim, fall back to the raw entry.
    name = data.get('custodian_name', {}).get('claim_value', '')
    if not name:
        name = data.get('original_entry', {}).get('name', '')

    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result

    # Geocode the institution via Google Places (rate-limited by REQUEST_DELAY).
    search_query = f"{name} {country_name}"
    print(f" Searching: {name[:50]}...")
    place = search_google_places(search_query, api_key)
    time.sleep(REQUEST_DELAY)

    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result

    location_info = extract_location_from_google(place)

    if not location_info['latitude'] or not location_info['longitude']:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result

    # Snap the Google coordinates to the nearest GeoNames populated place.
    city_info = lookup_city_geonames(conn, location_info['latitude'],
                                     location_info['longitude'], country_code)

    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result

    region_code = get_region_code(city_info['admin1_code'], country_code, city_info['admin1_name'])
    city_code = get_city_code(city_info['ascii_name'])

    result['city'] = city_info['ascii_name']
    result['region'] = city_info['admin1_name']

    # Rebuild the GHCID: CC-REGION-CITY-TYPE-ABBREVIATION. The abbreviation
    # may itself contain hyphens, so rejoin everything past the type segment.
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result
    inst_type = parts[3]
    abbreviation = '-'.join(parts[4:])

    new_ghcid = f'{country_code}-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Detect a filename collision BEFORE touching the file. Previously the
    # updated YAML was written first and the collision detected only at
    # rename time, which left the old-named file already containing the new
    # GHCID — an inconsistent state.
    new_filepath = filepath.parent / f'{new_ghcid}.yaml'
    if filepath != new_filepath and new_filepath.exists():
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result

    # Record the resolution on the document.
    timestamp = datetime.now(timezone.utc).isoformat()

    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': country_code,
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }

    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    # Close the still-open history entry for the old GHCID, then append the
    # new one.
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []

    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp

    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    # Keep any GHCID identifier claims in sync with the new value.
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid

    # Write the updated document, then rename the file to the new GHCID.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if filepath != new_filepath:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)

    result['status'] = 'updated'
    return result
|
|
|
|
|
|
def main():
    """CLI entry point: enrich XXX-placeholder custodian files per country."""
    parser = argparse.ArgumentParser(description='Enrich custodian files with Google Places data')
    parser.add_argument('--country', type=str, help='Country code (e.g., KR, AR, US)')
    parser.add_argument('--all', action='store_true', help='Process all countries with XXX files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files per country')
    args = parser.parse_args()

    # Fail fast on missing prerequisites: API key and local GeoNames DB.
    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        sys.exit(1)

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Determine which countries to process
    if args.all:
        # Find all countries with XXX files (either XX-XXX or {region}-XXX).
        # Filenames start with the 2-letter country code; only codes known to
        # COUNTRY_NAMES are included in the --all sweep.
        countries = set()
        for f in CUSTODIAN_DIR.glob('*-*-XXX-*.yaml'):
            cc = f.name[:2]
            if cc in COUNTRY_NAMES:
                countries.add(cc)
        countries = sorted(countries)
    elif args.country:
        countries = [args.country.upper()]
    else:
        print("ERROR: Specify --country CODE or --all")
        sys.exit(1)

    conn = sqlite3.connect(str(GEONAMES_DB))

    # Aggregate status counts across every processed country.
    total_stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}

    for country_code in countries:
        # Fall back to the bare code when the name mapping has no entry
        # (possible with an explicit --country not in COUNTRY_NAMES).
        country_name = COUNTRY_NAMES.get(country_code, country_code)

        files = sorted(CUSTODIAN_DIR.glob(f'{country_code}-*-XXX-*.yaml'))

        if args.limit:
            files = files[:args.limit]

        if not files:
            continue

        print(f"\n{'='*60}")
        print(f"Processing {country_code} ({country_name}): {len(files)} files")
        print('='*60)

        # Per-country status counts; unexpected statuses are absorbed via .get().
        stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}

        for filepath in files:
            print(f"Processing: {filepath.name}")
            result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN,
                                  country_code, country_name, dry_run=args.dry_run)
            stats[result['status']] = stats.get(result['status'], 0) + 1

            if result['status'] in ('updated', 'would_update'):
                print(f" ✓ {result['city']} ({result['region']}): {result['old_ghcid']} → {result['new_ghcid']}")
            elif result['status'] == 'error':
                print(f" ✗ {result['error']}")
            elif result['status'] == 'collision':
                print(f" ⚠ {result['error']}")

        print(f"\n{country_code} Summary: Updated={stats.get('updated', 0)}, "
              f"Would update={stats.get('would_update', 0)}, "
              f"Errors={stats.get('error', 0)}")

        # Roll the per-country counts into the grand totals.
        for k, v in stats.items():
            total_stats[k] = total_stats.get(k, 0) + v

    conn.close()

    print()
    print('='*60)
    print('TOTAL Summary:')
    print(f" Updated: {total_stats.get('updated', 0)}")
    print(f" Would update: {total_stats.get('would_update', 0)}")
    print(f" Errors: {total_stats.get('error', 0)}")
    print(f" Collisions: {total_stats.get('collision', 0)}")
    print(f" Skipped: {total_stats.get('skipped', 0)}")


if __name__ == '__main__':
    main()
|