#!/usr/bin/env python3
"""Enrich Japanese custodian files with city/region data using Google Places API.

This script:
1. Finds Japanese XXX files (no city/region resolved)
2. Uses Google Places API to search for each institution
3. Extracts location data (city, prefecture, coordinates)
4. Updates GHCID with proper region/city codes
5. Adds Google Maps enrichment data

Usage:
    python scripts/enrich_japanese_cities.py [--dry-run] [--limit N]

Environment Variables:
    GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
"""
import os
import sys
import time
import sqlite3
import re  # NOTE(review): currently unused; kept in case other tooling relies on it
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

import yaml
import httpx
from dotenv import load_dotenv

# Load environment variables from a local .env file, if present.
load_dotenv()

# Configuration
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Google Places API (New) text-search endpoint.
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
REQUEST_DELAY = 0.3  # Rate limiting (seconds between requests)

# Japanese prefecture GeoNames admin1_code to ISO 3166-2:JP mapping
ADMIN1_TO_ISO = {
    '01': 'AI',  # Aichi
    '02': 'AK',  # Akita
    '03': 'AO',  # Aomori
    '04': 'CH',  # Chiba
    '05': 'EH',  # Ehime
    '06': 'FI',  # Fukui
    '07': 'FO',  # Fukuoka
    '08': 'FS',  # Fukushima
    '09': 'GI',  # Gifu
    '10': 'GU',  # Gunma
    '11': 'HS',  # Hiroshima
    '12': 'HO',  # Hokkaido
    '13': 'HG',  # Hyogo
    '14': 'IB',  # Ibaraki
    '15': 'IS',  # Ishikawa
    '16': 'IW',  # Iwate
    '17': 'KA',  # Kagawa
    '18': 'KS',  # Kagoshima
    '19': 'KN',  # Kanagawa
    '20': 'KC',  # Kochi
    '21': 'KM',  # Kumamoto
    '22': 'KY',  # Kyoto
    '23': 'ME',  # Mie
    '24': 'MG',  # Miyagi
    '25': 'MZ',  # Miyazaki
    '26': 'NN',  # Nagano
    '27': 'NS',  # Nagasaki
    '28': 'NR',  # Nara
    '29': 'NI',  # Niigata
    '30': 'OT',  # Oita
    '31': 'OK',  # Okayama
    '32': 'OS',  # Osaka
    '33': 'SG',  # Saga
    '34': 'ST',  # Saitama
    '35': 'SI',  # Shiga
    '36': 'SM',  # Shimane
    '37': 'SZ',  # Shizuoka
    '38': 'TC',  # Tochigi
    '39': 'TS',  # Tokushima
    '40': 'TK',  # Tokyo
    '41': 'TT',  # Tottori
    '42': 'TY',  # Toyama
    '43': 'WK',  # Wakayama
    '44': 'YG',  # Yamagata
    '45': 'YM',  # Yamaguchi
    '46': 'YN',  # Yamanashi
    '47': 'ON',  # Okinawa
}

# Reverse mapping for lookup by prefecture name
PREFECTURE_TO_ISO = {
    'Aichi': 'AI', 'Akita': 'AK', 'Aomori': 'AO', 'Chiba': 'CH',
    'Ehime': 'EH', 'Fukui': 'FI', 'Fukuoka': 'FO', 'Fukushima': 'FS',
    'Gifu': 'GI', 'Gunma': 'GU', 'Hiroshima': 'HS',
    # 'Hokkaido' appeared twice in the original literal (duplicate dict key,
    # silently collapsed by Python) — declared exactly once here.
    'Hokkaido': 'HO',
    'Hyogo': 'HG', 'Hyōgo': 'HG', 'Ibaraki': 'IB', 'Ishikawa': 'IS',
    'Iwate': 'IW', 'Kagawa': 'KA', 'Kagoshima': 'KS', 'Kanagawa': 'KN',
    'Kochi': 'KC', 'Kumamoto': 'KM', 'Kyoto': 'KY', 'Mie': 'ME',
    'Miyagi': 'MG', 'Miyazaki': 'MZ', 'Nagano': 'NN', 'Nagasaki': 'NS',
    'Nara': 'NR', 'Niigata': 'NI', 'Oita': 'OT', 'Okayama': 'OK',
    'Osaka': 'OS', 'Saga': 'SG', 'Saitama': 'ST', 'Shiga': 'SI',
    'Shimane': 'SM', 'Shizuoka': 'SZ', 'Tochigi': 'TC', 'Tokushima': 'TS',
    'Tokyo': 'TK', 'Tottori': 'TT', 'Toyama': 'TY', 'Wakayama': 'WK',
    'Yamagata': 'YG', 'Yamaguchi': 'YM', 'Yamanashi': 'YN', 'Okinawa': 'ON',
    # Alternative spellings from address strings
    'Tokyo To': 'TK', 'Osaka Fu': 'OS', 'Kyoto Fu': 'KY',
    'Aichi Ken': 'AI', 'Hyogo Ken': 'HG', 'Kanagawa Ken': 'KN',
}


def get_city_code(city_name: str) -> str:
    """Generate a 3-letter city code from a (romanized) city name.

    Strips common Japanese administrative suffixes before deriving the code:
    one word -> first three letters; two words -> first letter of the first
    plus first two of the second; three or more -> initials of the first three.

    Returns 'XXX' if nothing usable remains (e.g. empty input) — the original
    returned '' in that case, which produced a malformed GHCID segment.
    """
    name = city_name.strip()
    # Strip at most one trailing occurrence of each known suffix, in order.
    for suffix in [' Shi', ' Ku', ' Cho', ' Machi', ' Mura', ' Gun', ' City', '-shi', '-ku']:
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    words = name.split()
    if not words:
        return 'XXX'  # defensive fallback for empty/suffix-only names
    if len(words) == 1:
        return name[:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()


def search_google_places(query: str, api_key: str, country_bias: str = "JP") -> Optional[dict]:
    """Search Google Places API (New) text search; return the top hit or None.

    ``country_bias`` is currently unused (reserved for a future regionCode
    bias in the payload). Errors are printed and swallowed so a single failed
    lookup does not abort a batch run.
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
    }
    payload = {
        "textQuery": query,
        "languageCode": "en"
    }
    try:
        response = httpx.post(TEXT_SEARCH_URL, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        if "places" in data and len(data["places"]) > 0:
            return data["places"][0]
        return None
    except Exception as e:
        # Deliberately broad: network, HTTP-status and JSON errors all degrade
        # to "not found" so the caller can record an error and continue.
        print(f"    Error searching Google Places: {e}")
        return None


def extract_location_from_google(place: dict) -> dict:
    """Extract city/prefecture/coordinate fields from a Places API result.

    Returns a dict with keys: city, prefecture, prefecture_code, latitude,
    longitude, formatted_address, place_id, website — any of which may be
    None if absent from the API response.
    """
    result = {
        'city': None,
        'prefecture': None,
        'prefecture_code': None,
        'latitude': None,
        'longitude': None,
        'formatted_address': None,
        'place_id': None,
        'website': None,
    }
    if not place:
        return result

    result['place_id'] = place.get('id')
    result['formatted_address'] = place.get('formattedAddress')
    result['website'] = place.get('websiteUri')

    # Coordinates
    location = place.get('location', {})
    result['latitude'] = location.get('latitude')
    result['longitude'] = location.get('longitude')

    # Parse address components
    components = place.get('addressComponents', [])
    for comp in components:
        types = comp.get('types', [])
        long_name = comp.get('longText', '')
        if 'locality' in types:
            result['city'] = long_name
        elif 'administrative_area_level_1' in types:
            result['prefecture'] = long_name
            # Try to get ISO code from the romanized prefecture name
            result['prefecture_code'] = PREFECTURE_TO_ISO.get(long_name)
        elif 'sublocality_level_1' in types and not result['city']:
            # Use ward/sublocality as city if no locality
            result['city'] = long_name

    return result


def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float) -> Optional[dict]:
    """Reverse geocode coordinates to the nearest Japanese city in GeoNames.

    Uses a planar squared-degree distance (no latitude/longitude scaling) —
    adequate for nearest-city selection within Japan's latitude band, but not
    a true great-circle distance.
    """
    cursor = conn.cursor()
    cursor.execute("""
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?)
                + (longitude - ?) * (longitude - ?)) as dist_sq
        FROM cities
        WHERE country_code = 'JP'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """, (lat, lat, lon, lon))
    row = cursor.fetchone()
    if row:
        return {
            'name': row[0],
            'ascii_name': row[1],
            'admin1_code': row[2],
            'admin1_name': row[3],
            'geonames_id': row[4],
            'latitude': row[5],
            'longitude': row[6],
            'population': row[7],
            'feature_code': row[8],
        }
    return None


def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str, dry_run: bool = False) -> dict:
    """Process a single Japanese custodian YAML file.

    Resolves the institution's location via Google Places + GeoNames, rewrites
    the GHCID, records enrichment/history metadata, and renames the file to
    match the new GHCID. Returns a result dict with 'status' in
    {'updated', 'would_update', 'skipped', 'error', 'collision'}.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'prefecture': None,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result

    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result

    # Current GHCID. `or {}` / `or ''` guard against keys present but null
    # in the YAML (e.g. "ghcid:"), which would otherwise raise.
    ghcid_data = data.get('ghcid') or {}
    old_ghcid = ghcid_data.get('ghcid_current') or ''
    result['old_ghcid'] = old_ghcid
    if not old_ghcid.startswith('JP-XX-XXX-'):
        result['status'] = 'skipped'
        result['error'] = 'Not a JP-XX-XXX file'
        return result

    # Institution name for search, preferring the claimed name.
    name = (data.get('custodian_name') or {}).get('claim_value', '')
    if not name:
        name = (data.get('original_entry') or {}).get('name', '')
    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result

    # Search Google Places
    print(f"  Searching: {name[:50]}...")
    place = search_google_places(f"{name} Japan", api_key)
    time.sleep(REQUEST_DELAY)
    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result

    # Extract location. Use `is None` so a (theoretical) 0.0 coordinate is
    # not mistaken for "missing".
    location_info = extract_location_from_google(place)
    if location_info['latitude'] is None or location_info['longitude'] is None:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result

    # Lookup in GeoNames for city code
    city_info = lookup_city_geonames(conn, location_info['latitude'], location_info['longitude'])
    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result

    # Determine region code; fall back to Google's prefecture if GeoNames
    # admin1 is unmapped.
    admin1_code = city_info['admin1_code']
    region_code = ADMIN1_TO_ISO.get(admin1_code, 'XX')
    if region_code == 'XX':
        region_code = location_info.get('prefecture_code') or 'XX'

    city_code = get_city_code(city_info['ascii_name'])
    result['city'] = city_info['ascii_name']
    result['prefecture'] = city_info['admin1_name']

    # Build new GHCID: JP-<region>-<city>-<inst_type>-<abbreviation>.
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result

    new_ghcid = f'JP-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Detect a filename collision BEFORE writing anything. The original wrote
    # the updated YAML first and only then discovered the collision, leaving a
    # half-migrated file (new GHCID content under the old filename) behind.
    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename
    if new_filepath.exists() and filepath != new_filepath:
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update ghcid section
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': 'JP',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }

    # Add Google Maps enrichment
    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    # Update location in original_entry (first location only)
    if 'original_entry' in data and 'locations' in (data['original_entry'] or {}):
        if data['original_entry']['locations']:
            data['original_entry']['locations'][0]['city'] = city_info['ascii_name']
            data['original_entry']['locations'][0]['region'] = city_info['admin1_name']
            if location_info['latitude']:
                data['original_entry']['locations'][0]['latitude'] = location_info['latitude']
                data['original_entry']['locations'][0]['longitude'] = location_info['longitude']

    # Close the open history entry for the old GHCID and append the new one.
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp
    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    # Update GHCID identifier entries (`or []` guards a null "identifiers:" key)
    if 'identifiers' in data:
        for identifier in (data.get('identifiers') or []):
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid

    # Write updated data, then rename to match the new GHCID.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if filepath != new_filepath:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)

    result['status'] = 'updated'
    return result


def main():
    """CLI entry point: enrich all JP-XX-XXX custodian files."""
    parser = argparse.ArgumentParser(description='Enrich Japanese custodian files with Google Places data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    args = parser.parse_args()

    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        print("Set it in .env file or export GOOGLE_PLACES_TOKEN=...")
        sys.exit(1)

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Find Japanese XXX files (sorted for deterministic --limit behaviour)
    files = sorted(CUSTODIAN_DIR.glob('JP-XX-XXX-*.yaml'))
    if args.limit:
        files = files[:args.limit]

    print(f"Found {len(files)} Japanese XXX files")
    print(f"Dry run: {args.dry_run}")
    print()

    conn = sqlite3.connect(str(GEONAMES_DB))
    stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    errors = []

    for filepath in files:
        print(f"Processing: {filepath.name}")
        result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN, dry_run=args.dry_run)
        stats[result['status']] = stats.get(result['status'], 0) + 1
        if result['status'] in ('updated', 'would_update'):
            print(f"  ✓ {result['city']} ({result['prefecture']}): {result['old_ghcid']} → {result['new_ghcid']}")
        elif result['status'] == 'error':
            print(f"  ✗ {result['error']}")
            errors.append(result)
        elif result['status'] == 'collision':
            print(f"  ⚠ {result['error']}")

    conn.close()

    print()
    print('=' * 60)
    print('Summary:')
    print(f"  Updated: {stats.get('updated', 0)}")
    print(f"  Would update: {stats.get('would_update', 0)}")
    print(f"  Errors: {stats.get('error', 0)}")
    print(f"  Collisions: {stats.get('collision', 0)}")
    print(f"  Skipped: {stats.get('skipped', 0)}")

    if errors:
        print()
        print('Files with errors (may need manual research):')
        for err in errors[:10]:
            print(f"  - {Path(err['file']).name}: {err['error']}")


if __name__ == '__main__':
    main()