glam/scripts/enrich_japanese_cities.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

480 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Japanese custodian files with city/region data using Google Places API.
This script:
1. Finds Japanese XXX files (no city/region resolved)
2. Uses Google Places API to search for each institution
3. Extracts location data (city, prefecture, coordinates)
4. Updates GHCID with proper region/city codes
5. Adds Google Maps enrichment data
Usage:
python scripts/enrich_japanese_cities.py [--dry-run] [--limit N]
Environment Variables:
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
"""
import os
import sys
import time
import sqlite3
import re
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import yaml
import httpx
from dotenv import load_dotenv
# Load environment variables from a local .env file (supplies GOOGLE_PLACES_TOKEN).
load_dotenv()
# Configuration
# NOTE(review): absolute, machine-specific paths below — consider making them
# configurable via environment variables or CLI flags before sharing this script.
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")  # Google Cloud API key with Places API enabled
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")  # local GeoNames SQLite mirror
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")  # directory of custodian YAML files
# Google Places API (New) Text Search endpoint
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
REQUEST_DELAY = 0.3  # seconds to sleep between API calls (rate limiting)
# GeoNames admin1_code -> two-letter prefecture code for Japan.
# NOTE(review): the original comment called these "ISO 3166-2:JP" codes, but
# ISO 3166-2:JP actually uses numeric codes (JP-01..JP-47); these letter pairs
# look like a project-specific (GHCID) scheme — confirm against the GHCID spec.
ADMIN1_TO_ISO = {
    '01': 'AI',  # Aichi
    '02': 'AK',  # Akita
    '03': 'AO',  # Aomori
    '04': 'CH',  # Chiba
    '05': 'EH',  # Ehime
    '06': 'FI',  # Fukui
    '07': 'FO',  # Fukuoka
    '08': 'FS',  # Fukushima
    '09': 'GI',  # Gifu
    '10': 'GU',  # Gunma
    '11': 'HS',  # Hiroshima
    '12': 'HO',  # Hokkaido
    '13': 'HG',  # Hyogo
    '14': 'IB',  # Ibaraki
    '15': 'IS',  # Ishikawa
    '16': 'IW',  # Iwate
    '17': 'KA',  # Kagawa
    '18': 'KS',  # Kagoshima
    '19': 'KN',  # Kanagawa
    '20': 'KC',  # Kochi
    '21': 'KM',  # Kumamoto
    '22': 'KY',  # Kyoto
    '23': 'ME',  # Mie
    '24': 'MG',  # Miyagi
    '25': 'MZ',  # Miyazaki
    '26': 'NN',  # Nagano
    '27': 'NS',  # Nagasaki
    '28': 'NR',  # Nara
    '29': 'NI',  # Niigata
    '30': 'OT',  # Oita
    '31': 'OK',  # Okayama
    '32': 'OS',  # Osaka
    '33': 'SG',  # Saga
    '34': 'ST',  # Saitama
    '35': 'SI',  # Shiga
    '36': 'SM',  # Shimane
    '37': 'SZ',  # Shizuoka
    '38': 'TC',  # Tochigi
    '39': 'TS',  # Tokushima
    '40': 'TK',  # Tokyo
    '41': 'TT',  # Tottori
    '42': 'TY',  # Toyama
    '43': 'WK',  # Wakayama
    '44': 'YG',  # Yamagata
    '45': 'YM',  # Yamaguchi
    '46': 'YN',  # Yamanashi
    '47': 'ON',  # Okinawa
}
# Prefecture name (as returned by Google Places, English) -> two-letter code.
# Includes a few suffixed spellings seen in formatted address strings.
# Fix: the literal previously listed 'Hokkaido': 'HO' twice (base entry and
# again in the alternative-spellings section); the redundant duplicate key
# has been removed — the resulting dict is unchanged.
PREFECTURE_TO_ISO = {
    'Aichi': 'AI', 'Akita': 'AK', 'Aomori': 'AO', 'Chiba': 'CH', 'Ehime': 'EH',
    'Fukui': 'FI', 'Fukuoka': 'FO', 'Fukushima': 'FS', 'Gifu': 'GI', 'Gunma': 'GU',
    'Hiroshima': 'HS', 'Hokkaido': 'HO', 'Hyogo': 'HG', 'Hyōgo': 'HG',
    'Ibaraki': 'IB', 'Ishikawa': 'IS', 'Iwate': 'IW', 'Kagawa': 'KA',
    'Kagoshima': 'KS', 'Kanagawa': 'KN', 'Kochi': 'KC', 'Kumamoto': 'KM',
    'Kyoto': 'KY', 'Mie': 'ME', 'Miyagi': 'MG', 'Miyazaki': 'MZ',
    'Nagano': 'NN', 'Nagasaki': 'NS', 'Nara': 'NR', 'Niigata': 'NI',
    'Oita': 'OT', 'Okayama': 'OK', 'Osaka': 'OS', 'Saga': 'SG',
    'Saitama': 'ST', 'Shiga': 'SI', 'Shimane': 'SM', 'Shizuoka': 'SZ',
    'Tochigi': 'TC', 'Tokushima': 'TS', 'Tokyo': 'TK', 'Tottori': 'TT',
    'Toyama': 'TY', 'Wakayama': 'WK', 'Yamagata': 'YG', 'Yamaguchi': 'YM',
    'Yamanashi': 'YN', 'Okinawa': 'ON',
    # Alternative spellings from address strings
    'Tokyo To': 'TK', 'Osaka Fu': 'OS', 'Kyoto Fu': 'KY',
    'Aichi Ken': 'AI', 'Hyogo Ken': 'HG', 'Kanagawa Ken': 'KN',
}
def get_city_code(city_name: str) -> str:
    """Derive a 3-letter uppercase city code from a romanized city name.

    Common Japanese administrative suffixes (Shi, Ku, Cho, Machi, ...) are
    stripped first.  A single-word name yields its first three letters, a
    two-word name the first letter of word one plus two letters of word two,
    and anything longer the initials of the first three words.
    """
    cleaned = city_name.strip()
    # Remove each matching suffix, checked in this fixed order.
    for tail in (' Shi', ' Ku', ' Cho', ' Machi', ' Mura', ' Gun', ' City', '-shi', '-ku'):
        if cleaned.endswith(tail):
            cleaned = cleaned[:-len(tail)]
    parts = cleaned.split()
    if len(parts) == 1:
        code = cleaned[:3]
    elif len(parts) == 2:
        code = parts[0][0] + parts[1][:2]
    else:
        code = ''.join(word[0] for word in parts[:3])
    return code.upper()
def search_google_places(query: str, api_key: str, country_bias: str = "JP") -> Optional[dict]:
    """Search the Google Places API (New) Text Search for a location.

    Args:
        query: Free-text search string (institution name, etc.).
        api_key: Google Cloud API key with the Places API enabled.
        country_bias: ISO 3166-1 alpha-2 region code used to bias results.
            Fix: this parameter was previously accepted but never used; it is
            now sent as the Text Search ``regionCode`` field.

    Returns:
        The first matching place dict, or None on no match or request error.
    """
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        # Field mask limits the response to exactly the fields consumed downstream.
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
    }
    payload = {
        "textQuery": query,
        "languageCode": "en",
        # Bias results toward the given country (the fix for the unused parameter).
        "regionCode": country_bias,
    }
    try:
        response = httpx.post(TEXT_SEARCH_URL, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        if "places" in data and len(data["places"]) > 0:
            return data["places"][0]
        return None
    except Exception as e:
        # Deliberately broad best-effort handling: log and return None so the
        # caller can record an error status for this file and continue.
        print(f" Error searching Google Places: {e}")
        return None
def _prefecture_iso(name: str) -> Optional[str]:
    """Map a prefecture name to its two-letter code, tolerating suffixed forms.

    Tries an exact PREFECTURE_TO_ISO lookup first; on a miss, strips common
    administrative suffixes ("Saitama Ken" -> "Saitama") and retries.
    Returns None when no mapping is found.
    """
    code = PREFECTURE_TO_ISO.get(name)
    if code:
        return code
    for suffix in (' Ken', ' Fu', ' To', ' Prefecture', '-ken', '-fu', '-to'):
        if name.endswith(suffix):
            return PREFECTURE_TO_ISO.get(name[:-len(suffix)])
    return None


def extract_location_from_google(place: dict) -> dict:
    """Extract location information from a Google Places result.

    Returns a dict with keys city, prefecture, prefecture_code, latitude,
    longitude, formatted_address, place_id, website — any of which may be
    None when the corresponding field is absent from the result.

    Fix: the prefecture-code lookup previously failed whenever Google
    returned a suffixed prefecture name not listed verbatim in
    PREFECTURE_TO_ISO (e.g. "Saitama Ken"); a suffix-stripping fallback
    now covers those forms while leaving exact matches unchanged.
    """
    result = {
        'city': None,
        'prefecture': None,
        'prefecture_code': None,
        'latitude': None,
        'longitude': None,
        'formatted_address': None,
        'place_id': None,
        'website': None,
    }
    if not place:
        return result
    result['place_id'] = place.get('id')
    result['formatted_address'] = place.get('formattedAddress')
    result['website'] = place.get('websiteUri')
    # Get coordinates
    location = place.get('location', {})
    result['latitude'] = location.get('latitude')
    result['longitude'] = location.get('longitude')
    # Parse address components: locality -> city, admin level 1 -> prefecture;
    # fall back to the first sublocality (ward) when no locality is present.
    components = place.get('addressComponents', [])
    for comp in components:
        types = comp.get('types', [])
        long_name = comp.get('longText', '')
        if 'locality' in types:
            result['city'] = long_name
        elif 'administrative_area_level_1' in types:
            result['prefecture'] = long_name
            result['prefecture_code'] = _prefecture_iso(long_name)
        elif 'sublocality_level_1' in types and not result['city']:
            # Use ward/sublocality as city if no locality
            result['city'] = long_name
    return result
def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float) -> Optional[dict]:
    """Return the Japanese populated place in GeoNames nearest to (lat, lon).

    Nearness is ranked by a plain squared-degree metric computed in SQL.
    NOTE(review): this ignores the cos(latitude) shrink of longitude degrees —
    presumably acceptable for picking the closest city within Japan; confirm.
    Returns None when the cities table holds no matching JP rows.
    """
    sql = """
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as dist_sq
        FROM cities
        WHERE country_code = 'JP'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """
    row = conn.execute(sql, (lat, lat, lon, lon)).fetchone()
    if row is None:
        return None
    # zip() pairs the first nine columns with their keys and drops dist_sq.
    keys = ('name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
            'latitude', 'longitude', 'population', 'feature_code')
    return dict(zip(keys, row))
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str, dry_run: bool = False) -> dict:
    """Resolve city/region for one JP-XX-XXX custodian YAML file.

    Pipeline:
      1. Load the YAML and confirm the GHCID still carries the unresolved
         'JP-XX-XXX-' placeholder.
      2. Text-search the institution name on Google Places.
      3. Reverse-geocode the returned coordinates against GeoNames.
      4. Rebuild the GHCID with resolved region/city codes, record
         resolution and enrichment metadata, append GHCID history, sync
         identifiers, write the file and rename it to the new GHCID.

    Args:
        filepath: Path to the custodian YAML file.
        conn: Open SQLite connection to the GeoNames database.
        api_key: Google Places API key.
        dry_run: When True, report the would-be change without writing.

    Returns:
        Result dict with keys file, status ('updated' | 'would_update' |
        'skipped' | 'collision' | 'error'), old_ghcid, new_ghcid, city,
        prefecture, error — plus new_file when the file was renamed.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'prefecture': None,
        'error': None,
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result
    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result
    # Only files whose current GHCID still has the unresolved placeholder qualify.
    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid
    if not old_ghcid.startswith('JP-XX-XXX-'):
        result['status'] = 'skipped'
        result['error'] = 'Not a JP-XX-XXX file'
        return result
    # Institution name for the search: curated claim first, raw entry second.
    name = data.get('custodian_name', {}).get('claim_value', '')
    if not name:
        name = data.get('original_entry', {}).get('name', '')
    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result
    # Search Google Places
    print(f" Searching: {name[:50]}...")
    place = search_google_places(f"{name} Japan", api_key)
    time.sleep(REQUEST_DELAY)  # stay under the API rate limit
    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result
    # Extract location
    location_info = extract_location_from_google(place)
    if not location_info['latitude'] or not location_info['longitude']:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result
    # Reverse-geocode against GeoNames for the city code.
    city_info = lookup_city_geonames(conn, location_info['latitude'], location_info['longitude'])
    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result
    # Region code: GeoNames admin1 mapping first, Google's prefecture second.
    admin1_code = city_info['admin1_code']
    region_code = ADMIN1_TO_ISO.get(admin1_code, 'XX')
    if region_code == 'XX':
        # Fix: 'prefecture_code' is always present in location_info (possibly
        # None), so dict.get(..., 'XX') never fell back — a failed prefecture
        # lookup produced a 'JP-None-...' GHCID.  `or` maps None to 'XX'.
        region_code = location_info.get('prefecture_code') or 'XX'
    # Generate city code
    city_code = get_city_code(city_info['ascii_name'])
    result['city'] = city_info['ascii_name']
    result['prefecture'] = city_info['admin1_name']
    # Rebuild the GHCID, keeping institution type and abbreviation intact
    # (the abbreviation may itself contain hyphens, hence the re-join).
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result
    new_ghcid = f'JP-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid
    if dry_run:
        result['status'] = 'would_update'
        return result
    # Update the data
    timestamp = datetime.now(timezone.utc).isoformat()
    # Update ghcid section
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': 'JP',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }
    # Add Google Maps enrichment
    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }
    # Mirror the resolved location into the first original_entry location.
    if 'original_entry' in data and 'locations' in data['original_entry']:
        if data['original_entry']['locations']:
            data['original_entry']['locations'][0]['city'] = city_info['ascii_name']
            data['original_entry']['locations'][0]['region'] = city_info['admin1_name']
            if location_info['latitude']:
                data['original_entry']['locations'][0]['latitude'] = location_info['latitude']
                data['original_entry']['locations'][0]['longitude'] = location_info['longitude']
    # GHCID history: close any open entry for the old ID, append the new one.
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp
    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })
    # Keep the GHCID identifier record in sync with the new value.
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid
    # Write updated data
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    # Rename the file to match the new GHCID; never clobber an existing file.
    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename
    if filepath != new_filepath and not new_filepath.exists():
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)
    elif new_filepath.exists() and filepath != new_filepath:
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result
    result['status'] = 'updated'
    return result
def main():
    """CLI entry point: enrich every JP-XX-XXX custodian file found on disk."""
    parser = argparse.ArgumentParser(description='Enrich Japanese custodian files with Google Places data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    args = parser.parse_args()
    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        print("Set it in .env file or export GOOGLE_PLACES_TOKEN=...")
        sys.exit(1)
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)
    # Find Japanese files still carrying the unresolved-location placeholder.
    files = sorted(CUSTODIAN_DIR.glob('JP-XX-XXX-*.yaml'))
    # Fix: compare against None so an explicit '--limit 0' is honored
    # (a truthiness check silently processed every file for 0).
    if args.limit is not None:
        files = files[:args.limit]
    print(f"Found {len(files)} Japanese XXX files")
    print(f"Dry run: {args.dry_run}")
    print()
    conn = sqlite3.connect(str(GEONAMES_DB))
    stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    errors = []
    try:
        for filepath in files:
            print(f"Processing: {filepath.name}")
            result = process_file(filepath, conn, GOOGLE_PLACES_TOKEN, dry_run=args.dry_run)
            stats[result['status']] = stats.get(result['status'], 0) + 1
            if result['status'] in ('updated', 'would_update'):
                # NOTE(review): the two GHCIDs print with no separator — an
                # arrow glyph may have been lost in transit; confirm vs. VCS.
                print(f"{result['city']} ({result['prefecture']}): {result['old_ghcid']}{result['new_ghcid']}")
            elif result['status'] == 'error':
                print(f"{result['error']}")
                errors.append(result)
            elif result['status'] == 'collision':
                print(f"{result['error']}")
    finally:
        # Fix: close the GeoNames connection even if processing raises.
        conn.close()
    print()
    print('=' * 60)
    print('Summary:')
    print(f" Updated: {stats.get('updated', 0)}")
    print(f" Would update: {stats.get('would_update', 0)}")
    print(f" Errors: {stats.get('error', 0)}")
    print(f" Collisions: {stats.get('collision', 0)}")
    print(f" Skipped: {stats.get('skipped', 0)}")
    if errors:
        print()
        print('Files with errors (may need manual research):')
        # Show at most ten problem files to keep the summary readable.
        for err in errors[:10]:
            print(f" - {Path(err['file']).name}: {err['error']}")


if __name__ == '__main__':
    main()