Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
480 lines
16 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Japanese custodian files with city/region data using Google Places API.
|
|
|
|
This script:
|
|
1. Finds Japanese XXX files (no city/region resolved)
|
|
2. Uses Google Places API to search for each institution
|
|
3. Extracts location data (city, prefecture, coordinates)
|
|
4. Updates GHCID with proper region/city codes
|
|
5. Adds Google Maps enrichment data
|
|
|
|
Usage:
|
|
python scripts/enrich_japanese_cities.py [--dry-run] [--limit N]
|
|
|
|
Environment Variables:
|
|
GOOGLE_PLACES_TOKEN - Required: Google Cloud API key with Places API enabled
|
|
"""
|
|
|
|
import argparse
import math
import os
import re
import sqlite3
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import httpx
import yaml
from dotenv import load_dotenv
|
|
|
|
# Load environment variables
|
|
load_dotenv()
|
|
|
|
# Configuration
|
|
GOOGLE_PLACES_TOKEN = os.getenv("GOOGLE_PLACES_TOKEN", "")
|
|
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
|
|
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
|
|
|
# Google Places API
|
|
TEXT_SEARCH_URL = "https://places.googleapis.com/v1/places:searchText"
|
|
REQUEST_DELAY = 0.3 # Rate limiting
|
|
|
|
# Japanese prefecture GeoNames admin1_code to ISO 3166-2:JP mapping
|
|
ADMIN1_TO_ISO = {
|
|
'01': 'AI', # Aichi
|
|
'02': 'AK', # Akita
|
|
'03': 'AO', # Aomori
|
|
'04': 'CH', # Chiba
|
|
'05': 'EH', # Ehime
|
|
'06': 'FI', # Fukui
|
|
'07': 'FO', # Fukuoka
|
|
'08': 'FS', # Fukushima
|
|
'09': 'GI', # Gifu
|
|
'10': 'GU', # Gunma
|
|
'11': 'HS', # Hiroshima
|
|
'12': 'HO', # Hokkaido
|
|
'13': 'HG', # Hyogo
|
|
'14': 'IB', # Ibaraki
|
|
'15': 'IS', # Ishikawa
|
|
'16': 'IW', # Iwate
|
|
'17': 'KA', # Kagawa
|
|
'18': 'KS', # Kagoshima
|
|
'19': 'KN', # Kanagawa
|
|
'20': 'KC', # Kochi
|
|
'21': 'KM', # Kumamoto
|
|
'22': 'KY', # Kyoto
|
|
'23': 'ME', # Mie
|
|
'24': 'MG', # Miyagi
|
|
'25': 'MZ', # Miyazaki
|
|
'26': 'NN', # Nagano
|
|
'27': 'NS', # Nagasaki
|
|
'28': 'NR', # Nara
|
|
'29': 'NI', # Niigata
|
|
'30': 'OT', # Oita
|
|
'31': 'OK', # Okayama
|
|
'32': 'OS', # Osaka
|
|
'33': 'SG', # Saga
|
|
'34': 'ST', # Saitama
|
|
'35': 'SI', # Shiga
|
|
'36': 'SM', # Shimane
|
|
'37': 'SZ', # Shizuoka
|
|
'38': 'TC', # Tochigi
|
|
'39': 'TS', # Tokushima
|
|
'40': 'TK', # Tokyo
|
|
'41': 'TT', # Tottori
|
|
'42': 'TY', # Toyama
|
|
'43': 'WK', # Wakayama
|
|
'44': 'YG', # Yamagata
|
|
'45': 'YM', # Yamaguchi
|
|
'46': 'YN', # Yamanashi
|
|
'47': 'ON', # Okinawa
|
|
}
|
|
|
|
# Reverse mapping for lookup by prefecture name
|
|
PREFECTURE_TO_ISO = {
|
|
'Aichi': 'AI', 'Akita': 'AK', 'Aomori': 'AO', 'Chiba': 'CH', 'Ehime': 'EH',
|
|
'Fukui': 'FI', 'Fukuoka': 'FO', 'Fukushima': 'FS', 'Gifu': 'GI', 'Gunma': 'GU',
|
|
'Hiroshima': 'HS', 'Hokkaido': 'HO', 'Hyogo': 'HG', 'Hyōgo': 'HG',
|
|
'Ibaraki': 'IB', 'Ishikawa': 'IS', 'Iwate': 'IW', 'Kagawa': 'KA',
|
|
'Kagoshima': 'KS', 'Kanagawa': 'KN', 'Kochi': 'KC', 'Kumamoto': 'KM',
|
|
'Kyoto': 'KY', 'Mie': 'ME', 'Miyagi': 'MG', 'Miyazaki': 'MZ',
|
|
'Nagano': 'NN', 'Nagasaki': 'NS', 'Nara': 'NR', 'Niigata': 'NI',
|
|
'Oita': 'OT', 'Okayama': 'OK', 'Osaka': 'OS', 'Saga': 'SG',
|
|
'Saitama': 'ST', 'Shiga': 'SI', 'Shimane': 'SM', 'Shizuoka': 'SZ',
|
|
'Tochigi': 'TC', 'Tokushima': 'TS', 'Tokyo': 'TK', 'Tottori': 'TT',
|
|
'Toyama': 'TY', 'Wakayama': 'WK', 'Yamagata': 'YG', 'Yamaguchi': 'YM',
|
|
'Yamanashi': 'YN', 'Okinawa': 'ON',
|
|
# Alternative spellings from address strings
|
|
'Tokyo To': 'TK', 'Osaka Fu': 'OS', 'Kyoto Fu': 'KY', 'Hokkaido': 'HO',
|
|
'Aichi Ken': 'AI', 'Hyogo Ken': 'HG', 'Kanagawa Ken': 'KN',
|
|
}
|
|
|
|
|
|
def get_city_code(city_name: str) -> str:
|
|
"""Generate 3-letter city code from city name."""
|
|
# Clean suffixes common in Japanese city names
|
|
name = city_name.strip()
|
|
for suffix in [' Shi', ' Ku', ' Cho', ' Machi', ' Mura', ' Gun', ' City', '-shi', '-ku']:
|
|
if name.endswith(suffix):
|
|
name = name[:-len(suffix)]
|
|
|
|
words = name.split()
|
|
|
|
if len(words) == 1:
|
|
return name[:3].upper()
|
|
elif len(words) == 2:
|
|
return (words[0][0] + words[1][:2]).upper()
|
|
else:
|
|
return ''.join(w[0] for w in words[:3]).upper()
|
|
|
|
|
|
def search_google_places(query: str, api_key: str, country_bias: str = "JP") -> Optional[dict]:
|
|
"""Search Google Places API for a location."""
|
|
headers = {
|
|
"Content-Type": "application/json",
|
|
"X-Goog-Api-Key": api_key,
|
|
"X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.location,places.addressComponents,places.types,places.id,places.websiteUri"
|
|
}
|
|
|
|
payload = {
|
|
"textQuery": query,
|
|
"languageCode": "en"
|
|
}
|
|
|
|
try:
|
|
response = httpx.post(TEXT_SEARCH_URL, json=payload, headers=headers, timeout=30)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
if "places" in data and len(data["places"]) > 0:
|
|
return data["places"][0]
|
|
return None
|
|
except Exception as e:
|
|
print(f" Error searching Google Places: {e}")
|
|
return None
|
|
|
|
|
|
def extract_location_from_google(place: dict) -> dict:
|
|
"""Extract location information from Google Places result."""
|
|
result = {
|
|
'city': None,
|
|
'prefecture': None,
|
|
'prefecture_code': None,
|
|
'latitude': None,
|
|
'longitude': None,
|
|
'formatted_address': None,
|
|
'place_id': None,
|
|
'website': None,
|
|
}
|
|
|
|
if not place:
|
|
return result
|
|
|
|
result['place_id'] = place.get('id')
|
|
result['formatted_address'] = place.get('formattedAddress')
|
|
result['website'] = place.get('websiteUri')
|
|
|
|
# Get coordinates
|
|
location = place.get('location', {})
|
|
result['latitude'] = location.get('latitude')
|
|
result['longitude'] = location.get('longitude')
|
|
|
|
# Parse address components
|
|
components = place.get('addressComponents', [])
|
|
for comp in components:
|
|
types = comp.get('types', [])
|
|
long_name = comp.get('longText', '')
|
|
|
|
if 'locality' in types:
|
|
result['city'] = long_name
|
|
elif 'administrative_area_level_1' in types:
|
|
result['prefecture'] = long_name
|
|
# Try to get ISO code
|
|
result['prefecture_code'] = PREFECTURE_TO_ISO.get(long_name)
|
|
elif 'sublocality_level_1' in types and not result['city']:
|
|
# Use ward/sublocality as city if no locality
|
|
result['city'] = long_name
|
|
|
|
return result
|
|
|
|
|
|
def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float) -> Optional[dict]:
    """Reverse geocode coordinates to the nearest Japanese populated place.

    Nearest-neighbour scan over the GeoNames ``cities`` table using an
    equirectangular approximation: the squared longitude difference is
    weighted by cos(lat)^2 so east-west degrees are not over-counted at
    Japanese latitudes (the previous unweighted form could pick the wrong
    neighbour near prefecture borders).

    Args:
        conn: Open connection to the GeoNames SQLite database.
        lat, lon: WGS84 coordinates to resolve.

    Returns:
        Dict with name, ascii_name, admin1 code/name, geonames_id,
        coordinates, population and feature_code of the closest match,
        or None when the table holds no Japanese populated places.
    """
    cursor = conn.cursor()

    # Longitude degrees shrink by cos(latitude); squared because the query
    # compares squared distances.
    lon_weight = math.cos(math.radians(lat)) ** 2

    cursor.execute("""
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code,
               ((latitude - ?) * (latitude - ?) + ? * (longitude - ?) * (longitude - ?)) as dist_sq
        FROM cities
        WHERE country_code = 'JP'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY dist_sq
        LIMIT 1
    """, (lat, lat, lon_weight, lon, lon))

    row = cursor.fetchone()
    if row:
        return {
            'name': row[0],
            'ascii_name': row[1],
            'admin1_code': row[2],
            'admin1_name': row[3],
            'geonames_id': row[4],
            'latitude': row[5],
            'longitude': row[6],
            'population': row[7],
            'feature_code': row[8],
        }
    return None
|
|
|
|
|
|
def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str, dry_run: bool = False) -> dict:
    """Resolve the location of a single Japanese custodian YAML file.

    Pipeline: load the YAML, confirm the GHCID still carries the JP-XX-XXX
    placeholder, text-search the institution on Google Places, reverse
    geocode the returned coordinates against GeoNames, then rewrite the
    GHCID with real region/city codes, record enrichment metadata, and
    rename the file to match the new GHCID.

    Args:
        filepath: Path to the custodian YAML file.
        conn: Open connection to the GeoNames SQLite database.
        api_key: Google Places API key.
        dry_run: When True, compute the new GHCID but write nothing.

    Returns:
        Result dict with 'status' in {'updated', 'would_update', 'skipped',
        'error', 'collision'}, plus old/new GHCID, city, prefecture and an
        'error' message where applicable ('new_file' is added after a
        successful rename).
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city': None,
        'prefecture': None,
        'error': None,
    }

    # Load the YAML; any parse/IO failure is reported, never raised.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result

    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result

    # Get current GHCID
    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid

    # Only files still carrying the unresolved region/city placeholder
    # (JP-XX-XXX-...) are candidates; everything else is skipped.
    if not old_ghcid.startswith('JP-XX-XXX-'):
        result['status'] = 'skipped'
        result['error'] = 'Not a JP-XX-XXX file'
        return result

    # Get institution name for search: prefer the custodian_name claim,
    # fall back to the original catalog entry.
    name = data.get('custodian_name', {}).get('claim_value', '')
    if not name:
        name = data.get('original_entry', {}).get('name', '')

    if not name:
        result['status'] = 'error'
        result['error'] = 'No institution name found'
        return result

    # Search Google Places; " Japan" is appended to bias the text search.
    print(f" Searching: {name[:50]}...")
    place = search_google_places(f"{name} Japan", api_key)
    time.sleep(REQUEST_DELAY)  # rate-limit between API calls

    if not place:
        result['status'] = 'error'
        result['error'] = 'Not found in Google Places'
        return result

    # Extract location
    location_info = extract_location_from_google(place)

    if not location_info['latitude'] or not location_info['longitude']:
        result['status'] = 'error'
        result['error'] = 'No coordinates from Google'
        return result

    # Reverse geocode Google's coordinates against GeoNames to obtain the
    # canonical city record (name + admin1 code).
    city_info = lookup_city_geonames(conn, location_info['latitude'], location_info['longitude'])

    if not city_info:
        result['status'] = 'error'
        result['error'] = 'City not found in GeoNames'
        return result

    # Determine region code from the GeoNames admin1 code; fall back to the
    # prefecture parsed out of Google's address components.
    admin1_code = city_info['admin1_code']
    region_code = ADMIN1_TO_ISO.get(admin1_code, 'XX')

    if region_code == 'XX':
        # Try from Google address
        region_code = location_info.get('prefecture_code', 'XX')

    # Generate city code
    city_code = get_city_code(city_info['ascii_name'])

    result['city'] = city_info['ascii_name']
    result['prefecture'] = city_info['admin1_name']

    # Build new GHCID: JP-<region>-<city>-<inst_type>-<abbreviation>.
    # The abbreviation may itself contain hyphens, hence the re-join.
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        inst_type = parts[3]
        abbreviation = '-'.join(parts[4:])
    else:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result

    new_ghcid = f'JP-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # Update the data (single timestamp reused for all provenance fields)
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update ghcid section with the resolved ID and full provenance.
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GOOGLE_PLACES_GEONAMES',
        'country_code': 'JP',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'google_place_id': location_info.get('place_id'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'resolution_date': timestamp,
    }

    # Add Google Maps enrichment
    data['google_maps_enrichment'] = {
        'place_id': location_info.get('place_id'),
        'formatted_address': location_info.get('formatted_address'),
        'website': location_info.get('website'),
        'latitude': location_info['latitude'],
        'longitude': location_info['longitude'],
        'enriched_at': timestamp,
        'source': 'Google Places API (New)',
    }

    # Update location in original_entry (first location entry only).
    if 'original_entry' in data and 'locations' in data['original_entry']:
        if data['original_entry']['locations']:
            data['original_entry']['locations'][0]['city'] = city_info['ascii_name']
            data['original_entry']['locations'][0]['region'] = city_info['admin1_name']
            if location_info['latitude']:
                data['original_entry']['locations'][0]['latitude'] = location_info['latitude']
                data['original_entry']['locations'][0]['longitude'] = location_info['longitude']

    # Add to GHCID history
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []

    # Close the still-open history entry for the old GHCID before
    # appending the new one.
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp

    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'Location resolved via Google Places + GeoNames: {city_info["ascii_name"]} ({region_code})',
    })

    # Update identifiers: keep the GHCID identifier entry in sync.
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid

    # Write updated data back in place (still under the old filename).
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file to match the new GHCID.
    # NOTE(review): the YAML above is rewritten with the new GHCID *before*
    # the collision check below, so a collision leaves an updated file under
    # the old name — confirm this partial-update state is intended.
    new_filename = f'{new_ghcid}.yaml'
    new_filepath = filepath.parent / new_filename

    if filepath != new_filepath and not new_filepath.exists():
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)
    elif new_filepath.exists() and filepath != new_filepath:
        result['status'] = 'collision'
        result['error'] = f'Target file exists: {new_filepath.name}'
        return result

    result['status'] = 'updated'
    return result
|
|
|
|
|
|
def main():
    """CLI entry point: resolve all JP-XX-XXX custodian files in one batch."""
    parser = argparse.ArgumentParser(description='Enrich Japanese custodian files with Google Places data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    args = parser.parse_args()

    # Fail fast on missing prerequisites (API key and GeoNames database).
    if not GOOGLE_PLACES_TOKEN:
        print("ERROR: GOOGLE_PLACES_TOKEN environment variable is required")
        print("Set it in .env file or export GOOGLE_PLACES_TOKEN=...")
        sys.exit(1)

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Candidates are files whose GHCID still carries the unresolved
    # region/city placeholders.
    targets = sorted(CUSTODIAN_DIR.glob('JP-XX-XXX-*.yaml'))
    if args.limit:
        targets = targets[:args.limit]

    print(f"Found {len(targets)} Japanese XXX files")
    print(f"Dry run: {args.dry_run}")
    print()

    db = sqlite3.connect(str(GEONAMES_DB))

    tally = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    failures = []

    for target in targets:
        print(f"Processing: {target.name}")
        outcome = process_file(target, db, GOOGLE_PLACES_TOKEN, dry_run=args.dry_run)
        status = outcome['status']
        tally[status] = tally.get(status, 0) + 1

        if status in ('updated', 'would_update'):
            print(f" ✓ {outcome['city']} ({outcome['prefecture']}): {outcome['old_ghcid']} → {outcome['new_ghcid']}")
        elif status == 'error':
            print(f" ✗ {outcome['error']}")
            failures.append(outcome)
        elif status == 'collision':
            print(f" ⚠ {outcome['error']}")

    db.close()

    print()
    print('=' * 60)
    print('Summary:')
    print(f" Updated: {tally.get('updated', 0)}")
    print(f" Would update: {tally.get('would_update', 0)}")
    print(f" Errors: {tally.get('error', 0)}")
    print(f" Collisions: {tally.get('collision', 0)}")
    print(f" Skipped: {tally.get('skipped', 0)}")

    # Only the first ten failures are echoed to keep the report short.
    if failures:
        print()
        print('Files with errors (may need manual research):')
        for failed in failures[:10]:
            print(f" - {Path(failed['file']).name}: {failed['error']}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|