glam/scripts/geocode_japan_postal.py
kempersc 6e2c36413e geocode: add coordinates to 540 Japanese custodian files using postal codes
- Download GeoNames JP postal code database (142K entries)
- Create geocode_japan_postal.py with postal code lookup
- Handle unicode hyphen variants in postal codes
- Add manual mappings for remote Tokyo islands (Hachijojima, Miyakejima)
- Implement prefix fallback for company postal codes
- Total JP files geocoded: 540 (99.81% coverage)

This brings overall geocoding coverage from 97.84% to 99.81%
2025-12-10 00:27:33 +01:00

283 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Geocode Japanese custodian files using postal codes.
Uses GeoNames postal code data (JP.txt) to add latitude/longitude
to Japanese custodian files that are missing coordinates.
"""
import os
import sys
import yaml
from pathlib import Path
from datetime import datetime, timezone
def load_postal_codes(postal_file: Path) -> dict:
    """Build a postal-code lookup table from a tab-separated postal code file.

    The file is read as UTF-8, one record per line. A record is ignored when
    it has fewer than 11 tab-separated columns, when its latitude or
    longitude column is empty, or when either value does not parse as a
    float. If a postal code appears on several records, only the first
    usable record is kept.

    Args:
        postal_file: Path of the postal code file (e.g. GeoNames JP.txt).

    Returns:
        dict mapping the postal code string (column 1, e.g. "490-1401") to a
        dict with the keys 'latitude' and 'longitude' (floats) and
        'place_name', 'prefecture' and 'city' (strings taken from columns
        2, 3 and 5).
    """
    table = {}
    with open(postal_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) < 11:
                continue

            code = fields[1]        # e.g., "490-1401"
            place = fields[2]
            pref = fields[3]        # e.g., "Aichi Ken"
            town = fields[5]        # e.g., "Yatomi Shi"
            lat_text, lon_text = fields[9], fields[10]
            if not (lat_text and lon_text):
                continue

            try:
                lat_value = float(lat_text)
                lon_value = float(lon_text)
            except ValueError:
                continue

            # Keep only the first record seen for a given postal code.
            if code in table:
                continue
            table[code] = {
                'latitude': lat_value,
                'longitude': lon_value,
                'place_name': place,
                'prefecture': pref,
                'city': town,
            }
    return table
# Translation table used by normalize_postal_code():
#   * dash-like characters -> ASCII hyphen-minus
#       U+2010 HYPHEN, U+2011 NON-BREAKING HYPHEN, U+2012 FIGURE DASH,
#       U+2013 EN DASH, U+2014 EM DASH, U+2015 HORIZONTAL BAR,
#       U+2212 MINUS SIGN, U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK,
#       U+FF0D FULLWIDTH HYPHEN-MINUS,
#       U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK,
#       U+FE63 SMALL HYPHEN-MINUS
#   * full-width digits U+FF10..U+FF19 -> ASCII digits 0..9
_POSTAL_CODE_TRANSLATION = str.maketrans({
    **{
        dash: '-'
        for dash in (
            '\u2010\u2011\u2012\u2013\u2014\u2015'
            '\u2212\u30fc\uff0d\uff70\ufe63'
        )
    },
    **{chr(0xFF10 + digit): str(digit) for digit in range(10)},
})


def normalize_postal_code(postal_code: str) -> str:
    """Normalize a Japanese postal code to the ASCII "NNN-NNNN" form.

    Steps, in order:
      1. Replace unicode dash variants with '-' and full-width digits with
         ASCII digits (single pass via ``str.translate``).
      2. Strip surrounding whitespace.
      3. If what remains is exactly seven ASCII digits, insert the hyphen
         after the third digit so it matches hyphenated lookup keys.

    Any other input is returned as translated/stripped, without further
    validation.

    Args:
        postal_code: Raw postal code text. Falsy values (``None``, ``""``)
            are returned unchanged.

    Returns:
        The normalized postal code string (or the original falsy value).
    """
    if not postal_code:
        return postal_code

    code = postal_code.translate(_POSTAL_CODE_TRANSLATION).strip()

    # isascii() guards against non-ASCII characters that str.isdigit()
    # also accepts (e.g. superscript digits).
    if len(code) == 7 and code.isascii() and code.isdigit():
        code = f"{code[:3]}-{code[3:]}"
    return code
# Hand-maintained coordinates for remote Tokyo islands not in GeoNames.
# These islands have 100-XXXX postal codes but are far from mainland Tokyo.
# Each row: (municipality, latitude, longitude, postal codes sharing that point)
_REMOTE_ISLAND_SITES = (
    # Hachijojima (八丈島) - remote island ~287km south of Tokyo
    ('Hachijo-machi', 33.1038, 139.7977, ('100-1511', '100-1512')),
    # Miyakejima (三宅島) - remote island ~180km south of Tokyo
    ('Miyake-mura', 34.0784, 139.5299, ('100-1101', '100-1102', '100-1212')),
    # Aogashima (青ヶ島) - southernmost inhabited island
    ('Aogashima-mura', 32.4557, 139.7567, ('100-1701',)),
    # Ogasawara (小笠原) - Bonin Islands
    ('Ogasawara-mura', 27.0917, 142.2036, ('100-2100', '100-2101')),
)

# postal code -> coordinate record, same shape as the entries produced by
# load_postal_codes(). A separate dict object is created for every code.
MANUAL_POSTAL_COORDS = {
    code: {
        'latitude': lat,
        'longitude': lon,
        'place_name': municipality,
        'prefecture': 'Tokyo To',
        'city': municipality,
    }
    for municipality, lat, lon, codes in _REMOTE_ISLAND_SITES
    for code in codes
}
def lookup_postal_code(postal_code: str, postal_map: dict) -> tuple:
    """Resolve a postal code to a coordinate record, trying several strategies.

    Strategies, in order:
      1. 'manual_override'  - entry in MANUAL_POSTAL_COORDS.
      2. 'exact'            - entry in ``postal_map``.
      3. 'prefix_fallback'  - "<first 3 chars>-0000" in ``postal_map``
                              (e.g., 170-8445 -> 170-0000).
      4. 'prefix5_fallback' - first key of ``postal_map`` (in its iteration
                              order) that starts with the first six
                              characters (e.g., 901-2720 -> 901-2701).

    Strategies 3 and 4 are only attempted for 8-character codes containing
    a hyphen, and never for codes starting with "100".

    Returns:
        ``(coords_dict, method)`` on success, ``(None, None)`` otherwise.
    """
    # Exact hits: manual overrides take precedence over the loaded table.
    exact_sources = (
        (MANUAL_POSTAL_COORDS, 'manual_override'),
        (postal_map, 'exact'),
    )
    for table, label in exact_sources:
        if postal_code in table:
            return table[postal_code], label

    looks_hyphenated = len(postal_code) == 8 and '-' in postal_code
    if not looks_hyphenated:
        return None, None

    area = postal_code[:3]
    # No fallback for 100-XXXX: it would incorrectly map remote islands to
    # central Tokyo.
    # NOTE(review): this blocks fallback for every 100-XXXX code, not only
    # island codes - confirm that is intended.
    if area == '100':
        return None, None

    area_default = f"{area}-0000"
    if area_default in postal_map:
        return postal_map[area_default], 'prefix_fallback'

    # Linear scan over all known codes for one sharing "NNN-NN".
    stem = postal_code[:6]  # "901-27"
    neighbour = next((key for key in postal_map if key.startswith(stem)), None)
    if neighbour is not None:
        return postal_map[neighbour], 'prefix5_fallback'

    return None, None
def update_custodian_file(filepath: Path, postal_map: dict, dry_run: bool = False) -> bool:
    """Add coordinates to one custodian YAML file via postal code lookup.

    The file is skipped (returns False) when it is empty or not a mapping,
    when any entry of ``original_entry.locations`` already has both a
    latitude and a longitude, when no location carries a postal code, or
    when the postal code cannot be resolved.

    Otherwise the coordinates resolved for the first postal code found are
    written to every location entry, to the top-level ``location`` mapping
    (if present), and provenance is recorded under ``geocoding``.

    Args:
        filepath: YAML file to update in place.
        postal_map: Lookup table as produced by ``load_postal_codes``.
        dry_run: When True, only print what would change; the file is not
            modified.

    Returns:
        True if the file was updated (or would be, in dry-run mode),
        False otherwise. Errors are reported on stderr and yield False.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if not isinstance(data, dict):
            return False

        # YAML keys with an empty value load as None, so never chain .get()
        # on the defaults; keep only mapping-shaped location entries.
        entry = data.get('original_entry')
        raw_locations = entry.get('locations') if isinstance(entry, dict) else None
        if not isinstance(raw_locations, list):
            raw_locations = []
        locations = [loc for loc in raw_locations if isinstance(loc, dict)]

        # Check if already has coordinates
        if any(loc.get('latitude') and loc.get('longitude') for loc in locations):
            return False

        # Use the first postal code present on any location.
        postal_code = None
        for loc in locations:
            pc = loc.get('postal_code')
            if pc:
                postal_code = normalize_postal_code(str(pc))
                break
        if not postal_code:
            return False

        # Look up coordinates with fallback
        coords, method = lookup_postal_code(postal_code, postal_map)
        if not coords:
            return False
        lat = coords['latitude']
        lon = coords['longitude']

        if dry_run:
            print(f"Would update: {filepath.name}")
            print(f" Postal code: {postal_code} (method: {method})")
            print(f" Coordinates: {lat}, {lon}")
            return True

        # NOTE(review): every location receives the coordinates of the first
        # postal code, even if it carries a different postal code itself -
        # confirm this is intended for multi-location custodians.
        for loc in locations:
            loc['latitude'] = lat
            loc['longitude'] = lon

        # Also add to top-level location if it exists (and is a mapping)
        top_location = data.get('location')
        if isinstance(top_location, dict):
            top_location['latitude'] = lat
            top_location['longitude'] = lon

        # Add geocoding provenance, reusing an existing mapping if present
        geocoding = data.get('geocoding')
        if not isinstance(geocoding, dict):
            geocoding = {}
            data['geocoding'] = geocoding
        geocoding.update({
            'method': 'POSTAL_CODE_LOOKUP',
            'lookup_method': method,
            'source': 'GeoNames JP postal codes',
            'postal_code': postal_code,
            'resolved_place': coords['place_name'],
            'resolved_city': coords['city'],
            'timestamp': datetime.now(timezone.utc).isoformat(),
        })

        # Serialize BEFORE opening for writing: mode 'w' truncates the file,
        # so a failure inside yaml.dump must not be able to destroy it.
        serialized = yaml.dump(
            data, allow_unicode=True, sort_keys=False, default_flow_style=False
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)
        return True
    except Exception as e:
        # Deliberate best-effort boundary: one bad file must not abort a
        # batch run; report it and carry on.
        print(f"Error processing {filepath}: {e}", file=sys.stderr)
        return False
def main():
    """Command-line entry point: geocode JP custodian files and print a summary.

    Options:
        --dry-run  Report what would be changed without writing files.
        --limit N  Process at most N files (0 = all).

    Files come from /tmp/jp_missing.txt (one path per line) when that file
    exists; otherwise every ``JP-*.yaml`` under ``data/custodian`` is
    processed in sorted order. Exits with status 1 if the postal code
    database is missing.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Geocode Japanese custodian files using postal codes')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0 = all)')
    args = parser.parse_args()

    # Paths
    project_root = Path(__file__).parent.parent
    custodian_dir = project_root / 'data' / 'custodian'
    postal_file = project_root / 'data' / 'reference' / 'JP.txt'
    missing_list = Path('/tmp/jp_missing.txt')

    if not postal_file.exists():
        # Fatal diagnostics go to stderr, like the per-file error reports.
        print(f"Error: Postal code file not found: {postal_file}", file=sys.stderr)
        print("Download from: https://download.geonames.org/export/zip/JP.zip", file=sys.stderr)
        sys.exit(1)

    print("Loading postal code database...")
    postal_map = load_postal_codes(postal_file)
    print(f"Loaded {len(postal_map):,} postal codes")

    # Get list of files to process
    if missing_list.exists():
        print(f"Using file list from {missing_list}")
        with open(missing_list, encoding='utf-8') as f:
            files = [Path(line.strip()) for line in f if line.strip()]
    else:
        print("Scanning for JP-*.yaml files without coordinates...")
        # Sorted so that --limit selects a reproducible subset.
        files = sorted(custodian_dir.glob('JP-*.yaml'))

    if args.limit > 0:
        files = files[:args.limit]
    print(f"Processing {len(files)} files...")

    updated = 0
    not_found = 0
    already_has = 0
    errors = 0
    for filepath in files:
        if not filepath.exists():
            errors += 1
            continue

        if update_custodian_file(filepath, postal_map, args.dry_run):
            updated += 1
            continue

        # Not updated: re-read the file to classify the reason.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            locations = data.get('original_entry', {}).get('locations', [])
            # Same criterion as update_custodian_file: latitude AND longitude.
            has_coords = any(
                loc.get('latitude') and loc.get('longitude') for loc in locations
            )
        except Exception:
            # Not a bare except: Ctrl-C / SystemExit must still stop the run.
            errors += 1
            continue

        if has_coords:
            already_has += 1
        else:
            not_found += 1

    print()
    print("=" * 50)
    print("JAPAN POSTAL CODE GEOCODING RESULTS")
    print("=" * 50)
    print(f"Files processed: {len(files)}")
    print(f"Updated: {updated}")
    print(f"Already had coords: {already_has}")
    print(f"Postal code not found: {not_found}")
    print(f"Errors: {errors}")
    if args.dry_run:
        print()
        print("(Dry run - no files were modified)")


if __name__ == '__main__':
    main()