- Download GeoNames JP postal code database (142K entries)
- Create geocode_japan_postal.py with postal code lookup
- Handle Unicode hyphen variants in postal codes
- Add manual mappings for remote Tokyo islands (Hachijojima, Miyakejima)
- Implement prefix fallback for company postal codes
- Total JP files geocoded: 540 (99.81% coverage)

This brings overall geocoding coverage from 97.84% to 99.81%.
283 lines
10 KiB
Python
283 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Geocode Japanese custodian files using postal codes.
|
|
|
|
Uses GeoNames postal code data (JP.txt) to add latitude/longitude
|
|
to Japanese custodian files that are missing coordinates.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
def load_postal_codes(postal_file: Path) -> dict:
    """Build a postal-code lookup table from the GeoNames JP.txt dump.

    The file is read as UTF-8, one tab-separated record per line. A row is
    ignored when it has fewer than 11 columns, when its latitude or
    longitude column is empty, or when either value does not parse as a
    float. If a postal code occurs on several rows, only the first usable
    row is kept.

    Args:
        postal_file: Path of the tab-separated postal code file.

    Returns:
        A dict mapping each postal code (e.g. "490-1401") to a dict with
        the keys 'latitude' and 'longitude' (floats) and 'place_name',
        'prefecture' and 'city' (strings).
    """
    table = {}

    with open(postal_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split('\t')
            if len(columns) < 11:
                continue

            # Columns used: 1 = postal code, 2 = place name,
            # 3 = prefecture (e.g. "Aichi Ken"), 5 = city (e.g. "Yatomi Shi"),
            # 9 = latitude, 10 = longitude.
            code, place, pref, town = columns[1], columns[2], columns[3], columns[5]
            lat_text, lon_text = columns[9], columns[10]
            if not (lat_text and lon_text):
                continue

            try:
                latitude, longitude = float(lat_text), float(lon_text)
            except ValueError:
                continue

            # setdefault leaves an existing entry untouched, so the first
            # record seen for a postal code wins.
            table.setdefault(code, {
                'latitude': latitude,
                'longitude': longitude,
                'place_name': place,
                'prefecture': pref,
                'city': town,
            })

    return table
|
|
|
|
|
|
# Dash look-alikes that show up in place of the ASCII hyphen in postal codes.
_POSTAL_DASH_VARIANTS = (
    '\u2010'  # HYPHEN
    '\u2011'  # NON-BREAKING HYPHEN
    '\u2012'  # FIGURE DASH
    '\u2013'  # EN DASH
    '\u2014'  # EM DASH
    '\u2015'  # HORIZONTAL BAR
    '\u2212'  # MINUS SIGN
    '\u30fc'  # KATAKANA-HIRAGANA PROLONGED SOUND MARK
    '\ufe63'  # SMALL HYPHEN-MINUS
    '\uff0d'  # FULLWIDTH HYPHEN-MINUS
    '\uff70'  # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
)

# One-pass character rewrite: dash variants -> '-', full-width digits ->
# ASCII digits, and the postal mark is removed.
_POSTAL_CODE_TRANSLATION = str.maketrans({
    **dict.fromkeys(_POSTAL_DASH_VARIANTS, '-'),
    **{chr(0xFF10 + digit): str(digit) for digit in range(10)},
    '\u3012': None,  # POSTAL MARK
})


def normalize_postal_code(postal_code: str) -> str:
    """Normalize a Japanese postal code to the ASCII "NNN-NNNN" form.

    The following rewrites are applied, in order:

    * Unicode dash variants (see ``_POSTAL_DASH_VARIANTS``) become '-'.
    * Full-width digits (U+FF10..U+FF19) become ASCII digits.
    * The postal mark (U+3012) is removed.
    * Leading and trailing whitespace is stripped.
    * A bare run of exactly seven ASCII digits gets a hyphen inserted
      after the third digit.

    Anything else is returned as-is after those rewrites; no validation
    is performed.

    Args:
        postal_code: Raw postal code text. Falsy values (``None``, ``""``)
            are returned unchanged.

    Returns:
        The normalized postal code string, or the original falsy value.
    """
    if not postal_code:
        return postal_code

    normalized = postal_code.translate(_POSTAL_CODE_TRANSLATION).strip()

    # Codes written without a separator ("4901401") can never match the
    # hyphenated lookup keys, so put the hyphen back.
    if len(normalized) == 7 and all(ch in '0123456789' for ch in normalized):
        normalized = f"{normalized[:3]}-{normalized[3:]}"

    return normalized
|
|
|
|
|
|
# Hand-maintained coordinates for remote Tokyo island municipalities.
# Their postal codes share the 100-XXXX prefix but the islands lie far from
# mainland Tokyo, so each municipality gets one fixed point.
# Row layout: (municipality, latitude, longitude, postal codes).
_REMOTE_ISLAND_MUNICIPALITIES = (
    # Hachijojima (八丈島) - remote island ~287km south of Tokyo
    ('Hachijo-machi', 33.1038, 139.7977, ('100-1511', '100-1512')),
    # Miyakejima (三宅島) - remote island ~180km south of Tokyo
    ('Miyake-mura', 34.0784, 139.5299, ('100-1101', '100-1102', '100-1212')),
    # Aogashima (青ヶ島) - south of Hachijojima
    ('Aogashima-mura', 32.4557, 139.7567, ('100-1701',)),
    # Ogasawara (小笠原) - Bonin Islands
    ('Ogasawara-mura', 27.0917, 142.2036, ('100-2100', '100-2101')),
)

# postal code -> record with the same keys as the entries produced by
# load_postal_codes(); place_name and city are both the municipality name.
MANUAL_POSTAL_COORDS = {
    code: {
        'latitude': lat,
        'longitude': lon,
        'place_name': municipality,
        'prefecture': 'Tokyo To',
        'city': municipality,
    }
    for municipality, lat, lon, codes in _REMOTE_ISLAND_MUNICIPALITIES
    for code in codes
}
|
|
|
|
|
|
def lookup_postal_code(postal_code: str, postal_map: dict) -> tuple:
    """Resolve a postal code to coordinates, trying progressively looser matches.

    Strategies, in order (the second tuple element names the one that hit):

    1. 'manual_override'  - the code is a key of ``MANUAL_POSTAL_COORDS``.
    2. 'exact'            - the code is a key of ``postal_map``.
    3. 'prefix_fallback'  - the "NNN-0000" entry for the code's first three
       digits (e.g. 170-8445 -> 170-0000). Never used for prefix 100.
    4. 'prefix5_fallback' - the first key of ``postal_map`` (in iteration
       order) that shares the code's first six characters
       (e.g. 901-2720 -> 901-2701). Never used for prefix 100.

    The two fallbacks are only attempted for codes shaped like "NNN-NNNN";
    anything else (wrong length, hyphen in the wrong place, non-digits)
    returns ``(None, None)`` instead of being matched on a meaningless prefix.

    Args:
        postal_code: Normalized postal code. Falsy values resolve to nothing.
        postal_map: Mapping of postal code -> coordinate record, as built by
            ``load_postal_codes``.

    Returns:
        ``(coords_dict, method)`` on success, ``(None, None)`` otherwise.
    """
    if not postal_code:
        return None, None

    # Manual overrides win over the database (remote islands).
    if postal_code in MANUAL_POSTAL_COORDS:
        return MANUAL_POSTAL_COORDS[postal_code], 'manual_override'

    if postal_code in postal_map:
        return postal_map[postal_code], 'exact'

    # Fallbacks slice the code by position, so require the exact
    # "NNN-NNNN" shape rather than just "8 characters with a hyphen".
    area, separator, local = postal_code.partition('-')
    well_formed = (
        separator == '-'
        and len(area) == 3
        and len(local) == 4
        and (area + local).isdigit()
    )
    if not well_formed:
        return None, None

    # Prefix 100 is shared with the remote islands, so a prefix match would
    # place them in central Tokyo; give up instead.
    # NOTE(review): this also disables both fallbacks for every other
    # 100-XXXX code missing from postal_map - confirm that is intended.
    if area == '100':
        return None, None

    district_default = f"{area}-0000"
    if district_default in postal_map:
        return postal_map[district_default], 'prefix_fallback'

    # Linear scan; the winner is whichever matching key was inserted first.
    stem = postal_code[:6]  # e.g. "901-27"
    for candidate, record in postal_map.items():
        if candidate.startswith(stem):
            return record, 'prefix5_fallback'

    return None, None
|
|
|
|
|
|
def update_custodian_file(filepath: Path, postal_map: dict, dry_run: bool = False) -> bool:
    """Add postal-code-derived coordinates to one custodian YAML file.

    The file is left untouched (and False is returned) when:

    * it is empty,
    * any entry of ``original_entry.locations`` already has both a
      latitude and a longitude,
    * no location carries a postal code, or
    * the first postal code found cannot be resolved.

    Otherwise the coordinates resolved from the first postal code are
    written to every location that has no postal code or the same one, to
    the top-level ``location`` mapping if present, and a ``geocoding``
    provenance record is added. A location with a *different* postal code
    is geocoded from its own code, or left without coordinates if that
    code cannot be resolved. The whole document is then re-serialized with
    ``yaml.dump``, so the original formatting is not preserved.

    Args:
        filepath: Custodian YAML file to update in place.
        postal_map: Mapping of postal code -> coordinate record, as built
            by ``load_postal_codes``.
        dry_run: When True, only print what would be done; nothing is
            written.

    Returns:
        True if the file was updated (or would be, in a dry run). False if
        it was skipped or an error occurred; errors are reported on stderr
        and never raised.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            return False
        if not isinstance(data, dict):
            raise TypeError(
                f"expected a mapping at the top level, got {type(data).__name__}"
            )

        # `or` also covers keys that are present but null in the YAML.
        original_entry = data.get('original_entry') or {}
        locations = [
            loc for loc in (original_entry.get('locations') or [])
            if isinstance(loc, dict)
        ]

        # Never overwrite coordinates that are already there.
        if any(loc.get('latitude') and loc.get('longitude') for loc in locations):
            return False

        # The first location with a postal code drives the lookup.
        postal_code = next(
            (
                normalize_postal_code(str(loc['postal_code']))
                for loc in locations
                if loc.get('postal_code')
            ),
            None,
        )
        if not postal_code:
            return False

        coords, method = lookup_postal_code(postal_code, postal_map)
        if not coords:
            return False

        lat = coords['latitude']
        lon = coords['longitude']

        if dry_run:
            print(f"Would update: {filepath.name}")
            print(f"  Postal code: {postal_code} (method: {method})")
            print(f"  Coordinates: {lat}, {lon}")
            return True

        for loc in locations:
            own_raw = loc.get('postal_code')
            own_code = normalize_postal_code(str(own_raw)) if own_raw else ''
            if own_code and own_code != postal_code:
                # A different postal code means a different place: use its
                # own lookup instead of copying the primary coordinates.
                own_coords, _ = lookup_postal_code(own_code, postal_map)
                if not own_coords:
                    continue
                loc['latitude'] = own_coords['latitude']
                loc['longitude'] = own_coords['longitude']
            else:
                loc['latitude'] = lat
                loc['longitude'] = lon

        # Mirror the primary coordinates on the top-level location, if any.
        if isinstance(data.get('location'), dict):
            data['location']['latitude'] = lat
            data['location']['longitude'] = lon

        # Provenance: record how and when the coordinates were obtained.
        if data.get('geocoding') is None:
            data['geocoding'] = {}
        data['geocoding'].update({
            'method': 'POSTAL_CODE_LOOKUP',
            'lookup_method': method,
            'source': 'GeoNames JP postal codes',
            'postal_code': postal_code,
            'resolved_place': coords['place_name'],
            'resolved_city': coords['city'],
            'timestamp': datetime.now(timezone.utc).isoformat(),
        })

        # Serialize before opening for write so a serialization failure
        # cannot leave a truncated file behind.
        serialized = yaml.dump(
            data, allow_unicode=True, sort_keys=False, default_flow_style=False
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

        return True

    except Exception as e:
        # Per-file boundary: report and move on so one bad file does not
        # abort a batch run.
        print(f"Error processing {filepath}: {e}", file=sys.stderr)
        return False
|
|
|
|
|
|
def main():
    """Command-line entry point: geocode JP custodian files by postal code.

    Loads ``data/reference/JP.txt`` (relative to the project root, two
    levels above this file) and runs ``update_custodian_file`` over the
    selected files, then prints a summary.

    File selection: if the ``--missing-list`` file exists, its non-blank
    lines are used as the paths to process; otherwise every ``JP-*.yaml``
    in ``data/custodian`` is processed in sorted order. ``--limit N`` keeps
    only the first N files; ``--dry-run`` reports without writing.

    Exits with status 1 if the postal code file is missing.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Geocode Japanese custodian files using postal codes')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0 = all)')
    parser.add_argument(
        '--missing-list',
        type=Path,
        default=Path('/tmp/jp_missing.txt'),
        help='Text file with one custodian file path per line; used instead of scanning when it exists',
    )
    args = parser.parse_args()

    # Paths
    project_root = Path(__file__).parent.parent
    custodian_dir = project_root / 'data' / 'custodian'
    postal_file = project_root / 'data' / 'reference' / 'JP.txt'
    missing_list = args.missing_list

    if not postal_file.exists():
        print(f"Error: Postal code file not found: {postal_file}", file=sys.stderr)
        print("Download from: https://download.geonames.org/export/zip/JP.zip", file=sys.stderr)
        sys.exit(1)

    print("Loading postal code database...")
    postal_map = load_postal_codes(postal_file)
    print(f"Loaded {len(postal_map):,} postal codes")

    # Get list of files to process
    if missing_list.exists():
        print(f"Using file list from {missing_list}")
        with open(missing_list, encoding='utf-8') as f:
            files = [Path(line.strip()) for line in f if line.strip()]
    else:
        print("Scanning for JP-*.yaml files without coordinates...")
        # Sorted so that --limit selects the same files on every run.
        files = sorted(custodian_dir.glob('JP-*.yaml'))

    if args.limit > 0:
        files = files[:args.limit]

    print(f"Processing {len(files)} files...")

    updated = 0
    not_found = 0
    already_has = 0
    errors = 0

    for filepath in files:
        if not filepath.exists():
            errors += 1
            continue

        if update_custodian_file(filepath, postal_map, args.dry_run):
            updated += 1
            continue

        # update_custodian_file only reports success or failure, so re-read
        # the file to tell "already geocoded" from "postal code not found".
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception:
            # Unreadable or unparsable file. Exception, not a bare except,
            # so Ctrl-C still stops the run.
            errors += 1
            continue

        if not isinstance(data, dict):
            # Empty or non-mapping document.
            errors += 1
            continue

        original_entry = data.get('original_entry') or {}
        locations = original_entry.get('locations') or []
        # Same criterion update_custodian_file uses to skip a file.
        has_coords = any(
            isinstance(loc, dict) and loc.get('latitude') and loc.get('longitude')
            for loc in locations
        )

        if has_coords:
            already_has += 1
        else:
            not_found += 1

    print()
    print("=" * 50)
    print("JAPAN POSTAL CODE GEOCODING RESULTS")
    print("=" * 50)
    print(f"Files processed: {len(files)}")
    print(f"Updated: {updated}")
    print(f"Already had coords: {already_has}")
    print(f"Postal code not found: {not_found}")
    print(f"Errors: {errors}")

    if args.dry_run:
        print()
        print("(Dry run - no files were modified)")
|
|
|
|
|
|
# Script entry point: run the geocoder only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|