#!/usr/bin/env python3
"""
Geocode Japanese custodian files using postal codes.

Uses GeoNames postal code data (JP.txt) to add latitude/longitude to Japanese
custodian files that are missing coordinates.
"""

import argparse
import os  # unused here; kept so existing importers of this module are not broken
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

# NOTE: PyYAML is imported lazily inside _geocode_custodian_file(), the only
# place that reads/writes YAML, so the pure postal-code helpers below can be
# imported (and unit-tested) without PyYAML installed.

# Per-file outcomes reported by _geocode_custodian_file().
_UPDATED = 'updated'
_ALREADY_HAS = 'already_has'
_NOT_FOUND = 'not_found'
_ERROR = 'error'

# Characters that show up in place of the ASCII hyphen in Japanese postal codes:
# U+2010 HYPHEN, U+2011 NON-BREAKING HYPHEN, U+2012 FIGURE DASH,
# U+2013 EN DASH, U+2014 EM DASH, U+2015 HORIZONTAL BAR, U+2212 MINUS SIGN,
# U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK, U+FF0D FULLWIDTH HYPHEN-MINUS,
# U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK.
_HYPHEN_VARIANTS = '\u2010\u2011\u2012\u2013\u2014\u2015\u2212\u30fc\uff0d\uff70'

# Full-width digits U+FF10..U+FF19, in the same order as '0123456789'.
_FULLWIDTH_DIGITS = ''.join(chr(0xFF10 + i) for i in range(10))

# One-pass translation table: hyphen variants -> '-', full-width -> ASCII digits.
_POSTAL_TRANSLATION = str.maketrans(
    _HYPHEN_VARIANTS + _FULLWIDTH_DIGITS,
    '-' * len(_HYPHEN_VARIANTS) + '0123456789',
)

# U+3012 POSTAL MARK, frequently written in front of the code.
_POSTAL_MARK = '\u3012'

# Canonical "NNN-NNNN" form; the prefix fallbacks are only attempted for it.
_POSTAL_CODE_RE = re.compile(r'[0-9]{3}-[0-9]{4}')


def load_postal_codes(postal_file: Path) -> dict:
    """Load the Japanese postal code to coordinates mapping.

    The file is read as tab-separated text. Lines with fewer than 11 columns,
    or whose latitude/longitude columns are empty or not numeric, are skipped.
    When a postal code occurs on several lines, the first usable one wins.

    Args:
        postal_file: Path to the GeoNames postal code file (JP.txt).

    Returns:
        Dict mapping postal code (e.g. "490-1401") to a dict with the keys
        'latitude', 'longitude' (floats), 'place_name', 'prefecture', 'city'.
    """
    postal_map = {}

    with open(postal_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator so empty trailing columns survive.
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 11:
                continue

            postal_code = parts[1].strip()  # e.g., "490-1401"
            if postal_code in postal_map:
                continue  # keep the first match for each postal code

            try:
                latitude = float(parts[9])
                longitude = float(parts[10])
            except ValueError:
                # Empty or malformed coordinate columns.
                continue

            postal_map[postal_code] = {
                'latitude': latitude,
                'longitude': longitude,
                'place_name': parts[2],
                'prefecture': parts[3],  # e.g., "Aichi Ken"
                'city': parts[5],  # e.g., "Yatomi Shi"
            }

    return postal_map


def normalize_postal_code(postal_code: str) -> str:
    """Normalize a Japanese postal code to the ASCII "NNN-NNNN" form.

    Handles unicode hyphen/dash variants, full-width digits, surrounding
    whitespace, a leading postal mark (U+3012) and bare 7-digit codes
    ("1000001" -> "100-0001").

    Args:
        postal_code: Raw postal code text; may be empty or None.

    Returns:
        The normalized code. Falsy input is returned unchanged. Text that is
        not recognisable as a postal code is returned with only the character
        replacements and trimming applied.
    """
    if not postal_code:
        return postal_code

    normalized = postal_code.translate(_POSTAL_TRANSLATION).strip()
    normalized = normalized.lstrip(_POSTAL_MARK).strip()

    # Codes stored without a separator (e.g. parsed from YAML as an integer).
    if len(normalized) == 7 and normalized.isascii() and normalized.isdigit():
        normalized = f"{normalized[:3]}-{normalized[3:]}"

    return normalized


# Manual coordinates for remote Tokyo islands not in GeoNames
# These islands have 100-XXXX postal codes but are far from mainland Tokyo
MANUAL_POSTAL_COORDS = {
    # Hachijojima (八丈島) - remote island ~287km south of Tokyo
    '100-1511': {
        'latitude': 33.1038, 'longitude': 139.7977,
        'place_name': 'Hachijo-machi', 'prefecture': 'Tokyo To', 'city': 'Hachijo-machi',
    },
    '100-1512': {
        'latitude': 33.1038, 'longitude': 139.7977,
        'place_name': 'Hachijo-machi', 'prefecture': 'Tokyo To', 'city': 'Hachijo-machi',
    },
    # Miyakejima (三宅島) - remote island ~180km south of Tokyo
    '100-1101': {
        'latitude': 34.0784, 'longitude': 139.5299,
        'place_name': 'Miyake-mura', 'prefecture': 'Tokyo To', 'city': 'Miyake-mura',
    },
    '100-1102': {
        'latitude': 34.0784, 'longitude': 139.5299,
        'place_name': 'Miyake-mura', 'prefecture': 'Tokyo To', 'city': 'Miyake-mura',
    },
    '100-1212': {
        'latitude': 34.0784, 'longitude': 139.5299,
        'place_name': 'Miyake-mura', 'prefecture': 'Tokyo To', 'city': 'Miyake-mura',
    },
    # Aogashima (青ヶ島) - southernmost inhabited island
    '100-1701': {
        'latitude': 32.4557, 'longitude': 139.7567,
        'place_name': 'Aogashima-mura', 'prefecture': 'Tokyo To', 'city': 'Aogashima-mura',
    },
    # Ogasawara (小笠原) - Bonin Islands
    '100-2100': {
        'latitude': 27.0917, 'longitude': 142.2036,
        'place_name': 'Ogasawara-mura', 'prefecture': 'Tokyo To', 'city': 'Ogasawara-mura',
    },
    '100-2101': {
        'latitude': 27.0917, 'longitude': 142.2036,
        'place_name': 'Ogasawara-mura', 'prefecture': 'Tokyo To', 'city': 'Ogasawara-mura',
    },
}


def lookup_postal_code(postal_code: str, postal_map: dict) -> tuple:
    """Look up a postal code with fallback strategies.

    Strategies, in order:
      1. 'manual_override'  - entry in MANUAL_POSTAL_COORDS (remote islands)
      2. 'exact'            - exact key in postal_map
      3. 'prefix_fallback'  - "NNN-0000" for the same 3-digit prefix
      4. 'prefix5_fallback' - first postal_map key sharing the first 5 digits

    The fallbacks (3, 4) are only attempted for well-formed "NNN-NNNN" codes
    and never for the 100 prefix, because 100-XXXX covers remote islands that
    would otherwise be mapped onto central Tokyo.

    Args:
        postal_code: Normalized postal code; may be empty or None.
        postal_map: Mapping as returned by load_postal_codes().

    Returns:
        (coords_dict, method) or (None, None) if not found.
    """
    if not postal_code:
        return None, None

    # Check manual overrides first (for remote islands)
    if postal_code in MANUAL_POSTAL_COORDS:
        return MANUAL_POSTAL_COORDS[postal_code], 'manual_override'

    # Direct lookup
    if postal_code in postal_map:
        return postal_map[postal_code], 'exact'

    # Slicing a malformed code (e.g. "12345-67") would yield a bogus prefix
    # and geocode the file to an unrelated area, so require the exact shape.
    if not _POSTAL_CODE_RE.fullmatch(postal_code):
        return None, None

    prefix = postal_code[:3]

    # Skip 100-XXXX prefix fallback - these are remote islands
    if prefix == '100':
        return None, None

    # Try prefix fallback (e.g., 170-8445 -> 170-0000)
    fallback_code = f"{prefix}-0000"
    if fallback_code in postal_map:
        return postal_map[fallback_code], 'prefix_fallback'

    # Try 5-digit prefix fallback (e.g., 901-2720 -> 901-2701); the first
    # matching key in postal_map insertion order wins.
    prefix5 = postal_code[:6]  # "901-27"
    for code, coords in postal_map.items():
        if code.startswith(prefix5):
            return coords, 'prefix5_fallback'

    return None, None


def _entry_locations(data: dict) -> list:
    """Return the dict items of data['original_entry']['locations'], or []."""
    entry = data.get('original_entry')
    locations = entry.get('locations') if isinstance(entry, dict) else None
    if not isinstance(locations, list):
        return []
    return [loc for loc in locations if isinstance(loc, dict)]


def _geocode_custodian_file(filepath: Path, postal_map: dict, dry_run: bool = False) -> str:
    """Geocode one custodian file and report what happened.

    Returns:
        _UPDATED      - coordinates were written (or would be, in a dry run)
        _ALREADY_HAS  - some location already has latitude and longitude
        _NOT_FOUND    - no postal code present, or it could not be resolved
        _ERROR        - the file could not be read, parsed or written
    """
    import yaml  # third-party; see the NOTE at the top of the module

    # Per-file boundary of a batch job: report the problem and carry on with
    # the remaining files instead of aborting the whole run.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f"Error processing {filepath}: {e}", file=sys.stderr)
        return _ERROR

    if not isinstance(data, dict):
        print(f"Error processing {filepath}: not a YAML mapping", file=sys.stderr)
        return _ERROR

    locations = _entry_locations(data)

    # Check if already has coordinates
    if any(loc.get('latitude') and loc.get('longitude') for loc in locations):
        return _ALREADY_HAS

    # The first location carrying a postal code decides the coordinates.
    raw_code = next((loc['postal_code'] for loc in locations if loc.get('postal_code')), None)
    postal_code = normalize_postal_code(str(raw_code)) if raw_code else None
    if not postal_code:
        return _NOT_FOUND

    # Look up coordinates with fallback
    coords, method = lookup_postal_code(postal_code, postal_map)
    if not coords:
        return _NOT_FOUND

    lat = coords['latitude']
    lon = coords['longitude']

    if dry_run:
        print(f"Would update: {filepath.name}")
        print(f"  Postal code: {postal_code} (method: {method})")
        print(f"  Coordinates: {lat}, {lon}")
        return _UPDATED

    # Add coordinates to original_entry.locations
    for loc in locations:
        loc['latitude'] = lat
        loc['longitude'] = lon

    # Also add to top-level location if it exists
    if isinstance(data.get('location'), dict):
        data['location']['latitude'] = lat
        data['location']['longitude'] = lon

    # Add geocoding provenance
    if not isinstance(data.get('geocoding'), dict):
        data['geocoding'] = {}
    geocoding = data['geocoding']
    geocoding['method'] = 'POSTAL_CODE_LOOKUP'
    geocoding['lookup_method'] = method
    # Manual overrides exist precisely because GeoNames lacks those codes, so
    # GeoNames must not be credited as their source.
    geocoding['source'] = (
        'Manual override (not in GeoNames)' if method == 'manual_override'
        else 'GeoNames JP postal codes'
    )
    geocoding['postal_code'] = postal_code
    geocoding['resolved_place'] = coords['place_name']
    geocoding['resolved_city'] = coords['city']
    geocoding['timestamp'] = datetime.now(timezone.utc).isoformat()

    try:
        # Serialize before opening the file for writing: a failing dump must
        # not leave a truncated custodian file behind.
        serialized = yaml.dump(data, allow_unicode=True, sort_keys=False,
                               default_flow_style=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except Exception as e:
        print(f"Error processing {filepath}: {e}", file=sys.stderr)
        return _ERROR

    return _UPDATED


def update_custodian_file(filepath: Path, postal_map: dict, dry_run: bool = False) -> bool:
    """Update a custodian file with coordinates from postal code lookup.

    The coordinates resolved from the first postal code found in
    original_entry.locations are written to every entry of that list and to
    the top-level 'location' mapping (if present); a 'geocoding' provenance
    section is added. Files where some location already has both latitude and
    longitude are left untouched. Problems reading or writing the file are
    reported on stderr and never raised.

    Args:
        filepath: Path of the custodian YAML file.
        postal_map: Mapping as returned by load_postal_codes().
        dry_run: If true, only print what would be changed.

    Returns:
        True if the file was updated (or would be, in a dry run), False otherwise.
    """
    return _geocode_custodian_file(filepath, postal_map, dry_run) == _UPDATED


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to sys.argv[1:].

    Exits with status 1 if the GeoNames postal code file is missing.
    """
    parser = argparse.ArgumentParser(description='Geocode Japanese custodian files using postal codes')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0 = all)')
    parser.add_argument('--file-list', type=Path, default=Path('/tmp/jp_missing.txt'),
                        help='Text file listing custodian files to process, one path per line; '
                             'if it does not exist, all JP-*.yaml files are scanned '
                             '(default: %(default)s)')
    args = parser.parse_args(argv)

    # Paths
    project_root = Path(__file__).resolve().parent.parent
    custodian_dir = project_root / 'data' / 'custodian'
    postal_file = project_root / 'data' / 'reference' / 'JP.txt'
    missing_list = args.file_list

    if not postal_file.exists():
        print(f"Error: Postal code file not found: {postal_file}", file=sys.stderr)
        print("Download from: https://download.geonames.org/export/zip/JP.zip", file=sys.stderr)
        sys.exit(1)

    print("Loading postal code database...")
    postal_map = load_postal_codes(postal_file)
    print(f"Loaded {len(postal_map):,} postal codes")

    # Get list of files to process
    if missing_list.exists():
        print(f"Using file list from {missing_list}")
        with open(missing_list, encoding='utf-8') as f:
            files = [Path(line.strip()) for line in f if line.strip()]
    else:
        print("Scanning for JP-*.yaml files without coordinates...")
        # Sorted so that --limit selects the same files on every run.
        files = sorted(custodian_dir.glob('JP-*.yaml'))

    if args.limit > 0:
        files = files[:args.limit]

    print(f"Processing {len(files)} files...")

    counts = Counter()
    for filepath in files:
        if not filepath.exists():
            print(f"File not found: {filepath}", file=sys.stderr)
            counts[_ERROR] += 1
            continue
        counts[_geocode_custodian_file(filepath, postal_map, args.dry_run)] += 1

    print()
    print("=" * 50)
    print("JAPAN POSTAL CODE GEOCODING RESULTS")
    print("=" * 50)
    print(f"Files processed: {len(files)}")
    print(f"Updated: {counts[_UPDATED]}")
    print(f"Already had coords: {counts[_ALREADY_HAS]}")
    print(f"Postal code not found: {counts[_NOT_FOUND]}")
    print(f"Errors: {counts[_ERROR]}")

    if args.dry_run:
        print()
        print("(Dry run - no files were modified)")


if __name__ == '__main__':
    main()