glam/scripts/geocode_cz_from_ghcid.py
kempersc 2137c522db geocode: add coordinates to JP compound cities and CZ files from GeoNames
- JP: Handle Gun/Cho/Machi/Mura compound city names (2615 files)
- CZ: Map city codes to GeoNames entries (667 files)
- Overall coverage: 84.5% → 96.4%
2025-12-09 21:49:40 +01:00

321 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Geocode Czech Files Using GHCID City Codes
Many Czech custodian files have empty locations but the city code is in the GHCID.
This script uses the GHCID city code to look up coordinates in GeoNames.
Czech city codes mapping:
- PRA = Praha (Prague)
- BRN = Brno
- OST = Ostrava
- PLZ = Plzen
- etc.
Usage:
python scripts/geocode_cz_from_ghcid.py --dry-run
python scripts/geocode_cz_from_ghcid.py --all
"""
import argparse
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from ruamel.yaml import YAML
# Setup ruamel.yaml
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 120
# Configuration
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
# Czech city code to GeoNames name mapping
CZ_CITY_CODES = {
'PRA': ['Prague', 'Praha'],
'BRN': ['Brno'],
'OST': ['Ostrava'],
'PLZ': ['Plzen', 'Pilsen'],
'LIB': ['Liberec'],
'OLO': ['Olomouc'],
'UST': ['Usti nad Labem'],
'HRA': ['Hradec Kralove'],
'PAR': ['Pardubice'],
'ZLI': ['Zlin'],
'HAV': ['Havirov'],
'KLA': ['Kladno'],
'MOS': ['Most'],
'KAR': ['Karlovy Vary'],
'FRY': ['Frydek-Mistek'],
'OPA': ['Opava'],
'DEC': ['Decin'],
'TEP': ['Teplice'],
'JIH': ['Jihlava'],
'CEL': ['Ceske Budejovice'],
'CHO': ['Cheb'],
'PRE': ['Prerov'],
'PRO': ['Prostejov'],
'KRO': ['Kromeriz'],
'VYS': ['Vyskov'],
'HOR': ['Horice'],
'TAB': ['Tabor'],
'PIS': ['Pisek'],
'STR': ['Strakonice'],
'KUT': ['Kutna Hora'],
'TEL': ['Telc'],
'TRE': ['Trebic'],
'ZNO': ['Znojmo'],
# Additional cities from errors
'MBM': ['Mlada Boleslav'],
'MLB': ['Mlada Boleslav'],
'PRI': ['Pribram'],
'CBU': ['Ceske Budejovice'],
'MVO': ['Mladovozice', 'Mlada Vozice'],
'TRO': ['Trnova', 'Tabor'],
'CLE': ['Ceska Lipa'],
'KVL': ['Karlovy Vary'],
'CRA': ['Cerna Hora', 'Hradec Kralove'],
'LNP': ['Liberec'],
'NMS': ['Nove Mesto nad Metuji'],
'SPR': ['Spindleruv Mlyn', 'Hradec Kralove'],
'ZBR': ['Zbiroh'],
'BEL': ['Bela pod Bezdezem'],
'DKR': ['Dvur Kralove'],
'HKR': ['Hradec Kralove'],
'KUK': ['Kutna Hora'],
'TRU': ['Trutnov'],
'MTR': ['Moravska Trebova'],
'SKU': ['Skutec'],
# Add more as needed
}
class GeoNamesLookup:
"""GeoNames database lookup."""
def __init__(self, db_path: Path):
self.conn = sqlite3.connect(db_path)
self.conn.row_factory = sqlite3.Row
def lookup_city(self, names: list[str], country_code: str = "CZ") -> Optional[dict]:
"""Look up city by list of possible names."""
for name in names:
cursor = self.conn.execute("""
SELECT geonames_id, name, ascii_name, latitude, longitude,
admin1_code, admin1_name, feature_code, population
FROM cities
WHERE country_code = ?
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
ORDER BY population DESC
LIMIT 1
""", (country_code, name, name))
row = cursor.fetchone()
if row:
return {
'geonames_id': row['geonames_id'],
'geonames_name': row['name'],
'latitude': row['latitude'],
'longitude': row['longitude'],
'admin1_code': row['admin1_code'],
'feature_code': row['feature_code'],
'population': row['population'],
'matched_name': name
}
return None
def close(self):
self.conn.close()
def has_coordinates(data: dict) -> bool:
"""Check if file already has coordinates."""
loc = data.get('location', {})
return loc.get('latitude') is not None and loc.get('longitude') is not None
def get_city_code_from_ghcid(data: dict) -> Optional[str]:
"""Extract city code from GHCID."""
ghcid = data.get('ghcid', {}).get('ghcid_current', '')
if ghcid and len(ghcid.split('-')) >= 3:
parts = ghcid.split('-')
return parts[2] # Country-Region-City-Type-Abbrev
return None
def geocode_file(filepath: Path, geonames: GeoNamesLookup, dry_run: bool = False) -> dict:
"""Geocode a single CZ file."""
result = {
'success': False,
'geocoded': False,
'already_has_coords': False,
'city_code': None,
'error': None
}
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = yaml.load(f)
if not isinstance(data, dict):
result['error'] = "Invalid YAML"
return result
if has_coordinates(data):
result['success'] = True
result['already_has_coords'] = True
return result
# Get city code from GHCID
city_code = get_city_code_from_ghcid(data)
result['city_code'] = city_code
if not city_code or city_code == 'XXX':
result['error'] = f"No valid city code: {city_code}"
result['success'] = True
return result
# Look up city names for this code
city_names = CZ_CITY_CODES.get(city_code)
if not city_names:
result['error'] = f"Unknown city code: {city_code}"
result['success'] = True
return result
# Look up in GeoNames
geo_result = geonames.lookup_city(city_names, "CZ")
if not geo_result:
result['error'] = f"City not found: {city_names}"
result['success'] = True
return result
# Update location
if 'location' not in data:
data['location'] = {}
data['location']['latitude'] = geo_result['latitude']
data['location']['longitude'] = geo_result['longitude']
data['location']['city'] = geo_result['geonames_name']
data['location']['country'] = 'CZ'
data['location']['geonames_id'] = geo_result['geonames_id']
data['location']['geonames_name'] = geo_result['geonames_name']
data['location']['feature_code'] = geo_result['feature_code']
data['location']['coordinate_provenance'] = {
'source_type': 'GEONAMES_GHCID_CITY_CODE',
'source_path': 'data/reference/geonames.db',
'entity_id': geo_result['geonames_id'],
'city_code': city_code,
'original_timestamp': datetime.now(timezone.utc).isoformat()
}
data['location']['normalization_timestamp'] = datetime.now(timezone.utc).isoformat()
if not dry_run:
with open(filepath, 'w', encoding='utf-8') as f:
yaml.dump(data, f)
result['success'] = True
result['geocoded'] = True
return result
except Exception as e:
result['error'] = str(e)
return result
def main():
parser = argparse.ArgumentParser(description="Geocode Czech files using GHCID city codes")
parser.add_argument('--dry-run', action='store_true', help="Preview without writing")
parser.add_argument('--limit', type=int, default=0, help="Limit files")
parser.add_argument('--all', action='store_true', help="Process all")
parser.add_argument('--verbose', action='store_true', help="Verbose output")
parser.add_argument('--file-list', type=str, help="File containing list of files")
args = parser.parse_args()
if args.dry_run:
print("DRY RUN - No files will be modified\n")
if not GEONAMES_DB.exists():
print(f"Error: GeoNames database not found at {GEONAMES_DB}")
return 1
geonames = GeoNamesLookup(GEONAMES_DB)
# Get files to process
if args.file_list:
with open(args.file_list, 'r') as f:
files_to_process = [CUSTODIAN_DIR / line.strip() for line in f if line.strip()]
print(f"Loaded {len(files_to_process)} files from {args.file_list}")
else:
all_files = sorted(CUSTODIAN_DIR.glob("CZ-*.yaml"))
print(f"Total CZ files: {len(all_files)}")
files_to_process = []
for fp in all_files:
try:
with open(fp, 'r', encoding='utf-8') as f:
data = yaml.load(f)
if isinstance(data, dict) and not has_coordinates(data):
files_to_process.append(fp)
except:
pass
print(f"CZ files missing coordinates: {len(files_to_process)}")
if args.limit and not args.all:
files_to_process = files_to_process[:args.limit]
print(f"Limited to first {args.limit} files")
# Statistics
stats = {'total': len(files_to_process), 'geocoded': 0, 'not_found': 0, 'no_code': 0, 'errors': 0}
not_found = []
for i, filepath in enumerate(files_to_process):
result = geocode_file(filepath, geonames, dry_run=args.dry_run)
if result['geocoded']:
stats['geocoded'] += 1
if args.verbose:
print(f"{filepath.name}: {result['city_code']}")
elif result.get('error') and 'Unknown city code' in result['error']:
stats['not_found'] += 1
if len(not_found) < 30 and result['city_code'] not in [x[1] for x in not_found]:
not_found.append((filepath.name, result['city_code']))
elif result.get('error') and 'No valid city code' in result['error']:
stats['no_code'] += 1
elif result.get('error'):
stats['errors'] += 1
if not args.verbose and (i + 1) % 200 == 0:
print(f"Processed {i+1}/{len(files_to_process)}... (geocoded: {stats['geocoded']})")
# Summary
print("\n" + "=" * 60)
print("CZECH GEOCODING SUMMARY")
print("=" * 60)
print(f"Total processed: {stats['total']}")
print(f"Successfully geocoded: {stats['geocoded']}")
print(f"Unknown city codes: {stats['not_found']}")
print(f"No city code: {stats['no_code']}")
print(f"Errors: {stats['errors']}")
if not_found:
print(f"\nUnknown city codes (need to add to mapping):")
for fname, code in not_found[:20]:
print(f" {code}: {fname}")
if args.dry_run:
print("\n(DRY RUN - No files were modified)")
geonames.close()
return 0
if __name__ == "__main__":
exit(main())