#!/usr/bin/env python3 """ Geocode Czech Files Using GHCID City Codes Many Czech custodian files have empty locations but the city code is in the GHCID. This script uses the GHCID city code to look up coordinates in GeoNames. Czech city codes mapping: - PRA = Praha (Prague) - BRN = Brno - OST = Ostrava - PLZ = Plzen - etc. Usage: python scripts/geocode_cz_from_ghcid.py --dry-run python scripts/geocode_cz_from_ghcid.py --all """ import argparse import sqlite3 from datetime import datetime, timezone from pathlib import Path from typing import Optional from ruamel.yaml import YAML # Setup ruamel.yaml yaml = YAML() yaml.preserve_quotes = True yaml.width = 120 # Configuration CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian") GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db") # Czech city code to GeoNames name mapping CZ_CITY_CODES = { 'PRA': ['Prague', 'Praha'], 'BRN': ['Brno'], 'OST': ['Ostrava'], 'PLZ': ['Plzen', 'Pilsen'], 'LIB': ['Liberec'], 'OLO': ['Olomouc'], 'UST': ['Usti nad Labem'], 'HRA': ['Hradec Kralove'], 'PAR': ['Pardubice'], 'ZLI': ['Zlin'], 'HAV': ['Havirov'], 'KLA': ['Kladno'], 'MOS': ['Most'], 'KAR': ['Karlovy Vary'], 'FRY': ['Frydek-Mistek'], 'OPA': ['Opava'], 'DEC': ['Decin'], 'TEP': ['Teplice'], 'JIH': ['Jihlava'], 'CEL': ['Ceske Budejovice'], 'CHO': ['Cheb'], 'PRE': ['Prerov'], 'PRO': ['Prostejov'], 'KRO': ['Kromeriz'], 'VYS': ['Vyskov'], 'HOR': ['Horice'], 'TAB': ['Tabor'], 'PIS': ['Pisek'], 'STR': ['Strakonice'], 'KUT': ['Kutna Hora'], 'TEL': ['Telc'], 'TRE': ['Trebic'], 'ZNO': ['Znojmo'], # Additional cities from errors 'MBM': ['Mlada Boleslav'], 'MLB': ['Mlada Boleslav'], 'PRI': ['Pribram'], 'CBU': ['Ceske Budejovice'], 'MVO': ['Mladovozice', 'Mlada Vozice'], 'TRO': ['Trnova', 'Tabor'], 'CLE': ['Ceska Lipa'], 'KVL': ['Karlovy Vary'], 'CRA': ['Cerna Hora', 'Hradec Kralove'], 'LNP': ['Liberec'], 'NMS': ['Nove Mesto nad Metuji'], 'SPR': ['Spindleruv Mlyn', 'Hradec Kralove'], 'ZBR': ['Zbiroh'], 'BEL': ['Bela pod Bezdezem'], 'DKR': ['Dvur Kralove'], 'HKR': ['Hradec Kralove'], 'KUK': ['Kutna Hora'], 'TRU': ['Trutnov'], 'MTR': ['Moravska Trebova'], 'SKU': ['Skutec'], # Add more as needed } class GeoNamesLookup: """GeoNames database lookup.""" def __init__(self, db_path: Path): self.conn = sqlite3.connect(db_path) self.conn.row_factory = sqlite3.Row def lookup_city(self, names: list[str], country_code: str = "CZ") -> Optional[dict]: """Look up city by list of possible names.""" for name in names: cursor = self.conn.execute(""" SELECT geonames_id, name, ascii_name, latitude, longitude, admin1_code, admin1_name, feature_code, population FROM cities WHERE country_code = ? AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?)) AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') ORDER BY population DESC LIMIT 1 """, (country_code, name, name)) row = cursor.fetchone() if row: return { 'geonames_id': row['geonames_id'], 'geonames_name': row['name'], 'latitude': row['latitude'], 'longitude': row['longitude'], 'admin1_code': row['admin1_code'], 'feature_code': row['feature_code'], 'population': row['population'], 'matched_name': name } return None def close(self): self.conn.close() def has_coordinates(data: dict) -> bool: """Check if file already has coordinates.""" loc = data.get('location', {}) return loc.get('latitude') is not None and loc.get('longitude') is not None def get_city_code_from_ghcid(data: dict) -> Optional[str]: """Extract city code from GHCID.""" ghcid = data.get('ghcid', {}).get('ghcid_current', '') if ghcid and len(ghcid.split('-')) >= 3: parts = ghcid.split('-') return parts[2] # Country-Region-City-Type-Abbrev return None def geocode_file(filepath: Path, geonames: GeoNamesLookup, dry_run: bool = False) -> dict: """Geocode a single CZ file.""" result = { 'success': False, 'geocoded': False, 'already_has_coords': False, 'city_code': None, 'error': None } try: with open(filepath, 'r', encoding='utf-8') as f: data = yaml.load(f) if not isinstance(data, dict): result['error'] = "Invalid YAML" return result if has_coordinates(data): result['success'] = True result['already_has_coords'] = True return result # Get city code from GHCID city_code = get_city_code_from_ghcid(data) result['city_code'] = city_code if not city_code or city_code == 'XXX': result['error'] = f"No valid city code: {city_code}" result['success'] = True return result # Look up city names for this code city_names = CZ_CITY_CODES.get(city_code) if not city_names: result['error'] = f"Unknown city code: {city_code}" result['success'] = True return result # Look up in GeoNames geo_result = geonames.lookup_city(city_names, "CZ") if not geo_result: result['error'] = f"City not found: {city_names}" result['success'] = True return result # Update location if 'location' not in data: data['location'] = {} data['location']['latitude'] = geo_result['latitude'] data['location']['longitude'] = geo_result['longitude'] data['location']['city'] = geo_result['geonames_name'] data['location']['country'] = 'CZ' data['location']['geonames_id'] = geo_result['geonames_id'] data['location']['geonames_name'] = geo_result['geonames_name'] data['location']['feature_code'] = geo_result['feature_code'] data['location']['coordinate_provenance'] = { 'source_type': 'GEONAMES_GHCID_CITY_CODE', 'source_path': 'data/reference/geonames.db', 'entity_id': geo_result['geonames_id'], 'city_code': city_code, 'original_timestamp': datetime.now(timezone.utc).isoformat() } data['location']['normalization_timestamp'] = datetime.now(timezone.utc).isoformat() if not dry_run: with open(filepath, 'w', encoding='utf-8') as f: yaml.dump(data, f) result['success'] = True result['geocoded'] = True return result except Exception as e: result['error'] = str(e) return result def main(): parser = argparse.ArgumentParser(description="Geocode Czech files using GHCID city codes") parser.add_argument('--dry-run', action='store_true', help="Preview without writing") parser.add_argument('--limit', type=int, default=0, help="Limit files") parser.add_argument('--all', action='store_true', help="Process all") parser.add_argument('--verbose', action='store_true', help="Verbose output") parser.add_argument('--file-list', type=str, help="File containing list of files") args = parser.parse_args() if args.dry_run: print("DRY RUN - No files will be modified\n") if not GEONAMES_DB.exists(): print(f"Error: GeoNames database not found at {GEONAMES_DB}") return 1 geonames = GeoNamesLookup(GEONAMES_DB) # Get files to process if args.file_list: with open(args.file_list, 'r') as f: files_to_process = [CUSTODIAN_DIR / line.strip() for line in f if line.strip()] print(f"Loaded {len(files_to_process)} files from {args.file_list}") else: all_files = sorted(CUSTODIAN_DIR.glob("CZ-*.yaml")) print(f"Total CZ files: {len(all_files)}") files_to_process = [] for fp in all_files: try: with open(fp, 'r', encoding='utf-8') as f: data = yaml.load(f) if isinstance(data, dict) and not has_coordinates(data): files_to_process.append(fp) except: pass print(f"CZ files missing coordinates: {len(files_to_process)}") if args.limit and not args.all: files_to_process = files_to_process[:args.limit] print(f"Limited to first {args.limit} files") # Statistics stats = {'total': len(files_to_process), 'geocoded': 0, 'not_found': 0, 'no_code': 0, 'errors': 0} not_found = [] for i, filepath in enumerate(files_to_process): result = geocode_file(filepath, geonames, dry_run=args.dry_run) if result['geocoded']: stats['geocoded'] += 1 if args.verbose: print(f"✅ {filepath.name}: {result['city_code']}") elif result.get('error') and 'Unknown city code' in result['error']: stats['not_found'] += 1 if len(not_found) < 30 and result['city_code'] not in [x[1] for x in not_found]: not_found.append((filepath.name, result['city_code'])) elif result.get('error') and 'No valid city code' in result['error']: stats['no_code'] += 1 elif result.get('error'): stats['errors'] += 1 if not args.verbose and (i + 1) % 200 == 0: print(f"Processed {i+1}/{len(files_to_process)}... (geocoded: {stats['geocoded']})") # Summary print("\n" + "=" * 60) print("CZECH GEOCODING SUMMARY") print("=" * 60) print(f"Total processed: {stats['total']}") print(f"Successfully geocoded: {stats['geocoded']}") print(f"Unknown city codes: {stats['not_found']}") print(f"No city code: {stats['no_code']}") print(f"Errors: {stats['errors']}") if not_found: print(f"\nUnknown city codes (need to add to mapping):") for fname, code in not_found[:20]: print(f" {code}: {fname}") if args.dry_run: print("\n(DRY RUN - No files were modified)") geonames.close() return 0 if __name__ == "__main__": exit(main())