- JP: Handle Gun/Cho/Machi/Mura compound city names (2615 files) - CZ: Map city codes to GeoNames entries (667 files) - Overall coverage: 84.5% → 96.4%
321 lines
11 KiB
Python
321 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Geocode Czech Files Using GHCID City Codes
|
|
|
|
Many Czech custodian files have empty locations but the city code is in the GHCID.
|
|
This script uses the GHCID city code to look up coordinates in GeoNames.
|
|
|
|
Czech city codes mapping:
|
|
- PRA = Praha (Prague)
|
|
- BRN = Brno
|
|
- OST = Ostrava
|
|
- PLZ = Plzen
|
|
- etc.
|
|
|
|
Usage:
|
|
python scripts/geocode_cz_from_ghcid.py --dry-run
|
|
python scripts/geocode_cz_from_ghcid.py --all
|
|
"""
|
|
|
|
import argparse
|
|
import sqlite3
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from ruamel.yaml import YAML
|
|
|
|
# Setup ruamel.yaml
|
|
yaml = YAML()
|
|
yaml.preserve_quotes = True
|
|
yaml.width = 120
|
|
|
|
# Configuration
|
|
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
|
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
|
|
|
|
# Czech city code to GeoNames name mapping
|
|
CZ_CITY_CODES = {
|
|
'PRA': ['Prague', 'Praha'],
|
|
'BRN': ['Brno'],
|
|
'OST': ['Ostrava'],
|
|
'PLZ': ['Plzen', 'Pilsen'],
|
|
'LIB': ['Liberec'],
|
|
'OLO': ['Olomouc'],
|
|
'UST': ['Usti nad Labem'],
|
|
'HRA': ['Hradec Kralove'],
|
|
'PAR': ['Pardubice'],
|
|
'ZLI': ['Zlin'],
|
|
'HAV': ['Havirov'],
|
|
'KLA': ['Kladno'],
|
|
'MOS': ['Most'],
|
|
'KAR': ['Karlovy Vary'],
|
|
'FRY': ['Frydek-Mistek'],
|
|
'OPA': ['Opava'],
|
|
'DEC': ['Decin'],
|
|
'TEP': ['Teplice'],
|
|
'JIH': ['Jihlava'],
|
|
'CEL': ['Ceske Budejovice'],
|
|
'CHO': ['Cheb'],
|
|
'PRE': ['Prerov'],
|
|
'PRO': ['Prostejov'],
|
|
'KRO': ['Kromeriz'],
|
|
'VYS': ['Vyskov'],
|
|
'HOR': ['Horice'],
|
|
'TAB': ['Tabor'],
|
|
'PIS': ['Pisek'],
|
|
'STR': ['Strakonice'],
|
|
'KUT': ['Kutna Hora'],
|
|
'TEL': ['Telc'],
|
|
'TRE': ['Trebic'],
|
|
'ZNO': ['Znojmo'],
|
|
# Additional cities from errors
|
|
'MBM': ['Mlada Boleslav'],
|
|
'MLB': ['Mlada Boleslav'],
|
|
'PRI': ['Pribram'],
|
|
'CBU': ['Ceske Budejovice'],
|
|
'MVO': ['Mladovozice', 'Mlada Vozice'],
|
|
'TRO': ['Trnova', 'Tabor'],
|
|
'CLE': ['Ceska Lipa'],
|
|
'KVL': ['Karlovy Vary'],
|
|
'CRA': ['Cerna Hora', 'Hradec Kralove'],
|
|
'LNP': ['Liberec'],
|
|
'NMS': ['Nove Mesto nad Metuji'],
|
|
'SPR': ['Spindleruv Mlyn', 'Hradec Kralove'],
|
|
'ZBR': ['Zbiroh'],
|
|
'BEL': ['Bela pod Bezdezem'],
|
|
'DKR': ['Dvur Kralove'],
|
|
'HKR': ['Hradec Kralove'],
|
|
'KUK': ['Kutna Hora'],
|
|
'TRU': ['Trutnov'],
|
|
'MTR': ['Moravska Trebova'],
|
|
'SKU': ['Skutec'],
|
|
# Add more as needed
|
|
}
|
|
|
|
|
|
class GeoNamesLookup:
|
|
"""GeoNames database lookup."""
|
|
|
|
def __init__(self, db_path: Path):
|
|
self.conn = sqlite3.connect(db_path)
|
|
self.conn.row_factory = sqlite3.Row
|
|
|
|
def lookup_city(self, names: list[str], country_code: str = "CZ") -> Optional[dict]:
|
|
"""Look up city by list of possible names."""
|
|
for name in names:
|
|
cursor = self.conn.execute("""
|
|
SELECT geonames_id, name, ascii_name, latitude, longitude,
|
|
admin1_code, admin1_name, feature_code, population
|
|
FROM cities
|
|
WHERE country_code = ?
|
|
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
""", (country_code, name, name))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'geonames_id': row['geonames_id'],
|
|
'geonames_name': row['name'],
|
|
'latitude': row['latitude'],
|
|
'longitude': row['longitude'],
|
|
'admin1_code': row['admin1_code'],
|
|
'feature_code': row['feature_code'],
|
|
'population': row['population'],
|
|
'matched_name': name
|
|
}
|
|
return None
|
|
|
|
def close(self):
|
|
self.conn.close()
|
|
|
|
|
|
def has_coordinates(data: dict) -> bool:
|
|
"""Check if file already has coordinates."""
|
|
loc = data.get('location', {})
|
|
return loc.get('latitude') is not None and loc.get('longitude') is not None
|
|
|
|
|
|
def get_city_code_from_ghcid(data: dict) -> Optional[str]:
|
|
"""Extract city code from GHCID."""
|
|
ghcid = data.get('ghcid', {}).get('ghcid_current', '')
|
|
if ghcid and len(ghcid.split('-')) >= 3:
|
|
parts = ghcid.split('-')
|
|
return parts[2] # Country-Region-City-Type-Abbrev
|
|
return None
|
|
|
|
|
|
def geocode_file(filepath: Path, geonames: GeoNamesLookup, dry_run: bool = False) -> dict:
|
|
"""Geocode a single CZ file."""
|
|
result = {
|
|
'success': False,
|
|
'geocoded': False,
|
|
'already_has_coords': False,
|
|
'city_code': None,
|
|
'error': None
|
|
}
|
|
|
|
try:
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.load(f)
|
|
|
|
if not isinstance(data, dict):
|
|
result['error'] = "Invalid YAML"
|
|
return result
|
|
|
|
if has_coordinates(data):
|
|
result['success'] = True
|
|
result['already_has_coords'] = True
|
|
return result
|
|
|
|
# Get city code from GHCID
|
|
city_code = get_city_code_from_ghcid(data)
|
|
result['city_code'] = city_code
|
|
|
|
if not city_code or city_code == 'XXX':
|
|
result['error'] = f"No valid city code: {city_code}"
|
|
result['success'] = True
|
|
return result
|
|
|
|
# Look up city names for this code
|
|
city_names = CZ_CITY_CODES.get(city_code)
|
|
if not city_names:
|
|
result['error'] = f"Unknown city code: {city_code}"
|
|
result['success'] = True
|
|
return result
|
|
|
|
# Look up in GeoNames
|
|
geo_result = geonames.lookup_city(city_names, "CZ")
|
|
|
|
if not geo_result:
|
|
result['error'] = f"City not found: {city_names}"
|
|
result['success'] = True
|
|
return result
|
|
|
|
# Update location
|
|
if 'location' not in data:
|
|
data['location'] = {}
|
|
|
|
data['location']['latitude'] = geo_result['latitude']
|
|
data['location']['longitude'] = geo_result['longitude']
|
|
data['location']['city'] = geo_result['geonames_name']
|
|
data['location']['country'] = 'CZ'
|
|
data['location']['geonames_id'] = geo_result['geonames_id']
|
|
data['location']['geonames_name'] = geo_result['geonames_name']
|
|
data['location']['feature_code'] = geo_result['feature_code']
|
|
data['location']['coordinate_provenance'] = {
|
|
'source_type': 'GEONAMES_GHCID_CITY_CODE',
|
|
'source_path': 'data/reference/geonames.db',
|
|
'entity_id': geo_result['geonames_id'],
|
|
'city_code': city_code,
|
|
'original_timestamp': datetime.now(timezone.utc).isoformat()
|
|
}
|
|
data['location']['normalization_timestamp'] = datetime.now(timezone.utc).isoformat()
|
|
|
|
if not dry_run:
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f)
|
|
|
|
result['success'] = True
|
|
result['geocoded'] = True
|
|
return result
|
|
|
|
except Exception as e:
|
|
result['error'] = str(e)
|
|
return result
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Geocode Czech files using GHCID city codes")
|
|
parser.add_argument('--dry-run', action='store_true', help="Preview without writing")
|
|
parser.add_argument('--limit', type=int, default=0, help="Limit files")
|
|
parser.add_argument('--all', action='store_true', help="Process all")
|
|
parser.add_argument('--verbose', action='store_true', help="Verbose output")
|
|
parser.add_argument('--file-list', type=str, help="File containing list of files")
|
|
|
|
args = parser.parse_args()
|
|
|
|
if args.dry_run:
|
|
print("DRY RUN - No files will be modified\n")
|
|
|
|
if not GEONAMES_DB.exists():
|
|
print(f"Error: GeoNames database not found at {GEONAMES_DB}")
|
|
return 1
|
|
|
|
geonames = GeoNamesLookup(GEONAMES_DB)
|
|
|
|
# Get files to process
|
|
if args.file_list:
|
|
with open(args.file_list, 'r') as f:
|
|
files_to_process = [CUSTODIAN_DIR / line.strip() for line in f if line.strip()]
|
|
print(f"Loaded {len(files_to_process)} files from {args.file_list}")
|
|
else:
|
|
all_files = sorted(CUSTODIAN_DIR.glob("CZ-*.yaml"))
|
|
print(f"Total CZ files: {len(all_files)}")
|
|
|
|
files_to_process = []
|
|
for fp in all_files:
|
|
try:
|
|
with open(fp, 'r', encoding='utf-8') as f:
|
|
data = yaml.load(f)
|
|
if isinstance(data, dict) and not has_coordinates(data):
|
|
files_to_process.append(fp)
|
|
except:
|
|
pass
|
|
|
|
print(f"CZ files missing coordinates: {len(files_to_process)}")
|
|
|
|
if args.limit and not args.all:
|
|
files_to_process = files_to_process[:args.limit]
|
|
print(f"Limited to first {args.limit} files")
|
|
|
|
# Statistics
|
|
stats = {'total': len(files_to_process), 'geocoded': 0, 'not_found': 0, 'no_code': 0, 'errors': 0}
|
|
not_found = []
|
|
|
|
for i, filepath in enumerate(files_to_process):
|
|
result = geocode_file(filepath, geonames, dry_run=args.dry_run)
|
|
|
|
if result['geocoded']:
|
|
stats['geocoded'] += 1
|
|
if args.verbose:
|
|
print(f"✅ {filepath.name}: {result['city_code']}")
|
|
elif result.get('error') and 'Unknown city code' in result['error']:
|
|
stats['not_found'] += 1
|
|
if len(not_found) < 30 and result['city_code'] not in [x[1] for x in not_found]:
|
|
not_found.append((filepath.name, result['city_code']))
|
|
elif result.get('error') and 'No valid city code' in result['error']:
|
|
stats['no_code'] += 1
|
|
elif result.get('error'):
|
|
stats['errors'] += 1
|
|
|
|
if not args.verbose and (i + 1) % 200 == 0:
|
|
print(f"Processed {i+1}/{len(files_to_process)}... (geocoded: {stats['geocoded']})")
|
|
|
|
# Summary
|
|
print("\n" + "=" * 60)
|
|
print("CZECH GEOCODING SUMMARY")
|
|
print("=" * 60)
|
|
print(f"Total processed: {stats['total']}")
|
|
print(f"Successfully geocoded: {stats['geocoded']}")
|
|
print(f"Unknown city codes: {stats['not_found']}")
|
|
print(f"No city code: {stats['no_code']}")
|
|
print(f"Errors: {stats['errors']}")
|
|
|
|
if not_found:
|
|
print(f"\nUnknown city codes (need to add to mapping):")
|
|
for fname, code in not_found[:20]:
|
|
print(f" {code}: {fname}")
|
|
|
|
if args.dry_run:
|
|
print("\n(DRY RUN - No files were modified)")
|
|
|
|
geonames.close()
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
exit(main())
|