glam/scripts/geocode_remaining_26.py

191 lines
5.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Geocode remaining 26 custodian files using GHCID city codes and GeoNames.
"""
import yaml
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
# Files to process.
# Each entry is a path (relative to the working directory the script is run
# from) to one custodian YAML record. The file stem is the record's GHCID;
# process_file() does not parse the file name, it reads the GHCID from the
# `ghcid.ghcid_current` field inside the document.
FILES = [
    "data/custodian/CZ-80-BOH-A-ASZG.yaml",
    "data/custodian/CZ-63-KNL-M-MVMKNL.yaml",
    "data/custodian/CZ-52-DKR-M-MMDKNL.yaml",
    "data/custodian/BE-VLG-ONZ-L-SUIS.yaml",
    "data/custodian/AR-C-BUE-A-APDH.yaml",
    "data/custodian/CZ-71-VLO-M-RPVLSMP.yaml",
    "data/custodian/CZ-64-BLA-M-MBP.yaml",
    "data/custodian/CZ-63-PEL-M-MVPPO.yaml",
    "data/custodian/CZ-80-BRU-M-MVBPO.yaml",
    "data/custodian/CZ-71-JES-M-VMJPO.yaml",
    "data/custodian/CZ-80-BNB-M-MMBVBNB.yaml",
    "data/custodian/CZ-72-ROZ-M-VMVPNKP.yaml",
    "data/custodian/CZ-64-STZ-A-MSVS.yaml",
    "data/custodian/LB-BA-BEI-A-NA.yaml",
    "data/custodian/ML-6-TIM-L-MHCL.yaml",
    "data/custodian/CZ-80-KOP-M-RMKPSTM.yaml",
    "data/custodian/CZ-63-PRB-A-KZMPPO.yaml",
    "data/custodian/CZ-63-POL-M-MMPPO.yaml",
    "data/custodian/BE-VLG-OST-L-VIZV.yaml",
    "data/custodian/CZ-80-RYM-M-MMVRPO.yaml",
    "data/custodian/BE-VLG-ANT-A-MPMP.yaml",
    "data/custodian/BE-VLG-ANT-A-UABWIUC.yaml",
    "data/custodian/CZ-53-UNO-A-AUMAVESUNO.yaml",
    "data/custodian/CZ-63-ZNS-A-AUMAVESZNS.yaml",
    "data/custodian/BE-BRU-WOL-L-LL.yaml",
    "data/custodian/CZ-72-UBR-M-MJAKVUB.yaml",
]
# Manual coordinates for known cities.
# Key:   (country code, city code) -- the 1st and 3rd dash-separated segments
#        of a GHCID, as extracted in process_file().
# Value: ((latitude, longitude), city name) -- stored verbatim into the
#        record's `location` mapping.
# This table is consulted before CZ_CITY_MAP / GeoNames.
CITY_COORDS = {
    # Argentina
    ("AR", "BUE"): ((-34.6037, -58.3816), "Buenos Aires"),
    # Lebanon
    ("LB", "BEI"): ((33.8938, 35.5018), "Beirut"),
    # Mali
    ("ML", "TIM"): ((16.7666, -3.0026), "Timbuktu"),
    # Belgium
    # NOTE(review): the ONZ point looks several tens of km west of
    # Onze-Lieve-Vrouw-Waver (roughly 51.07 N, 4.58 E) -- confirm the city
    # behind "ONZ" and the coordinates before relying on this entry.
    ("BE", "ONZ"): ((50.9667, 3.8167), "Onze-Lieve-Vrouw-Waver"),  # Approximation
    # NOTE(review): OST shares its latitude with ANT below -- possibly a
    # copy/paste; verify.
    ("BE", "OST"): ((51.2194, 2.9264), "Ostend"),
    ("BE", "ANT"): ((51.2194, 4.4025), "Antwerp"),
    # NOTE(review): presumably the generic Brussels city-centre point rather
    # than the commune itself -- confirm this precision is acceptable.
    ("BE", "WOL"): ((50.8503, 4.3517), "Woluwe-Saint-Lambert (Brussels)"),
    # Czech Republic - not listed here; resolved through CZ_CITY_MAP + GeoNames
}
# Czech city codes to city names (for GeoNames lookup).
# Key:   GHCID city code (3rd dash-separated segment).
# Value: city name passed to get_geonames_coords() with country code "CZ".
#
# NOTE(review): several entries are guesses. Assuming the 2nd GHCID segment is
# the ISO 3166-2:CZ region code (TODO confirm), the region seen in FILES does
# not obviously match the mapped town for the entries flagged below. Check
# each against the custodian record before trusting the geocoded result.
CZ_CITY_MAP = {
    "BOH": "Bohumín",
    # NOTE(review): FILES has CZ-63-KNL; Kamenice nad Lipou may be the
    # intended town rather than Kralovice -- confirm.
    "KNL": "Kralovice",  # Unclear - will research
    "DKR": "Dvůr Králové nad Labem",
    # NOTE(review): FILES has CZ-71-VLO; Valašské Meziříčí may lie in a
    # different region -- confirm.
    "VLO": "Valašské Meziříčí",  # VLO region
    "BLA": "Blansko",
    "PEL": "Pelhřimov",
    "BRU": "Bruntál",
    "JES": "Jeseník",
    # NOTE(review): same town as "BOH" above, so both records would receive
    # identical coordinates.
    "BNB": "Bohumín",  # BNB unclear - check file
    "ROZ": "Rožnov pod Radhoštěm",
    # NOTE(review): FILES has CZ-64-STZ; Strážnice may be the intended town
    # rather than Strakonice -- confirm.
    "STZ": "Strakonice",  # STZ unclear
    "KOP": "Kopřivnice",
    # NOTE(review): FILES has CZ-63-PRB; Přibyslav may be the intended town
    # rather than Příbram -- confirm.
    "PRB": "Příbram",
    "POL": "Polička",
    "RYM": "Rýmařov",
    "UNO": "Ústí nad Orlicí",
    # NOTE(review): FILES has CZ-63-ZNS; Žďár nad Sázavou may be the intended
    # town rather than Znojmo -- confirm.
    "ZNS": "Znojmo",  # ZNS unclear
    "UBR": "Uherský Brod",
}
def get_geonames_coords(city_name: str, country_code: str) -> tuple | None:
"""Query GeoNames database for city coordinates."""
db_path = Path("data/reference/geonames.db")
if not db_path.exists():
print(f" GeoNames DB not found at {db_path}")
return None
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# Try exact match first
cursor.execute("""
SELECT latitude, longitude, name
FROM cities
WHERE country_code = ?
AND (name LIKE ? OR ascii_name LIKE ?)
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC')
ORDER BY population DESC
LIMIT 1
""", (country_code, f"%{city_name}%", f"%{city_name}%"))
row = cursor.fetchone()
conn.close()
if row:
return ((row[0], row[1]), row[2])
return None
def process_file(filepath: str) -> bool:
    """Add coordinates to a single custodian YAML file.

    The country and city codes are the first and third dash-separated
    segments of ``ghcid.ghcid_current``. Coordinates come from the manual
    ``CITY_COORDS`` table when the (country, city) pair is listed there;
    otherwise Czech city codes are translated through ``CZ_CITY_MAP`` and
    resolved with ``get_geonames_coords``.

    On success the record's ``location`` mapping receives ``latitude``,
    ``longitude``, ``city``, ``geocoding_method`` and ``geocoding_timestamp``
    (UTC, ISO 8601), and the file is rewritten in place.

    Args:
        filepath: Path to the custodian YAML file.

    Returns:
        ``True`` if the file was updated. ``False`` if it was skipped: file
        missing, document is not a mapping, a truthy ``location.latitude`` is
        already present, GHCID has fewer than four segments, the city code is
        unknown, or GeoNames has no match.
    """
    path = Path(filepath)
    if not path.exists():
        print(f" File not found: {filepath}")
        return False

    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty document and may yield a list or
    # scalar; none of those can carry a GHCID or a location.
    if not isinstance(data, dict):
        print(f" Unexpected YAML content (expected a mapping): {filepath}")
        return False

    # `location: null` (or any non-mapping) is treated like a missing section.
    location = data.get('location')
    if not isinstance(location, dict):
        location = {}

    # Check if already has coordinates. Truthiness is deliberate here: an
    # empty or zero latitude is treated as "not geocoded yet".
    if location.get('latitude'):
        print(f" Already has coordinates: {filepath}")
        return False

    # Extract country and city code from GHCID, tolerating a null section
    # or a null/non-string value.
    ghcid_section = data.get('ghcid')
    if not isinstance(ghcid_section, dict):
        ghcid_section = {}
    ghcid = str(ghcid_section.get('ghcid_current') or '')
    parts = ghcid.split('-')
    if len(parts) < 4:
        print(f" Invalid GHCID format: {ghcid}")
        return False
    country_code = parts[0]
    city_code = parts[2]

    # Try manual mapping first, then GeoNames for mapped Czech city codes.
    key = (country_code, city_code)
    if key in CITY_COORDS:
        coords, city_name = CITY_COORDS[key]
        method = "CITY_CODE_LOOKUP"
    elif country_code == "CZ" and city_code in CZ_CITY_MAP:
        cz_city = CZ_CITY_MAP[city_code]
        result = get_geonames_coords(cz_city, "CZ")
        if not result:
            print(f" No GeoNames match for: {cz_city}")
            return False
        coords, city_name = result
        method = "GEONAMES_LOOKUP"
    else:
        print(f" Unknown city code: {country_code}-{city_code}")
        return False

    # Update location (re-attach it in case the section was missing or null).
    data['location'] = location
    location['latitude'] = coords[0]
    location['longitude'] = coords[1]
    location['city'] = city_name
    location['geocoding_method'] = method
    location['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Write back, preserving key order and non-ASCII characters.
    with open(path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"{filepath}: {city_name} ({coords[0]:.4f}, {coords[1]:.4f})")
    return True
def main():
    """Run process_file() over every path in FILES and report the tally."""
    total = len(FILES)
    print(f"Geocoding {total} remaining files...")

    # Files are handled strictly in list order; each success counts once.
    updated = sum(1 for filepath in FILES if process_file(filepath))

    print(f"\nUpdated {updated}/{total} files")


if __name__ == "__main__":
    main()