191 lines
5.9 KiB
Python
Executable file
191 lines
5.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Geocode the remaining 26 custodian files from their GHCID city codes, using a manual coordinate table and a local GeoNames database.
|
|
"""
|
|
|
|
import yaml
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Custodian records to geocode. Every record lives at
# data/custodian/<GHCID>.yaml, so only the GHCID stems are spelled out here.
FILES = [
    f"data/custodian/{ghcid}.yaml"
    for ghcid in (
        "CZ-80-BOH-A-ASZG",
        "CZ-63-KNL-M-MVMKNL",
        "CZ-52-DKR-M-MMDKNL",
        "BE-VLG-ONZ-L-SUIS",
        "AR-C-BUE-A-APDH",
        "CZ-71-VLO-M-RPVLSMP",
        "CZ-64-BLA-M-MBP",
        "CZ-63-PEL-M-MVPPO",
        "CZ-80-BRU-M-MVBPO",
        "CZ-71-JES-M-VMJPO",
        "CZ-80-BNB-M-MMBVBNB",
        "CZ-72-ROZ-M-VMVPNKP",
        "CZ-64-STZ-A-MSVS",
        "LB-BA-BEI-A-NA",
        "ML-6-TIM-L-MHCL",
        "CZ-80-KOP-M-RMKPSTM",
        "CZ-63-PRB-A-KZMPPO",
        "CZ-63-POL-M-MMPPO",
        "BE-VLG-OST-L-VIZV",
        "CZ-80-RYM-M-MMVRPO",
        "BE-VLG-ANT-A-MPMP",
        "BE-VLG-ANT-A-UABWIUC",
        "CZ-53-UNO-A-AUMAVESUNO",
        "CZ-63-ZNS-A-AUMAVESZNS",
        "BE-BRU-WOL-L-LL",
        "CZ-72-UBR-M-MJAKVUB",
    )
]
|
|
|
|
# Hand-entered coordinates keyed by (country code, GHCID city code).
# Each value has the shape ((latitude, longitude), display name).
# Czech codes are not listed here; they are resolved via GeoNames.
CITY_COORDS = {
    (country, city): ((lat, lon), label)
    for country, city, lat, lon, label in (
        # Argentina
        ("AR", "BUE", -34.6037, -58.3816, "Buenos Aires"),
        # Lebanon
        ("LB", "BEI", 33.8938, 35.5018, "Beirut"),
        # Mali
        ("ML", "TIM", 16.7666, -3.0026, "Timbuktu"),
        # Belgium
        ("BE", "ONZ", 50.9667, 3.8167, "Onze-Lieve-Vrouw-Waver"),  # Approximation
        ("BE", "OST", 51.2194, 2.9264, "Ostend"),
        ("BE", "ANT", 51.2194, 4.4025, "Antwerp"),
        ("BE", "WOL", 50.8503, 4.3517, "Woluwe-Saint-Lambert (Brussels)"),
    )
}
|
|
|
|
# GHCID city code -> Czech city name handed to the GeoNames lookup.
# Entries marked "unclear" are unverified guesses carried over as-is.
CZ_CITY_MAP = dict(
    BOH="Bohumín",
    KNL="Kralovice",  # unclear - still to be researched
    DKR="Dvůr Králové nad Labem",
    VLO="Valašské Meziříčí",  # VLO region
    BLA="Blansko",
    PEL="Pelhřimov",
    BRU="Bruntál",
    JES="Jeseník",
    BNB="Bohumín",  # BNB unclear - check the custodian file
    ROZ="Rožnov pod Radhoštěm",
    STZ="Strakonice",  # STZ unclear
    KOP="Kopřivnice",
    PRB="Příbram",
    POL="Polička",
    RYM="Rýmařov",
    UNO="Ústí nad Orlicí",
    ZNS="Znojmo",  # ZNS unclear
    UBR="Uherský Brod",
)
|
|
|
|
|
|
def get_geonames_coords(city_name: str, country_code: str) -> tuple | None:
|
|
"""Query GeoNames database for city coordinates."""
|
|
db_path = Path("data/reference/geonames.db")
|
|
if not db_path.exists():
|
|
print(f" GeoNames DB not found at {db_path}")
|
|
return None
|
|
|
|
conn = sqlite3.connect(db_path)
|
|
cursor = conn.cursor()
|
|
|
|
# Try exact match first
|
|
cursor.execute("""
|
|
SELECT latitude, longitude, name
|
|
FROM cities
|
|
WHERE country_code = ?
|
|
AND (name LIKE ? OR ascii_name LIKE ?)
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC')
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
""", (country_code, f"%{city_name}%", f"%{city_name}%"))
|
|
|
|
row = cursor.fetchone()
|
|
conn.close()
|
|
|
|
if row:
|
|
return ((row[0], row[1]), row[2])
|
|
return None
|
|
|
|
|
|
def process_file(filepath: str) -> bool:
    """Add coordinates to a single custodian file.

    The file is parsed as YAML, the country and city codes are taken from the
    first and third dash-separated parts of ``ghcid.ghcid_current``, and the
    coordinates come from ``CITY_COORDS`` or, for Czech codes listed in
    ``CZ_CITY_MAP``, from ``get_geonames_coords``. On success the
    ``location`` mapping gains ``latitude``, ``longitude``, ``city``,
    ``geocoding_method`` and ``geocoding_timestamp`` (UTC, ISO 8601) and the
    file is rewritten in place.

    Args:
        filepath: Path of the custodian YAML file.

    Returns:
        ``True`` if the file was updated; ``False`` if it was skipped (file
        missing, not a YAML mapping, already geocoded, malformed GHCID,
        unknown city code, or no coordinates found).
    """
    path = Path(filepath)
    if not path.exists():
        print(f" File not found: {filepath}")
        return False

    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty document and can yield a list or
    # scalar for unexpected content; only a mapping can be updated.
    if not isinstance(data, dict):
        print(f" Unexpected YAML structure: {filepath}")
        return False

    # A key that is present but null (e.g. "location:") is treated like a
    # missing key instead of crashing on None.
    location = data.get('location') or {}

    # Check if already has coordinates
    if location.get('latitude'):
        print(f" Already has coordinates: {filepath}")
        return False

    # Extract country and city code from GHCID
    ghcid = (data.get('ghcid') or {}).get('ghcid_current') or ''
    parts = ghcid.split('-')
    if len(parts) < 4:
        print(f" Invalid GHCID format: {ghcid}")
        return False

    country_code = parts[0]
    city_code = parts[2]

    coords = None
    city_name = None
    method = "CITY_CODE_LOOKUP"

    # Try manual mapping first
    key = (country_code, city_code)
    if key in CITY_COORDS:
        coords, city_name = CITY_COORDS[key]
    elif country_code == "CZ" and city_code in CZ_CITY_MAP:
        # Query GeoNames for Czech cities
        cz_city = CZ_CITY_MAP[city_code]
        result = get_geonames_coords(cz_city, "CZ")
        if not result:
            print(f" No GeoNames match for: {cz_city}")
            return False
        coords, city_name = result
        method = "GEONAMES_LOOKUP"
    else:
        print(f" Unknown city code: {country_code}-{city_code}")
        return False

    # A row with NULL latitude/longitude would otherwise be written out and
    # then break the formatted summary below.
    if not coords or None in coords:
        print(f" No coordinates found for {filepath}")
        return False

    # Update location (re-attach it in case it was missing or null)
    data['location'] = location
    location['latitude'] = coords[0]
    location['longitude'] = coords[1]
    location['city'] = city_name
    location['geocoding_method'] = method
    location['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()

    # Write back
    with open(path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f" ✓ {filepath}: {city_name} ({coords[0]:.4f}, {coords[1]:.4f})")
    return True
|
|
|
|
|
|
def main():
    """Run process_file over every path in FILES and print the update count."""
    total = len(FILES)
    print(f"Geocoding {total} remaining files...")

    # Files are processed strictly in list order; each outcome is a bool.
    outcomes = [process_file(filepath) for filepath in FILES]
    updated = sum(1 for succeeded in outcomes if succeeded)

    print(f"\nUpdated {updated}/{total} files")
|
|
|
|
|
|
# Run the geocoding pass only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|