- BG: Add lat/lon from existing GeoNames IDs (28 files) - EG: Map city codes to GeoNames (CAI→Cairo, ALX→Alexandria, etc.) (28 files) - Fix malformed EG-IS-\`A\`-O-SCA.yaml → EG-IS-ISM-O-SCA.yaml - Overall coverage: 96.4% → 96.6%
190 lines
5.8 KiB
Python
Executable file
190 lines
5.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Add lat/lon coordinates to Egyptian custodian files based on city codes.

City code mapping for Egypt:
- CAI = Cairo
- ALX = Alexandria
- ASS = Assiut
- NIL = Nile (various locations; Cairo is used as a proxy)
- GIZ = Giza
- LUX = Luxor
- ASW = Aswan
- POR = Port Said
- SUE = Suez
- MAN = Mansoura
- TAN = Tanta
- ISM = Ismailia
|
|
"""
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from ruamel.yaml import YAML
|
|
|
|
# Locations are relative to the working directory the script is started from.
GEONAMES_DB = Path("data") / "reference" / "geonames.db"
CUSTODIAN_DIR = Path("data") / "custodian"

# Shared ruamel.yaml instance used for both loading and dumping custodian files.
yaml = YAML()
# Keep the quoting style found in the input when the document is dumped again.
yaml.preserve_quotes = True
# Very wide line limit; presumably chosen so long values are not re-wrapped
# on dump -- confirm against the ruamel.yaml documentation.
yaml.width = 4096
|
|
|
|
# Three-letter Egyptian city code -> city name used for the GeoNames lookup.
# NOTE(review): the lookup compares these names for exact equality with the
# GeoNames ``name`` / ``ascii_name`` columns; confirm each spelling exists in
# the reference database (e.g. "Assiut" vs. "Asyut").
EG_CITY_MAPPING: dict[str, str] = {
    "CAI": "Cairo",
    "ALX": "Alexandria",
    "ASS": "Assiut",  # Assiut city
    "NIL": "Cairo",  # Nile-related entries fall back to Cairo
    "GIZ": "Giza",
    "LUX": "Luxor",
    "ASW": "Aswan",
    "POR": "Port Said",
    "SUE": "Suez",
    "MAN": "Mansoura",
    "TAN": "Tanta",
    "ISM": "Ismailia",
}
|
|
|
|
|
|
def get_coords_for_city(conn: sqlite3.Connection, city_name: str, country_code: str = 'EG') -> tuple[float, float, int] | None:
|
|
"""Get lat/lon and geonames_id for a city."""
|
|
cursor = conn.execute(
|
|
"""SELECT latitude, longitude, geonames_id
|
|
FROM cities
|
|
WHERE country_code = ?
|
|
AND (name = ? OR ascii_name = ?)
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLC')
|
|
ORDER BY population DESC
|
|
LIMIT 1""",
|
|
(country_code, city_name, city_name)
|
|
)
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return row[0], row[1], row[2]
|
|
return None
|
|
|
|
|
|
def process_file(filepath: Path, conn: sqlite3.Connection) -> bool:
    """Geocode one custodian YAML file in place from its Egyptian city code.

    The city code is taken from ``ghcid.location_resolution.city_code`` or,
    failing that, from the third dash-separated segment of
    ``ghcid.ghcid_current`` when that identifier starts with ``EG``.  The code
    is translated through ``EG_CITY_MAPPING`` and resolved with
    ``get_coords_for_city``.  On success the ``location`` block receives
    ``city``, ``latitude``, ``longitude``, ``geonames_id``,
    ``geocoding_timestamp`` (UTC, ISO 8601) and ``geocoding_method``, and the
    file is rewritten.

    Keys that are present but null (``location:``, ``ghcid:``,
    ``location_resolution:``) are treated the same as missing keys.

    Args:
        filepath: Path of the custodian YAML file to update.
        conn: Open connection to the GeoNames SQLite database.

    Returns:
        True if the file was rewritten with coordinates.  False if the
        document is empty, already has both coordinates, or the city could
        not be resolved (a diagnostic line is printed in that last case).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    if not data:
        return False

    # ``dict.get(key, {})`` only falls back when the key is absent; a key with
    # a null value yields None, so normalise with ``or {}`` instead.
    location = data.get('location') or {}
    if location.get('latitude') and location.get('longitude'):
        return False

    # Preferred source: the explicit city code recorded during GHCID resolution.
    ghcid = data.get('ghcid') or {}
    loc_res = ghcid.get('location_resolution') or {}
    city_code = loc_res.get('city_code')

    # Fallback: parse the identifier itself (e.g., EG-C-CAI-L-...).
    if not city_code and ghcid.get('ghcid_current'):
        parts = ghcid['ghcid_current'].split('-')
        if len(parts) >= 3 and parts[0] == 'EG':
            city_code = parts[2]

    if not city_code:
        print(f" No city code found: {filepath.name}")
        return False

    city_name = EG_CITY_MAPPING.get(city_code)
    if not city_name:
        print(f" Unknown city code {city_code}: {filepath.name}")
        return False

    result = get_coords_for_city(conn, city_name)
    if not result:
        print(f" City not found in GeoNames: {city_name} ({city_code}): {filepath.name}")
        return False

    lat, lon, geonames_id = result

    # Create the block when it is missing *or* null; a null value cannot take
    # item assignment.
    if data.get('location') is None:
        data['location'] = {}

    target = data['location']
    target['city'] = city_name
    target['latitude'] = lat
    target['longitude'] = lon
    target['geonames_id'] = geonames_id
    target['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
    target['geocoding_method'] = 'EG_CITY_CODE_LOOKUP'

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f)

    return True
|
|
|
|
|
|
def main():
    """Geocode all ``EG-*.yaml`` custodian files and print a summary.

    Command-line options:
        --dry-run  Report which files would be updated without writing them.

    Files whose ``location`` block already has both ``latitude`` and
    ``longitude`` are counted as skipped (the same condition ``process_file``
    uses).  Any exception raised while handling a single file is reported and
    processing continues with the next file.

    Raises:
        SystemExit: If the GeoNames database file does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Geocode Egyptian institutions by city code')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()

    # sqlite3.connect() creates an empty database when the path does not
    # exist, which would make every lookup fail with "no such table".
    if not GEONAMES_DB.is_file():
        raise SystemExit(f"GeoNames database not found: {GEONAMES_DB}")

    updated = 0
    skipped = 0

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        # Sorted so the report is in a stable order from run to run.
        files = sorted(CUSTODIAN_DIR.glob("EG-*.yaml"))
        print(f"Found {len(files)} EG files")

        for filepath in files:
            if not filepath.is_file():
                continue

            # Per-file boundary: one bad file must not abort the whole batch.
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = yaml.load(f)

                if not data:
                    continue

                # ``or {}`` also covers keys that are present but null.
                location = data.get('location') or {}
                if location.get('latitude') and location.get('longitude'):
                    skipped += 1
                    continue

                if args.dry_run:
                    # Mirror process_file's city-code resolution without writing.
                    ghcid = data.get('ghcid') or {}
                    city_code = (ghcid.get('location_resolution') or {}).get('city_code')
                    if not city_code and ghcid.get('ghcid_current'):
                        parts = ghcid['ghcid_current'].split('-')
                        if len(parts) >= 3 and parts[0] == 'EG':
                            city_code = parts[2]

                    if city_code and city_code in EG_CITY_MAPPING:
                        city_name = EG_CITY_MAPPING[city_code]
                        result = get_coords_for_city(conn, city_name)
                        if result:
                            print(f"Would update: {filepath.name} -> {city_name} ({result[0]}, {result[1]})")
                            updated += 1
                        else:
                            print(f" City not in GeoNames: {city_name}")
                    else:
                        print(f" Unknown/no city code: {filepath.name} ({city_code})")
                elif process_file(filepath, conn):
                    print(f"Updated: {filepath.name}")
                    updated += 1

            except Exception as e:
                print(f"Error: {filepath.name}: {e}")
    finally:
        # Release the database handle even if the run is interrupted.
        conn.close()

    print("\nSummary:")
    print(f" Updated: {updated}")
    print(f" Skipped (already has coords): {skipped}")
|
|
|
|
|
|
# Run the geocoder only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|