glam/scripts/geocode_from_city_name.py
kempersc dee7a4c7d9 geocode: add coordinates to 147 Swiss custodian files
- Improved city name normalization to handle:
  - St. Gallen / St.Gallen -> Sankt Gallen
  - Canton suffixes (Buchs SG, Brugg AG)
  - Hyphenated districts (Bernex - Genève)
  - Canton abbreviations after a slash (Ecublens/VD)
  - German prepositions (Hausen b. Brugg)
- Created scripts/geocode_from_city_name.py for unified geocoding
2025-12-09 22:38:33 +01:00

303 lines
10 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Geocode custodian files by looking up city names in GeoNames database.
Handles various city name formats and country-specific patterns.
Usage:
python scripts/geocode_from_city_name.py --country CH
python scripts/geocode_from_city_name.py --country CZ
python scripts/geocode_from_city_name.py # All countries
"""
import sqlite3
import re
from pathlib import Path
from datetime import datetime, timezone
from ruamel.yaml import YAML
# SQLite database that is opened in main() and queried (table "cities") for
# coordinates. The path is relative, so it resolves against the current
# working directory.
GEONAMES_DB = Path("data/reference/geonames.db")
# Directory that main() globs for custodian "*.yaml" files (also relative to
# the current working directory).
CUSTODIAN_DIR = Path("data/custodian")
# Single ruamel.yaml instance shared by every load and dump in this script.
yaml = YAML()
# NOTE(review): per ruamel.yaml's documentation this keeps the original
# quoting of scalars on round-trip, so rewritten files diff minimally.
yaml.preserve_quotes = True
# Very large line width; presumably chosen so long values are never wrapped
# when a file is written back.
yaml.width = 4096
def _expand_saint_abbreviation(name: str) -> list[str]:
    """Return the "Sankt"/"Saint" spellings of *name*, or [] if it has no "St."/"St-"."""
    if 'St.' not in name and 'St-' not in name:
        return []
    return [
        name.replace('St.', 'Sankt ').replace('St-', 'Sankt-'),
        name.replace('St.', 'Saint ').replace('St-', 'Saint-'),
    ]


def normalize_city_name(city: str, country: str) -> list[str]:
    """Generate candidate city names for lookup, ordered by priority.

    The first candidate is the city name itself (with whitespace collapsed);
    the following ones are progressively simplified variants produced by
    generic rules and by rules specific to ``country``.

    Args:
        city: City name as found in the custodian file.
        country: Two-letter country code; 'CH', 'CZ' and 'JP' enable extra
            country-specific rules, any other value only gets the generic ones.

    Returns:
        De-duplicated, non-empty candidate names in priority order. Every
        candidate has runs of whitespace collapsed to a single space and no
        leading/trailing whitespace.
    """
    # Normalise whitespace up front: the "$"-anchored patterns below would
    # otherwise silently fail on input with trailing or doubled spaces.
    city = re.sub(r'\s+', ' ', city).strip()
    candidates = [city]

    # Common patterns for all countries
    # Remove " - Suisse", " - France", etc.
    no_country = re.sub(
        r'\s*-\s*(Suisse|Schweiz|Switzerland|France|Deutschland|Germany)$',
        '', city, flags=re.I)
    if no_country != city:
        candidates.append(no_country)

    # Remove house numbers (common in CZ addresses)
    cleaned = re.sub(r'\s+\d+$', '', city)
    if cleaned != city:
        candidates.append(cleaned)

    # Handle Swiss city names
    if country == 'CH':
        # "St. Gallen" and "St.Gallen" -> also try "Sankt Gallen" / "Saint Gallen"
        candidates.extend(_expand_saint_abbreviation(city))

        # Remove parenthetical suffixes (Sauverny)
        no_parens = re.sub(r'\s*\([^)]*\)', '', city)
        if no_parens != city:
            candidates.append(no_parens.strip())

        # "Bernex - Genève" -> try "Bernex" and "Genève"
        if ' - ' in city:
            candidates.extend(p.strip() for p in city.split(' - '))

        # "Lausanne-Dorigny" -> try just "Lausanne"
        if '-' in city:
            candidates.extend(p.strip() for p in city.split('-'))

        # "Buchs SG", "Brugg AG" -> try just "Buchs", "Brugg"
        canton_suffix = re.match(
            r'^(.+)\s+(AG|AI|AR|BE|BL|BS|FR|GE|GL|GR|JU|LU|NE|NW|OW|SG|SH|SO|SZ|TG|TI|UR|VD|VS|ZG|ZH)$',
            city)
        if canton_suffix:
            candidates.append(canton_suffix.group(1).strip())

        # "Dättwil AG" -> "Dättwil"
        # "Villigen PSI" -> "Villigen"
        no_suffix = re.sub(r'\s+[A-Z]{2,3}$', '', city)
        if no_suffix != city:
            candidates.append(no_suffix)

        # "Hausen b. Brugg" -> "Hausen"
        no_bei = re.sub(r'\s+b\.\s+.*$', '', city)
        if no_bei != city:
            candidates.append(no_bei)

        # "Ecublens/VD" -> "Ecublens"
        if '/' in city:
            candidates.append(city.split('/')[0].strip())

        # "Sils / Segl Maria" -> "Sils", "Segl Maria"
        if ' / ' in city:
            candidates.extend(p.strip() for p in city.split(' / '))

        # "Glion sur Montreux" -> "Glion"
        no_sur = re.sub(r'\s+sur\s+.*$', '', city, flags=re.I)
        if no_sur != city:
            candidates.append(no_sur)

        # Compose the rules: a simplified variant may still carry the "St."
        # abbreviation ("St. Gallen SG" -> "St. Gallen"), so expand those too.
        # These go last so that existing candidates keep their priority.
        # The slice is a copy, which makes extending while looping safe.
        for derived in candidates[1:]:
            candidates.extend(_expand_saint_abbreviation(derived))

    # Handle Czech city names with district suffixes
    if country == 'CZ':
        # "Břasy-Stupno" -> try just "Břasy" and "Stupno"
        if '-' in city:
            candidates.extend(p.strip() for p in city.split('-'))

        # Remove district numbers like "Praha 1"
        no_district = re.sub(r'\s+\d+$', '', city)
        if no_district != city:
            candidates.append(no_district)

    # Handle Japanese compound names (fallback to just the town name)
    if country == 'JP':
        # "Waga Gun Nishiwaga Machi" -> try "Nishiwaga"
        parts = city.split()
        for i, part in enumerate(parts):
            # A municipality suffix in first position has no name before it.
            if i > 0 and part in ('Machi', 'Cho', 'Mura', 'Shi', 'Ku'):
                candidates.append(parts[i - 1])  # Just the town name
                # Also try "town + suffix"
                candidates.append(f"{parts[i - 1]} {part}")

        # Try removing "Gun" district entirely
        no_gun = re.sub(r'\w+\s+Gun\s+', '', city)
        if no_gun != city:
            candidates.append(no_gun)

    # Deduplicate while preserving order. Whitespace is normalised here for
    # every candidate (the "St." replacement can introduce a double space).
    seen: set[str] = set()
    result: list[str] = []
    for candidate in candidates:
        candidate = re.sub(r'\s+', ' ', candidate).strip()
        if candidate and candidate not in seen:
            seen.add(candidate)
            result.append(candidate)
    return result
def get_coords_for_city(conn: sqlite3.Connection, city_name: str, country_code: str) -> tuple[float, float, int, str] | None:
"""Get lat/lon, geonames_id, and matched name for a city."""
# First try exact match on name/ascii_name (fast)
cursor = conn.execute(
"""SELECT latitude, longitude, geonames_id, name
FROM cities
WHERE country_code = ?
AND (name = ? OR ascii_name = ?)
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
ORDER BY population DESC
LIMIT 1""",
(country_code, city_name, city_name)
)
row = cursor.fetchone()
if row:
return row[0], row[1], row[2], row[3]
return None
def process_file(filepath: Path, conn: sqlite3.Connection) -> bool:
    """Geocode a single custodian file in place.

    Reads ``location.city`` and ``location.country`` from the YAML file, tries
    every candidate produced by ``normalize_city_name`` against the GeoNames
    database and, on the first hit, writes latitude, longitude, GeoNames id
    and geocoding metadata back into the file's ``location`` block.

    Args:
        filepath: Path of the custodian YAML file to read and possibly rewrite.
        conn: Open connection to the GeoNames SQLite database.

    Returns:
        True if coordinates were added and the file was rewritten; False if
        the file is empty, has no usable location/city/country, already has
        coordinates, or no candidate matched.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.load(f)
    # Empty file, or a YAML document that is not a mapping (e.g. a list).
    if not data or not isinstance(data, dict):
        return False

    # `data.get('location', {})` would hand back None for an explicit
    # "location: null" and then crash on .get(); treat anything that is not a
    # mapping as "no location".
    location = data.get('location')
    if not isinstance(location, dict):
        return False

    # Check if already has coordinates
    if location.get('latitude') and location.get('longitude'):
        return False

    # Get city and country from location block
    city = location.get('city')
    country = location.get('country')
    if not city or not country:
        return False
    # The normalisation rules are regex based and need a string.
    if not isinstance(city, str):
        return False

    # Handle country as dict (from Wikidata enrichment)
    if isinstance(country, dict):
        # Extract country code from Wikidata country structure
        country_label = country.get('label', '')
        country_map = {
            'Switzerland': 'CH', 'Czech Republic': 'CZ', 'Czechia': 'CZ',
            'Japan': 'JP', 'Germany': 'DE', 'Austria': 'AT', 'Belgium': 'BE',
            'France': 'FR', 'Italy': 'IT', 'Netherlands': 'NL', 'Poland': 'PL'
        }
        country = country_map.get(country_label, None)
        if not country:
            return False
    # Anything else that is not a plain code cannot be used as a query value.
    if not isinstance(country, str):
        return False

    # Try each candidate in priority order; the first match wins.
    for candidate in normalize_city_name(city, country):
        result = get_coords_for_city(conn, candidate, country)
        if not result:
            continue
        lat, lon, geonames_id, matched_name = result

        # Update location block
        location['latitude'] = lat
        location['longitude'] = lon
        location['geonames_id'] = geonames_id
        location['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
        location['geocoding_method'] = 'CITY_NAME_LOOKUP'
        # Record the GeoNames spelling only when it differs from the file's.
        if matched_name != city:
            location['geonames_matched_name'] = matched_name

        # Write back
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f)
        return True
    return False
def main():
    """Command-line entry point: geocode every matching custodian file.

    Options:
        --country CODE  Only process files named "CODE-*.yaml".
        --dry-run       Report what would be updated without writing files.

    Prints one line per updated (or, in dry-run mode, matched/unmatched) file
    and a summary of the counters at the end.

    Raises:
        SystemExit: If the GeoNames database file does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Geocode by city name lookup')
    parser.add_argument('--country', type=str, help='Country code to process')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, after which every lookup fails with "no such table: cities".
    # Fail early with a clear message instead.
    if not GEONAMES_DB.is_file():
        raise SystemExit(f"GeoNames database not found: {GEONAMES_DB}")

    # Find files to process
    pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
    files = list(CUSTODIAN_DIR.glob(pattern))
    print(f"Found {len(files)} files matching {pattern}")

    # Wikidata country labels -> country codes. Same entries as the map used
    # when a file is actually processed, so dry-run and real runs agree.
    country_map = {
        'Switzerland': 'CH', 'Czech Republic': 'CZ', 'Czechia': 'CZ',
        'Japan': 'JP', 'Germany': 'DE', 'Austria': 'AT', 'Belgium': 'BE',
        'France': 'FR', 'Italy': 'IT', 'Netherlands': 'NL', 'Poland': 'PL'
    }

    updated = 0
    skipped = 0
    no_match = 0
    errors = 0

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        for filepath in files:
            if not filepath.is_file():
                continue
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = yaml.load(f)
                if not data:
                    continue

                # "location: null" yields None rather than the default.
                location = data.get('location') or {}
                # A file counts as geocoded only when both coordinates are set.
                if location.get('latitude') and location.get('longitude'):
                    skipped += 1
                    continue

                city = location.get('city')
                country = location.get('country')
                if not city or not country:
                    no_match += 1
                    continue

                # Handle country dict
                if isinstance(country, dict):
                    country = country_map.get(country.get('label', ''), None)
                    if not country:
                        no_match += 1
                        continue

                if args.dry_run:
                    candidates = normalize_city_name(city, country)
                    for candidate in candidates:
                        result = get_coords_for_city(conn, candidate, country)
                        if result:
                            print(f"Would update: {filepath.name} ({city}) -> {result[3]} ({result[0]:.4f}, {result[1]:.4f})")
                            updated += 1
                            break
                    else:
                        # Loop finished without a match.
                        print(f" No match: {filepath.name} ({city}, {country}) tried: {candidates[:3]}")
                        no_match += 1
                elif process_file(filepath, conn):
                    print(f"Updated: {filepath.name}")
                    updated += 1
                else:
                    no_match += 1
            except Exception as e:
                # Best effort: one broken file must not abort the whole run,
                # but it is counted so the summary adds up.
                print(f"Error: {filepath.name}: {e}")
                errors += 1
    finally:
        # Close the database even if the run is interrupted.
        conn.close()

    print("\nSummary:")
    print(f" Updated: {updated}")
    print(f" Skipped (already has coords): {skipped}")
    print(f" No match found: {no_match}")
    print(f" Errors: {errors}")


if __name__ == "__main__":
    main()