Improved city name normalization to handle:
- St. Gallen / St.Gallen -> Sankt Gallen
- Canton suffixes (Buchs SG, Brugg AG)
- Hyphenated districts (Bernex - Genève)
- Postal codes with slashes (Ecublens/VD)
- German prepositions (Hausen b. Brugg)

Created scripts/geocode_from_city_name.py for unified geocoding.
303 lines
10 KiB
Python
Executable file
303 lines
10 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Geocode custodian files by looking up city names in GeoNames database.
|
|
Handles various city name formats and country-specific patterns.
|
|
|
|
Usage:
|
|
python scripts/geocode_from_city_name.py --country CH
|
|
python scripts/geocode_from_city_name.py --country CZ
|
|
python scripts/geocode_from_city_name.py # All countries
|
|
"""
|
|
|
|
import sqlite3
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from ruamel.yaml import YAML
|
|
|
|
GEONAMES_DB = Path("data/reference/geonames.db")
|
|
CUSTODIAN_DIR = Path("data/custodian")
|
|
|
|
yaml = YAML()
|
|
yaml.preserve_quotes = True
|
|
yaml.width = 4096
|
|
|
|
|
|
def normalize_city_name(city: str, country: str) -> list[str]:
|
|
"""Generate candidate city names for lookup, ordered by priority."""
|
|
candidates = [city]
|
|
|
|
# Common patterns for all countries
|
|
# Remove " - Suisse", " - France", etc.
|
|
no_country = re.sub(r'\s*-\s*(Suisse|Schweiz|Switzerland|France|Deutschland|Germany)$', '', city, flags=re.I)
|
|
if no_country != city:
|
|
candidates.append(no_country)
|
|
|
|
# Remove house numbers (common in CZ addresses)
|
|
cleaned = re.sub(r'\s+\d+$', '', city)
|
|
if cleaned != city:
|
|
candidates.append(cleaned)
|
|
|
|
# Handle Swiss city names
|
|
if country == 'CH':
|
|
# "St. Gallen" and "St.Gallen" -> also try "Sankt Gallen"
|
|
if 'St.' in city or 'St-' in city:
|
|
candidates.append(city.replace('St.', 'Sankt ').replace('St-', 'Sankt-'))
|
|
candidates.append(city.replace('St.', 'Saint ').replace('St-', 'Saint-'))
|
|
# Remove space if it created double space
|
|
for i, c in enumerate(candidates):
|
|
candidates[i] = re.sub(r'\s+', ' ', c).strip()
|
|
|
|
# Remove parenthetical suffixes (Sauverny)
|
|
no_parens = re.sub(r'\s*\([^)]*\)', '', city)
|
|
if no_parens != city:
|
|
candidates.append(no_parens.strip())
|
|
|
|
# "Bernex - Genève" -> try "Bernex" and "Genève"
|
|
if ' - ' in city:
|
|
parts = [p.strip() for p in city.split(' - ')]
|
|
candidates.extend(parts)
|
|
|
|
# "Lausanne-Dorigny" -> try just "Lausanne"
|
|
if '-' in city:
|
|
parts = city.split('-')
|
|
candidates.extend([p.strip() for p in parts])
|
|
|
|
# "Buchs SG", "Brugg AG" -> try just "Buchs", "Brugg"
|
|
canton_suffix = re.match(r'^(.+)\s+(AG|AI|AR|BE|BL|BS|FR|GE|GL|GR|JU|LU|NE|NW|OW|SG|SH|SO|SZ|TG|TI|UR|VD|VS|ZG|ZH)$', city)
|
|
if canton_suffix:
|
|
candidates.append(canton_suffix.group(1).strip())
|
|
|
|
# "Dättwil AG" -> "Dättwil"
|
|
# "Villigen PSI" -> "Villigen"
|
|
no_suffix = re.sub(r'\s+[A-Z]{2,3}$', '', city)
|
|
if no_suffix != city:
|
|
candidates.append(no_suffix)
|
|
|
|
# "Hausen b. Brugg" -> "Hausen"
|
|
no_bei = re.sub(r'\s+b\.\s+.*$', '', city)
|
|
if no_bei != city:
|
|
candidates.append(no_bei)
|
|
|
|
# "Ecublens/VD" -> "Ecublens"
|
|
if '/' in city:
|
|
parts = city.split('/')
|
|
candidates.append(parts[0].strip())
|
|
|
|
# "Sils / Segl Maria" -> "Sils", "Segl Maria"
|
|
if ' / ' in city:
|
|
parts = [p.strip() for p in city.split(' / ')]
|
|
candidates.extend(parts)
|
|
|
|
# "Glion sur Montreux" -> "Glion"
|
|
no_sur = re.sub(r'\s+sur\s+.*$', '', city, flags=re.I)
|
|
if no_sur != city:
|
|
candidates.append(no_sur)
|
|
|
|
# Handle Czech city names with district suffixes
|
|
if country == 'CZ':
|
|
# "Břasy-Stupno" -> try just "Břasy" and "Stupno"
|
|
if '-' in city:
|
|
parts = city.split('-')
|
|
candidates.extend([p.strip() for p in parts])
|
|
# Remove district numbers like "Praha 1"
|
|
no_district = re.sub(r'\s+\d+$', '', city)
|
|
if no_district != city:
|
|
candidates.append(no_district)
|
|
|
|
# Handle Japanese compound names (fallback to just the town name)
|
|
if country == 'JP':
|
|
# "Waga Gun Nishiwaga Machi" -> try "Nishiwaga"
|
|
parts = city.split()
|
|
for i, part in enumerate(parts):
|
|
if part in ('Machi', 'Cho', 'Mura', 'Shi', 'Ku'):
|
|
if i > 0:
|
|
candidates.append(parts[i-1]) # Just the town name
|
|
# Also try "town + suffix"
|
|
candidates.append(f"{parts[i-1]} {part}")
|
|
# Try removing "Gun" district entirely
|
|
no_gun = re.sub(r'\w+\s+Gun\s+', '', city)
|
|
if no_gun != city:
|
|
candidates.append(no_gun)
|
|
|
|
# Deduplicate while preserving order
|
|
seen = set()
|
|
result = []
|
|
for c in candidates:
|
|
if c and c not in seen:
|
|
seen.add(c)
|
|
result.append(c)
|
|
|
|
return result
|
|
|
|
|
|
def get_coords_for_city(conn: sqlite3.Connection, city_name: str, country_code: str) -> tuple[float, float, int, str] | None:
|
|
"""Get lat/lon, geonames_id, and matched name for a city."""
|
|
# First try exact match on name/ascii_name (fast)
|
|
cursor = conn.execute(
|
|
"""SELECT latitude, longitude, geonames_id, name
|
|
FROM cities
|
|
WHERE country_code = ?
|
|
AND (name = ? OR ascii_name = ?)
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
ORDER BY population DESC
|
|
LIMIT 1""",
|
|
(country_code, city_name, city_name)
|
|
)
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return row[0], row[1], row[2], row[3]
|
|
return None
|
|
|
|
|
|
def process_file(filepath: Path, conn: sqlite3.Connection) -> bool:
|
|
"""Process a single custodian file. Returns True if updated."""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.load(f)
|
|
|
|
if not data:
|
|
return False
|
|
|
|
# Check if already has coordinates
|
|
location = data.get('location', {})
|
|
if location.get('latitude') and location.get('longitude'):
|
|
return False
|
|
|
|
# Get city and country from location block
|
|
city = location.get('city')
|
|
country = location.get('country')
|
|
|
|
if not city or not country:
|
|
return False
|
|
|
|
# Handle country as dict (from Wikidata enrichment)
|
|
if isinstance(country, dict):
|
|
# Extract country code from Wikidata country structure
|
|
country_label = country.get('label', '')
|
|
country_map = {
|
|
'Switzerland': 'CH', 'Czech Republic': 'CZ', 'Czechia': 'CZ',
|
|
'Japan': 'JP', 'Germany': 'DE', 'Austria': 'AT', 'Belgium': 'BE',
|
|
'France': 'FR', 'Italy': 'IT', 'Netherlands': 'NL', 'Poland': 'PL'
|
|
}
|
|
country = country_map.get(country_label, None)
|
|
if not country:
|
|
return False
|
|
|
|
# Generate candidate city names
|
|
candidates = normalize_city_name(city, country)
|
|
|
|
# Try each candidate
|
|
for candidate in candidates:
|
|
result = get_coords_for_city(conn, candidate, country)
|
|
if result:
|
|
lat, lon, geonames_id, matched_name = result
|
|
|
|
# Update location block
|
|
data['location']['latitude'] = lat
|
|
data['location']['longitude'] = lon
|
|
data['location']['geonames_id'] = geonames_id
|
|
data['location']['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
|
|
data['location']['geocoding_method'] = 'CITY_NAME_LOOKUP'
|
|
if matched_name != city:
|
|
data['location']['geonames_matched_name'] = matched_name
|
|
|
|
# Write back
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f)
|
|
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def main():
|
|
import argparse
|
|
parser = argparse.ArgumentParser(description='Geocode by city name lookup')
|
|
parser.add_argument('--country', type=str, help='Country code to process')
|
|
parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
|
|
args = parser.parse_args()
|
|
|
|
conn = sqlite3.connect(GEONAMES_DB)
|
|
|
|
# Find files to process
|
|
if args.country:
|
|
pattern = f"{args.country}-*.yaml"
|
|
else:
|
|
pattern = "*.yaml"
|
|
|
|
files = list(CUSTODIAN_DIR.glob(pattern))
|
|
print(f"Found {len(files)} files matching {pattern}")
|
|
|
|
updated = 0
|
|
skipped = 0
|
|
no_match = 0
|
|
|
|
for filepath in files:
|
|
if not filepath.is_file():
|
|
continue
|
|
|
|
try:
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.load(f)
|
|
|
|
if not data:
|
|
continue
|
|
|
|
location = data.get('location', {})
|
|
if location.get('latitude'):
|
|
skipped += 1
|
|
continue
|
|
|
|
city = location.get('city')
|
|
country = location.get('country')
|
|
|
|
if not city or not country:
|
|
no_match += 1
|
|
continue
|
|
|
|
# Handle country dict
|
|
if isinstance(country, dict):
|
|
country_label = country.get('label', '')
|
|
country_map = {
|
|
'Switzerland': 'CH', 'Czech Republic': 'CZ', 'Czechia': 'CZ',
|
|
'Japan': 'JP', 'Germany': 'DE', 'Austria': 'AT', 'Belgium': 'BE'
|
|
}
|
|
country = country_map.get(country_label, None)
|
|
if not country:
|
|
no_match += 1
|
|
continue
|
|
|
|
candidates = normalize_city_name(city, country)
|
|
|
|
if args.dry_run:
|
|
found = False
|
|
for candidate in candidates:
|
|
result = get_coords_for_city(conn, candidate, country)
|
|
if result:
|
|
print(f"Would update: {filepath.name} ({city}) -> {result[3]} ({result[0]:.4f}, {result[1]:.4f})")
|
|
updated += 1
|
|
found = True
|
|
break
|
|
if not found:
|
|
print(f" No match: {filepath.name} ({city}, {country}) tried: {candidates[:3]}")
|
|
no_match += 1
|
|
else:
|
|
if process_file(filepath, conn):
|
|
print(f"Updated: {filepath.name}")
|
|
updated += 1
|
|
else:
|
|
no_match += 1
|
|
|
|
except Exception as e:
|
|
print(f"Error: {filepath.name}: {e}")
|
|
|
|
conn.close()
|
|
|
|
print(f"\nSummary:")
|
|
print(f" Updated: {updated}")
|
|
print(f" Skipped (already has coords): {skipped}")
|
|
print(f" No match found: {no_match}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|