#!/usr/bin/env python3
"""
Geocode custodian files by looking up city names in GeoNames database.

Handles various city name formats and country-specific patterns.

Usage:
    python scripts/geocode_from_city_name.py --country CH
    python scripts/geocode_from_city_name.py --country CZ
    python scripts/geocode_from_city_name.py  # All countries
"""

import argparse
import re
import sqlite3
from collections import Counter
from contextlib import closing
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path

from ruamel.yaml import YAML

GEONAMES_DB = Path("data/reference/geonames.db")
CUSTODIAN_DIR = Path("data/custodian")

# Maps the ``label`` of a Wikidata-style country dict to the two-letter code
# used for the database lookup. Shared by every code path so the script and
# process_file() can never disagree about which countries are supported.
COUNTRY_LABEL_TO_CODE = {
    'Switzerland': 'CH',
    'Czech Republic': 'CZ',
    'Czechia': 'CZ',
    'Japan': 'JP',
    'Germany': 'DE',
    'Austria': 'AT',
    'Belgium': 'BE',
    'France': 'FR',
    'Italy': 'IT',
    'Netherlands': 'NL',
    'Poland': 'PL',
}

yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 4096


def normalize_city_name(city: str, country: str) -> list[str]:
    """Generate candidate city names for lookup, ordered by priority.

    Args:
        city: City name exactly as stored in the custodian file.
        country: Two-letter country code; enables the country-specific
            rules for 'CH', 'CZ' and 'JP'.

    Returns:
        Candidate names with the original ``city`` first, followed by
        progressively simplified variants. Empty strings and duplicates
        are removed; first-occurrence order is preserved.
    """
    candidates = [city]

    # Common patterns for all countries
    # Remove " - Suisse", " - France", etc.
    no_country = re.sub(
        r'\s*-\s*(Suisse|Schweiz|Switzerland|France|Deutschland|Germany)$',
        '',
        city,
        flags=re.I,
    )
    if no_country != city:
        candidates.append(no_country)

    # Remove a trailing number: house numbers (common in CZ addresses) and
    # district numbers such as "Praha 1".
    no_number = re.sub(r'\s+\d+$', '', city)
    if no_number != city:
        candidates.append(no_number)

    # Handle Swiss city names
    if country == 'CH':
        # "St. Gallen" and "St.Gallen" -> also try "Sankt Gallen" / "Saint Gallen"
        if 'St.' in city or 'St-' in city:
            candidates.append(city.replace('St.', 'Sankt ').replace('St-', 'Sankt-'))
            candidates.append(city.replace('St.', 'Saint ').replace('St-', 'Saint-'))
            # "St. X" becomes "Sankt  X" above; collapse the doubled space.
            candidates = [re.sub(r'\s+', ' ', c).strip() for c in candidates]

        # Remove parenthetical suffixes (Sauverny)
        no_parens = re.sub(r'\s*\([^)]*\)', '', city)
        if no_parens != city:
            candidates.append(no_parens.strip())

        # "Bernex - Genève" -> try "Bernex" and "Genève"
        if ' - ' in city:
            candidates.extend(part.strip() for part in city.split(' - '))

        # "Lausanne-Dorigny" -> try just "Lausanne" (and "Dorigny")
        if '-' in city:
            candidates.extend(part.strip() for part in city.split('-'))

        # "Buchs SG", "Brugg AG" -> try just "Buchs", "Brugg"
        canton_suffix = re.match(
            r'^(.+)\s+(AG|AI|AR|BE|BL|BS|FR|GE|GL|GR|JU|LU|NE|NW|OW|SG|SH|SO|SZ|TG|TI|UR|VD|VS|ZG|ZH)$',
            city,
        )
        if canton_suffix:
            candidates.append(canton_suffix.group(1).strip())

        # "Dättwil AG" -> "Dättwil"
        # "Villigen PSI" -> "Villigen"
        no_suffix = re.sub(r'\s+[A-Z]{2,3}$', '', city)
        if no_suffix != city:
            candidates.append(no_suffix)

        # "Hausen b. Brugg" -> "Hausen"
        no_bei = re.sub(r'\s+b\.\s+.*$', '', city)
        if no_bei != city:
            candidates.append(no_bei)

        # "Ecublens/VD" -> "Ecublens"
        if '/' in city:
            candidates.append(city.split('/')[0].strip())

        # "Sils / Segl Maria" -> "Sils", "Segl Maria"
        if ' / ' in city:
            candidates.extend(part.strip() for part in city.split(' / '))

        # "Glion sur Montreux" -> "Glion"
        no_sur = re.sub(r'\s+sur\s+.*$', '', city, flags=re.I)
        if no_sur != city:
            candidates.append(no_sur)

    # Handle Czech city names with district suffixes
    if country == 'CZ':
        # "Břasy-Stupno" -> try just "Břasy" and "Stupno"
        if '-' in city:
            candidates.extend(part.strip() for part in city.split('-'))
        # District numbers ("Praha 1") are already covered by the common
        # trailing-number rule above, so no separate CZ rule is needed.

    # Handle Japanese compound names (fallback to just the town name)
    if country == 'JP':
        # "Waga Gun Nishiwaga Machi" -> try "Nishiwaga"
        words = city.split()
        for i, word in enumerate(words):
            if i > 0 and word in ('Machi', 'Cho', 'Mura', 'Shi', 'Ku'):
                candidates.append(words[i - 1])  # Just the town name
                candidates.append(f"{words[i - 1]} {word}")  # Town + suffix

        # Try removing "Gun" district entirely
        no_gun = re.sub(r'\w+\s+Gun\s+', '', city)
        if no_gun != city:
            candidates.append(no_gun)

    # Deduplicate while preserving order (dicts keep insertion order).
    return list(dict.fromkeys(c for c in candidates if c))


def get_coords_for_city(conn: sqlite3.Connection, city_name: str, country_code: str) -> tuple[float, float, int, str] | None:
    """Get lat/lon, geonames_id, and matched name for a city.

    Performs an exact match of ``city_name`` against the ``name`` and
    ``ascii_name`` columns of the ``cities`` table, restricted to
    ``country_code`` and to the listed PPL* feature codes. When several rows
    match, the one with the largest population is returned.

    Returns:
        ``(latitude, longitude, geonames_id, name)`` or ``None`` if no row
        matches.
    """
    row = conn.execute(
        """SELECT latitude, longitude, geonames_id, name
           FROM cities
           WHERE country_code = ?
             AND (name = ? OR ascii_name = ?)
             AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
           ORDER BY population DESC
           LIMIT 1""",
        (country_code, city_name, city_name),
    ).fetchone()
    if row is None:
        return None
    return row[0], row[1], row[2], row[3]


class _Status(Enum):
    """Result of examining one custodian document."""

    EMPTY = "empty"            # file parsed to nothing
    HAS_COORDS = "has_coords"  # latitude and longitude already present
    INCOMPLETE = "incomplete"  # no usable location/city/country
    NO_MATCH = "no_match"      # candidates tried, none found in the database
    MATCHED = "matched"        # a database row was found


@dataclass(frozen=True)
class _Lookup:
    """Outcome of _lookup(); fields past ``status`` are set only when known."""

    status: _Status
    city: str | None = None
    country: str | None = None
    candidates: list[str] = field(default_factory=list)
    hit: tuple[float, float, int, str] | None = None


def _load_custodian(filepath: Path):
    """Parse a custodian YAML file with the module-level round-trip loader."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.load(f)


def _has_coordinates(location: dict) -> bool:
    """Return True if both latitude and longitude hold a value.

    Only ``None`` and the empty string count as missing, so a legitimate
    coordinate of ``0`` / ``0.0`` is not mistaken for "not geocoded".
    """
    return all(location.get(key) not in (None, '') for key in ('latitude', 'longitude'))


def _country_code(country) -> str | None:
    """Resolve the ``country`` field of a location block to a country code.

    A dict (Wikidata enrichment) is resolved through its ``label``; a
    non-empty string is taken to already be the code. Anything else, or an
    unknown label, yields ``None``.
    """
    if isinstance(country, dict):
        label = country.get('label', '')
        return COUNTRY_LABEL_TO_CODE.get(label) if isinstance(label, str) else None
    if isinstance(country, str) and country:
        return country
    return None


def _lookup(data, conn: sqlite3.Connection) -> _Lookup:
    """Decide what to do with one parsed custodian document.

    Never modifies ``data``; the caller applies a match with _write_match().
    """
    if not data:
        return _Lookup(_Status.EMPTY)

    # A missing or null ``location`` (or a non-mapping document) simply
    # means there is nothing to geocode from.
    location = data.get('location') if isinstance(data, dict) else None
    if not isinstance(location, dict):
        return _Lookup(_Status.INCOMPLETE)

    if _has_coordinates(location):
        return _Lookup(_Status.HAS_COORDS)

    city = location.get('city')
    country = _country_code(location.get('country'))
    if not isinstance(city, str) or not city or not country:
        return _Lookup(_Status.INCOMPLETE)

    candidates = normalize_city_name(city, country)
    for candidate in candidates:
        hit = get_coords_for_city(conn, candidate, country)
        if hit:
            return _Lookup(_Status.MATCHED, city, country, candidates, hit)
    return _Lookup(_Status.NO_MATCH, city, country, candidates)


def _write_match(filepath: Path, data, city: str, hit: tuple[float, float, int, str]) -> None:
    """Store the matched coordinates in ``data['location']`` and save the file."""
    lat, lon, geonames_id, matched_name = hit
    location = data['location']
    location['latitude'] = lat
    location['longitude'] = lon
    location['geonames_id'] = geonames_id
    location['geocoding_timestamp'] = datetime.now(timezone.utc).isoformat()
    location['geocoding_method'] = 'CITY_NAME_LOOKUP'
    # Record the database name only when it differs from what the file had.
    if matched_name != city:
        location['geonames_matched_name'] = matched_name

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f)


def process_file(filepath: Path, conn: sqlite3.Connection) -> bool:
    """Process a single custodian file. Returns True if updated.

    The file is rewritten only when it has a city and a resolvable country,
    does not already carry both coordinates, and one of the candidate city
    names is found in the database. In every other case it is left untouched
    and False is returned.
    """
    data = _load_custodian(filepath)
    lookup = _lookup(data, conn)
    if lookup.status is not _Status.MATCHED:
        return False
    _write_match(filepath, data, lookup.city, lookup.hit)
    return True


def _process_and_report(filepath: Path, conn: sqlite3.Connection, *, dry_run: bool) -> _Status:
    """Geocode one file for the CLI, printing the per-file message.

    With ``dry_run`` the file is never written; the would-be result is
    printed instead. Returns the status so the caller can tally it.
    """
    data = _load_custodian(filepath)
    lookup = _lookup(data, conn)

    if lookup.status is _Status.MATCHED:
        if dry_run:
            lat, lon, _, matched_name = lookup.hit
            print(f"Would update: {filepath.name} ({lookup.city}) -> {matched_name} ({lat:.4f}, {lon:.4f})")
        else:
            _write_match(filepath, data, lookup.city, lookup.hit)
            print(f"Updated: {filepath.name}")
    elif lookup.status is _Status.NO_MATCH and dry_run:
        print(f" No match: {filepath.name} ({lookup.city}, {lookup.country}) tried: {lookup.candidates[:3]}")

    return lookup.status


def main() -> None:
    """Command-line entry point: geocode every matching custodian file."""
    parser = argparse.ArgumentParser(description='Geocode by city name lookup')
    parser.add_argument('--country', type=str, help='Country code to process')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would make every lookup fail with "no such table".
    if not GEONAMES_DB.is_file():
        raise SystemExit(f"GeoNames database not found: {GEONAMES_DB}")

    # Find files to process
    pattern = f"{args.country}-*.yaml" if args.country else "*.yaml"
    files = sorted(CUSTODIAN_DIR.glob(pattern))  # sorted for reproducible output
    print(f"Found {len(files)} files matching {pattern}")

    tally: Counter = Counter()
    errors = 0
    with closing(sqlite3.connect(GEONAMES_DB)) as conn:
        for filepath in files:
            if not filepath.is_file():
                continue
            try:
                status = _process_and_report(filepath, conn, dry_run=args.dry_run)
            except Exception as e:
                # Batch boundary: one bad file must not abort the whole run.
                print(f"Error: {filepath.name}: {e}")
                errors += 1
                continue
            tally[status] += 1

    print("\nSummary:")
    print(f" Updated: {tally[_Status.MATCHED]}")
    print(f" Skipped (already has coords): {tally[_Status.HAS_COORDS]}")
    print(f" No match found: {tally[_Status.INCOMPLETE] + tally[_Status.NO_MATCH]}")
    print(f" Errors: {errors}")


if __name__ == "__main__":
    main()