#!/usr/bin/env python3
"""
Resolve XX region codes using city names already in the file.

This script handles files that have city data but unknown region codes.
It looks up the city in GeoNames to get the admin1 (region) code.

Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
- GHCID settlement standardization: GeoNames is authoritative
"""

import argparse
import os
import re
import sqlite3
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple

try:
    import yaml
except ImportError:  # PyYAML is only needed for file I/O; main() reports its absence.
    yaml = None

# GeoNames database
GEONAMES_DB = Path(__file__).parent.parent / "data/reference/geonames.db"
CUSTODIAN_DIR = Path(__file__).parent.parent / "data/custodian"

# Feature codes for proper settlements (EXCLUDE PPLX neighborhoods)
SETTLEMENT_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')

# Country-specific region code mappings (GeoNames admin1 → ISO 3166-2)
COUNTRY_ADMIN_MAPS = {
    'NL': {
        '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI', '06': 'NB',
        '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH', '15': 'OV', '16': 'FL'
    },
    # Belgium is keyed by GeoNames admin2 (province) codes; see get_region_code().
    'BE': {
        'VAN': 'VAN', 'VBR': 'VBR', 'VLI': 'VLI', 'VOV': 'VOV', 'VWV': 'VWV',
        'WBR': 'WBR', 'WHT': 'WHT', 'WLG': 'WLG', 'WLX': 'WLX', 'WNA': 'WNA',
        'BRU': 'BRU'
    },
    # Georgia: GeoNames admin1 → ISO 3166-2:GE
    'GE': {
        '51': 'TB',  # Tbilisi
        '04': 'AJ',  # Adjara
        '67': 'KA',  # Kakheti
        '66': 'IM',  # Imereti
        '68': 'KK',  # Kvemo Kartli
        '69': 'MM',  # Mtskheta-Mtianeti
        '70': 'RL',  # Racha-Lechkhumi and Kvemo Svaneti
        '71': 'SZ',  # Samegrelo and Zemo Svaneti
        '72': 'SJ',  # Samtskhe-Javakheti
        '73': 'SK',  # Shida Kartli
        '65': 'GU',  # Guria
    },
    # Czech Republic: GeoNames admin1 → ISO 3166-2:CZ (2-digit NUTS codes)
    # Source: https://en.wikipedia.org/wiki/ISO_3166-2:CZ
    'CZ': {
        '52': '10',  # Prague (Praha)
        '88': '20',  # Central Bohemian (Středočeský kraj)
        '79': '31',  # South Bohemian (Jihočeský kraj)
        '87': '32',  # Plzeň Region (Plzeňský kraj)
        '81': '41',  # Karlovy Vary Region (Karlovarský kraj)
        '89': '42',  # Ústí nad Labem Region (Ústecký kraj)
        '83': '51',  # Liberec Region (Liberecký kraj)
        '82': '52',  # Hradec Králové Region (Královéhradecký kraj)
        '86': '53',  # Pardubice Region (Pardubický kraj)
        '80': '63',  # Vysočina Region
        '78': '64',  # South Moravian (Jihomoravský kraj)
        '84': '71',  # Olomouc Region (Olomoucký kraj)
        '90': '72',  # Zlín Region (Zlínský kraj)
        '85': '80',  # Moravian-Silesian (Moravskoslezský kraj)
    },
    # Austria: GeoNames admin1 → ISO 3166-2:AT
    'AT': {
        '01': '1',  # Burgenland
        '02': '2',  # Kärnten (Carinthia)
        '03': '3',  # Niederösterreich (Lower Austria)
        '04': '4',  # Oberösterreich (Upper Austria)
        '05': '5',  # Salzburg
        '06': '6',  # Steiermark (Styria)
        '07': '7',  # Tirol (Tyrol)
        '08': '8',  # Vorarlberg
        '09': '9',  # Wien (Vienna)
    },
    # Bulgaria: GeoNames admin1 → ISO 3166-2:BG (2-digit province codes)
    'BG': {
        '38': '01',  # Blagoevgrad
        '39': '02',  # Burgas
        '40': '08',  # Dobrich
        '41': '07',  # Gabrovo
        '42': '26',  # Haskovo
        '43': '09',  # Kardzhali (Kurdzhali)
        '44': '10',  # Kyustendil
        '45': '11',  # Lovech
        '46': '12',  # Montana
        '47': '13',  # Pazardzhik
        '48': '14',  # Pernik
        '49': '15',  # Pleven
        '50': '16',  # Plovdiv
        '51': '17',  # Razgrad
        '52': '18',  # Ruse
        '53': '27',  # Shumen
        '54': '19',  # Silistra
        '55': '20',  # Sliven
        '56': '21',  # Smolyan
        '57': '23',  # Sofia (Sofiya-Grad)
        '58': '22',  # Sofia Province (Sofiya)
        '59': '24',  # Stara Zagora
        '60': '25',  # Targovishte
        '61': '03',  # Varna
        '62': '04',  # Veliko Tarnovo
        '63': '05',  # Vidin
        '64': '06',  # Vratsa
        '65': '28',  # Yambol
    },
    # Switzerland: GeoNames already uses ISO 3166-2:CH canton codes
    'CH': {
        'AG': 'AG', 'AI': 'AI', 'AR': 'AR', 'BE': 'BE', 'BL': 'BL', 'BS': 'BS',
        'FR': 'FR', 'GE': 'GE', 'GL': 'GL', 'GR': 'GR', 'JU': 'JU', 'LU': 'LU',
        'NE': 'NE', 'NW': 'NW', 'OW': 'OW', 'SG': 'SG', 'SH': 'SH', 'SO': 'SO',
        'SZ': 'SZ', 'TG': 'TG', 'TI': 'TI', 'UR': 'UR', 'VD': 'VD', 'VS': 'VS',
        'ZG': 'ZG', 'ZH': 'ZH',
    },
    # Vietnam: GeoNames 2-digit admin1 → letter abbreviation used as region code.
    # NOTE(review): an earlier comment said the GeoNames codes are used
    # "directly", but this map translates them to letter codes - confirm the
    # abbreviations against the GHCID region standard.
    'VN': {
        '01': 'HN',   # Hanoi (Ha Noi)
        '31': 'HP',   # Hai Phong
        '48': 'DN',   # Da Nang (Đà Nẵng)
        '79': 'SG',   # Ho Chi Minh City (Saigon)
        '92': 'CT',   # Can Tho
        '75': 'DNa',  # Dong Nai
        '24': 'BN',   # Bac Ninh
        '22': 'QN',   # Quang Ninh (Quảng Ninh)
        '38': 'TH',   # Thanh Hoa (Thanh Hóa)
        '46': 'TTH',  # Thua Thien-Hue (Thừa Thiên Huế)
        '40': 'NA',   # Nghe An (Nghệ An)
        '04': 'CB',   # Cao Bang
        '37': 'NB',   # Ninh Binh
        '56': 'KH',   # Khanh Hoa
        '66': 'DLK',  # Dak Lak
        '68': 'LDG',  # Lam Dong
        '91': 'AG',   # An Giang
        '86': 'VL',   # Vinh Long
        '82': 'DTP',  # Dong Thap
        '80': 'TNi',  # Tay Ninh
        '96': 'CMa',  # Ca Mau
        '51': 'QNg',  # Quang Ngai
        '52': 'GL',   # Gia Lai
        '19': 'TN',   # Thai Nguyen
        '25': 'PT',   # Phu Tho
    },
    # Japan: GeoNames admin1 → ISO 3166-2:JP (2-digit prefecture codes)
    # See: https://en.wikipedia.org/wiki/ISO_3166-2:JP
    'JP': {
        '01': '23',  # Aichi
        '02': '05',  # Akita
        '03': '02',  # Aomori
        '04': '12',  # Chiba
        '05': '38',  # Ehime
        '06': '18',  # Fukui
        '07': '40',  # Fukuoka
        '08': '07',  # Fukushima
        '09': '21',  # Gifu
        '10': '10',  # Gunma
        '11': '34',  # Hiroshima
        '12': '01',  # Hokkaido
        '13': '28',  # Hyogo
        '14': '08',  # Ibaraki
        '15': '17',  # Ishikawa
        '16': '03',  # Iwate
        '17': '37',  # Kagawa
        '18': '46',  # Kagoshima
        '19': '14',  # Kanagawa
        '20': '39',  # Kochi
        '21': '43',  # Kumamoto
        '22': '26',  # Kyoto
        '23': '24',  # Mie
        '24': '04',  # Miyagi
        '25': '45',  # Miyazaki
        '26': '20',  # Nagano
        '27': '42',  # Nagasaki
        '28': '29',  # Nara
        '29': '15',  # Niigata
        '30': '44',  # Oita
        '31': '33',  # Okayama
        '32': '27',  # Osaka
        '33': '41',  # Saga
        '34': '11',  # Saitama
        '35': '25',  # Shiga
        '36': '32',  # Shimane
        '37': '22',  # Shizuoka
        '38': '09',  # Tochigi
        '39': '36',  # Tokushima
        '40': '13',  # Tokyo
        '41': '31',  # Tottori
        '42': '16',  # Toyama
        '43': '30',  # Wakayama
        '44': '06',  # Yamagata
        '45': '35',  # Yamaguchi
        '46': '19',  # Yamanashi
        '47': '47',  # Okinawa
    },
    # Egypt: GeoNames admin1 → ISO 3166-2:EG
    # See: https://en.wikipedia.org/wiki/ISO_3166-2:EG
    'EG': {
        '01': 'DK',   # Dakahlia
        '02': 'BA',   # Red Sea (Al Bahr al Ahmar)
        '03': 'BH',   # Beheira
        '04': 'FYM',  # Faiyum
        '05': 'GH',   # Gharbia
        '06': 'ALX',  # Alexandria
        '07': 'IS',   # Ismailia
        '08': 'GZ',   # Giza
        '09': 'MNF',  # Monufia
        '10': 'MN',   # Minya
        '11': 'C',    # Cairo
        '12': 'KB',   # Qalyubia
        '13': 'WAD',  # New Valley (Al Wadi al Jadid)
        '14': 'SHR',  # Sharqia
        '15': 'SUZ',  # Suez
        '16': 'ASN',  # Aswan
        '17': 'AST',  # Asyut
        '18': 'BNS',  # Beni Suweif
        '19': 'PTS',  # Port Said
        '20': 'DT',   # Damietta
        '21': 'KFS',  # Kafr el-Sheikh
        '22': 'MT',   # Matruh
        '23': 'KN',   # Qena
        '24': 'SHG',  # Sohag
        '26': 'JS',   # South Sinai
        '27': 'SIN',  # North Sinai
        '28': 'LX',   # Luxor
    },
}

# City name translations (native → GeoNames ASCII name)
# Many cities in GeoNames use English/anglicized names.
# Keys must be in normalize_city_name() form (lowercase, diacritics removed).
CITY_NAME_TRANSLATIONS = {
    # German → English
    'wien': 'vienna',
    'munchen': 'munich',
    'koln': 'cologne',
    'nurnberg': 'nuremberg',
    'braunschweig': 'brunswick',
    # Czech → GeoNames (use normalized/ASCII keys)
    'praha': 'prague',
    'plzen': 'pilsen',  # Plzeň → plzen after normalization
    'brno': 'brno',
    'ostrava': 'ostrava',
    # Swiss cities
    'geneve': 'geneva',
    'zurich': 'zurich',
    'bern': 'berne',
    'basel': 'basle',
    # Italian cities
    'roma': 'rome',
    'milano': 'milan',
    'napoli': 'naples',
    'firenze': 'florence',
    'venezia': 'venice',
    'torino': 'turin',
    # Austrian special cases (use normalized keys after diacritics removal)
    # GeoNames uses 'oe' for ö, so 'Sankt Poelten'
    'st. polten': 'sankt poelten',
    'st polten': 'sankt poelten',
    'sankt polten': 'sankt poelten',
    # Japanese cities - complex administrative format to GeoNames
    # Format: "District Gun City Machi/Cho" → just the city name
    'haga gun motegi machi': 'motegi',
    'motegi machi': 'motegi',
    # Egyptian landmarks → Cairo
    'nile corniche': 'cairo',
}

# Columns read from the GeoNames `cities` table, in result-dict key order.
_CITY_COLUMNS = (
    'geonames_id', 'name', 'ascii_name', 'admin1_code', 'admin2_code',
    'latitude', 'longitude', 'feature_code', 'population',
)


def normalize_city_name(name: str) -> str:
    """Return *name* lowercased, stripped, and with combining diacritics removed."""
    # NFD splits accented characters into base character + combining mark,
    # so dropping category 'Mn' leaves the plain base letters.
    decomposed = unicodedata.normalize('NFD', name)
    without_marks = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    return without_marks.lower().strip()


def clean_city_name(city: str) -> str:
    """Extract the base city name from strings like 'Praha 1' or 'Zlín - Louky'.

    Everything from the first whitespace-separated number onwards is dropped
    (district numbers such as "Praha 9 - Běchovice", and trailing postal codes
    such as "Brno 602 00"). A dash-separated suffix is dropped only when the
    dash has whitespace on at least one side, so genuine hyphenated names such
    as "Frýdek-Místek" are kept intact.
    """
    city = re.sub(r'\s+\d+.*$', '', city)
    city = re.sub(r'(?:\s+-\s*|\s*-\s+).*$', '', city)
    return city.strip()


def _query_city(cursor: sqlite3.Cursor, country: str, name_condition: str,
                value: str) -> Optional[tuple]:
    """Return the most populous settlement row matching *name_condition*, or None."""
    # Placeholders instead of interpolating the Python tuple repr into the SQL.
    code_slots = ', '.join('?' * len(SETTLEMENT_FEATURE_CODES))
    cursor.execute(f'''
        SELECT {', '.join(_CITY_COLUMNS)}
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ({code_slots})
          AND {name_condition}
        ORDER BY population DESC
        LIMIT 1
    ''', (country, *SETTLEMENT_FEATURE_CODES, value))
    return cursor.fetchone()


def lookup_city_region(city_name: str, country: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """Look up a city in GeoNames and return its row as a dict.

    The name is cleaned and normalized, then matched against
    ``LOWER(ascii_name)`` in the ``cities`` table, restricted to *country* and
    to SETTLEMENT_FEATURE_CODES. Exact matches are tried first (translated
    name, then normalized name), followed by a prefix (LIKE) match. When
    several rows match, the most populous one wins.

    Returns:
        A dict keyed by the column names in _CITY_COLUMNS, or None when
        nothing matches or the cleaned name is empty.
    """
    normalized = normalize_city_name(clean_city_name(city_name))
    if not normalized:
        # An empty prefix would become LIKE '%' and match the largest city
        # in the country, yielding a confidently wrong region.
        return None

    exact_names = [CITY_NAME_TRANSLATIONS.get(normalized, normalized), normalized]
    prefixes = [normalized]

    # For unspaced hyphenated names, try the full compound name first and
    # fall back to the part before the hyphen (e.g. "Praha-Běchovice").
    if '-' in normalized:
        head = normalized.split('-', 1)[0].strip()
        if head:
            exact_names += [CITY_NAME_TRANSLATIONS.get(head, head), head]
            prefixes.append(head)

    cursor = conn.cursor()
    row = None

    # dict.fromkeys de-duplicates while keeping order, so an untranslated
    # name is not queried twice.
    for candidate in dict.fromkeys(exact_names):
        row = _query_city(cursor, country, 'LOWER(ascii_name) = ?', candidate)
        if row:
            break

    if not row:
        for prefix in prefixes:
            # Escape LIKE wildcards so they are matched literally.
            escaped = prefix.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
            row = _query_city(cursor, country,
                              "LOWER(ascii_name) LIKE ? ESCAPE '\\'", escaped + '%')
            if row:
                break

    if not row:
        return None
    return dict(zip(_CITY_COLUMNS, row))


def get_region_code(country: str, admin1_code: Optional[str], admin2_code: Optional[str] = None) -> str:
    """Convert GeoNames admin codes to a region code.

    Countries listed in COUNTRY_ADMIN_MAPS are translated through their map;
    a code missing from the map is returned unchanged. Belgium is looked up
    by admin2 code when one is given. Countries without a map use the admin1
    code as-is. 'XX' is returned when no admin1 code is available.
    """
    country_map = COUNTRY_ADMIN_MAPS.get(country)
    if country_map is None:
        return admin1_code or 'XX'
    if country == 'BE' and admin2_code:
        return country_map.get(admin2_code, admin1_code or 'XX')
    if admin1_code:
        return country_map.get(admin1_code, admin1_code)
    return 'XX'


def _as_dict(value: Any) -> Dict:
    """Return *value* if it is a dict, otherwise an empty dict (tolerates YAML nulls)."""
    return value if isinstance(value, dict) else {}


def find_city_in_file(data: Dict) -> Optional[Tuple[str, str]]:
    """Find the city name and country code in a loaded custodian file.

    The country is taken from ``ghcid.location_resolution.country_code`` when
    present. The city is the first non-empty ``city`` found in
    ``original_entry.locations``, then in top-level ``locations``; that
    location's ``country`` is used only when the GHCID gave none.

    Returns:
        ``(city, country)`` when both are found, otherwise None.
    """
    loc_res = _as_dict(_as_dict(data.get('ghcid')).get('location_resolution'))
    country = loc_res.get('country_code')
    city = None

    location_lists = (
        _as_dict(data.get('original_entry')).get('locations'),
        data.get('locations'),
    )
    for locations in location_lists:
        # `or []` tolerates a YAML null where a list is expected.
        for loc in locations or []:
            if not isinstance(loc, dict) or not loc.get('city'):
                continue
            city = loc['city']
            if not country and 'country' in loc:
                country = loc['country']
            break
        if city:
            break

    if city and country:
        # YAML may parse purely numeric values as ints; callers expect strings.
        return (str(city), str(country))
    return None


def process_file(filepath: Path, conn: sqlite3.Connection, apply: bool) -> bool:
    """Resolve the XX region code of a single custodian file.

    Args:
        filepath: YAML file to process.
        conn: Open connection to the GeoNames database.
        apply: When False (dry run) the file is only inspected; when True the
            GHCID, location resolution and history are updated, and the file
            is rewritten and renamed.

    Returns:
        True when a region was resolved (and, with *apply*, written);
        False when the file was skipped or could not be resolved.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Per-file boundary: report and move on to the next file.
        print(f" Error reading {filepath}: {e}")
        return False

    if not data or not isinstance(data, dict):
        return False

    # Check if region is already resolved
    ghcid = _as_dict(data.get('ghcid'))
    loc_res = _as_dict(ghcid.get('location_resolution'))
    if loc_res.get('region_code', 'XX') != 'XX':
        return False

    # Find city name
    city_info = find_city_in_file(data)
    if not city_info:
        return False
    city_name, country = city_info
    print(f" City: {city_name} ({country})")

    # Look up in GeoNames
    city_data = lookup_city_region(city_name, country, conn)
    if not city_data:
        print(f" No GeoNames match for '{city_name}'")
        return False

    region_code = get_region_code(country, city_data['admin1_code'], city_data.get('admin2_code'))
    if region_code == 'XX':
        print(f" Could not determine region for admin1={city_data['admin1_code']}")
        return False

    print(f" Found: {city_data['name']} -> Region {region_code}")

    if not apply:
        return True

    # Update GHCID: the region is the second dash-separated component.
    current = str(ghcid.get('ghcid_current') or '')
    parts = current.split('-')
    if len(parts) < 5:
        print(f" Invalid GHCID format: {current}")
        return False
    old_region = parts[1]
    if old_region != 'XX':
        print(f" Region already set: {old_region}")
        return False
    parts[1] = region_code
    new_ghcid = '-'.join(parts)

    # Calculate the new filename before touching anything on disk.
    old_name = filepath.name
    new_name = old_name.replace(f'{country}-XX-', f'{country}-{region_code}-', 1)
    new_path = filepath.parent / new_name
    if new_path != filepath and new_path.exists():
        # Renaming onto an existing file would overwrite it (additive-only rule).
        print(f" Target file already exists, skipping: {new_name}")
        return False

    # One timestamp shared by the resolution record and the history entry.
    timestamp = datetime.now(timezone.utc).isoformat()

    # Update data
    ghcid['ghcid_current'] = new_ghcid
    loc_res['region_code'] = region_code
    loc_res['region_name'] = f"{country}-{region_code}"
    loc_res['geonames_id'] = city_data['geonames_id']
    loc_res['method'] = 'GEONAMES_CITY_LOOKUP'
    loc_res['resolution_timestamp'] = timestamp
    ghcid['location_resolution'] = loc_res

    # Add to history
    history = ghcid.get('ghcid_history')
    if history is None:
        history = []
    history.append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'Region resolved via GeoNames city lookup: XX->{region_code} ({city_data["name"]})'
    })
    ghcid['ghcid_history'] = history
    data['ghcid'] = ghcid

    # Write and rename
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if new_path != filepath:
        filepath.rename(new_path)
        print(f" Renamed: {old_name} -> {new_name}")

    return True


def main():
    """Command-line entry point: resolve XX region codes for custodian files."""
    parser = argparse.ArgumentParser(description='Resolve XX region codes using city names in files')
    parser.add_argument('--limit', type=int, default=100, help='Max files to process')
    parser.add_argument('--apply', action='store_true', help='Apply changes (default: dry run)')
    parser.add_argument('--country', help='Filter by country code')
    args = parser.parse_args()

    print("=" * 70)
    print("REGION RESOLUTION FROM FILE CITY NAMES")
    print("=" * 70)
    print(f"Mode: {'APPLYING CHANGES' if args.apply else 'DRY RUN'}")
    print()

    if yaml is None:
        print("ERROR: PyYAML is required (pip install pyyaml)")
        sys.exit(1)

    # Connect to GeoNames
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Find XX files; sorted so that --limit selects the same files on every run.
    xx_files = sorted(
        f for f in CUSTODIAN_DIR.glob('*.yaml')
        if '-XX-' in f.name
        and not (args.country and not f.name.startswith(f'{args.country}-'))
    )
    print(f"Found {len(xx_files)} files with XX region codes")

    # Filter to files with city names (cheap text check before YAML parsing)
    files_with_cities = []
    for f in xx_files:
        try:
            with open(f, 'r', encoding='utf-8') as fp:
                content = fp.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f" Skipping unreadable file {f.name}: {e}")
            continue
        if 'city:' in content:
            files_with_cities.append(f)

    print(f"Processing {min(len(files_with_cities), args.limit)} files with city names")
    print()

    resolved = 0
    renamed = 0

    conn = sqlite3.connect(str(GEONAMES_DB))
    try:
        for f in files_with_cities[:args.limit]:
            print(f"Processing {f.name}...")
            if process_file(f, conn, args.apply):
                resolved += 1
                if args.apply:
                    renamed += 1
    finally:
        conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {min(len(files_with_cities), args.limit)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")


if __name__ == '__main__':
    main()