glam/scripts/resolve_regions_from_city.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

568 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Resolve XX region codes using city names already in the file.
This script handles files that have city data but unknown region codes.
It looks up the city in GeoNames to get the admin1 (region) code.
Following AGENTS.md Rules:
- Rule 5: Additive only - never delete existing data
- GHCID settlement standardization: GeoNames is authoritative
"""
import os
import sys
import yaml
import sqlite3
import re
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
# Data locations, resolved relative to the directory two levels above this
# script file.
_REPO_ROOT = Path(__file__).parent.parent
GEONAMES_DB = _REPO_ROOT / "data/reference/geonames.db"
CUSTODIAN_DIR = _REPO_ROOT / "data/custodian"

# GeoNames feature codes accepted as proper settlements. PPLX (section of a
# populated place, i.e. a neighbourhood) is deliberately left out.
SETTLEMENT_FEATURE_CODES = (
    'PPL',
    'PPLA',
    'PPLA2',
    'PPLA3',
    'PPLA4',
    'PPLC',
    'PPLS',
    'PPLG',
)
# Country-specific region code mappings (GeoNames admin code → region code used
# in GHCIDs, intended to be ISO 3166-2 subdivision codes).
#
# Outer key: ISO 3166-1 alpha-2 country code. Inner key: the GeoNames admin1
# code as a string (for 'BE' the lookup in get_region_code uses the admin2 code
# instead). Inner value: the region code written into the GHCID.
#
# Codes missing from a country's table are NOT rejected: get_region_code falls
# back to returning the raw GeoNames code, so gaps here leak GeoNames codes
# into GHCIDs.
COUNTRY_ADMIN_MAPS = {
    # Netherlands: GeoNames admin1 → province abbreviation
    'NL': {
        '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR', '05': 'LI',
        '06': 'NB', '07': 'NH', '09': 'UT', '10': 'ZE', '11': 'ZH',
        '15': 'OV', '16': 'FL'
    },
    # Belgium: identity map keyed by the GeoNames admin2 (province) code;
    # get_region_code looks up admin2_code for 'BE' when one is available.
    'BE': {
        'VAN': 'VAN', 'VBR': 'VBR', 'VLI': 'VLI', 'VOV': 'VOV', 'VWV': 'VWV',
        'WBR': 'WBR', 'WHT': 'WHT', 'WLG': 'WLG', 'WLX': 'WLX', 'WNA': 'WNA',
        'BRU': 'BRU'
    },
    # Georgia: GeoNames admin1 → ISO 3166-2:GE
    'GE': {
        '51': 'TB',  # Tbilisi
        '04': 'AJ',  # Adjara
        '67': 'KA',  # Kakheti
        '66': 'IM',  # Imereti
        '68': 'KK',  # Kvemo Kartli
        '69': 'MM',  # Mtskheta-Mtianeti
        '70': 'RL',  # Racha-Lechkhumi and Kvemo Svaneti
        '71': 'SZ',  # Samegrelo and Zemo Svaneti
        '72': 'SJ',  # Samtskhe-Javakheti
        '73': 'SK',  # Shida Kartli
        '65': 'GU',  # Guria
    },
    # Czech Republic: GeoNames admin1 → ISO 3166-2:CZ (2-digit region codes)
    # Source: https://en.wikipedia.org/wiki/ISO_3166-2:CZ
    'CZ': {
        '52': '10',  # Prague (Praha)
        '88': '20',  # Central Bohemian (Středočeský kraj)
        '79': '31',  # South Bohemian (Jihočeský kraj)
        '87': '32',  # Plzeň Region (Plzeňský kraj)
        '81': '41',  # Karlovy Vary Region (Karlovarský kraj)
        '89': '42',  # Ústí nad Labem Region (Ústecký kraj)
        '83': '51',  # Liberec Region (Liberecký kraj)
        '82': '52',  # Hradec Králové Region (Královéhradecký kraj)
        '86': '53',  # Pardubice Region (Pardubický kraj)
        '80': '63',  # Vysočina Region
        '78': '64',  # South Moravian (Jihomoravský kraj)
        '84': '71',  # Olomouc Region (Olomoucký kraj)
        '90': '72',  # Zlín Region (Zlínský kraj)
        '85': '80',  # Moravian-Silesian (Moravskoslezský kraj)
    },
    # Austria: GeoNames admin1 → ISO 3166-2:AT (single-digit state codes)
    'AT': {
        '01': '1',  # Burgenland
        '02': '2',  # Kärnten (Carinthia)
        '03': '3',  # Niederösterreich (Lower Austria)
        '04': '4',  # Oberösterreich (Upper Austria)
        '05': '5',  # Salzburg
        '06': '6',  # Steiermark (Styria)
        '07': '7',  # Tirol (Tyrol)
        '08': '8',  # Vorarlberg
        '09': '9',  # Wien (Vienna)
    },
    # Bulgaria: GeoNames admin1 → ISO 3166-2:BG (2-digit province codes)
    # NOTE(review): verify these keys against GeoNames admin1CodesASCII.txt and
    # the Sofia city vs. Sofia province assignment ('57'/'58' → '23'/'22')
    # against ISO 3166-2:BG — TODO confirm before relying on this table.
    'BG': {
        '38': '01',  # Blagoevgrad
        '39': '02',  # Burgas
        '40': '08',  # Dobrich
        '41': '07',  # Gabrovo
        '42': '26',  # Haskovo
        '43': '09',  # Kardzhali (Kurdzhali)
        '44': '10',  # Kyustendil
        '45': '11',  # Lovech
        '46': '12',  # Montana
        '47': '13',  # Pazardzhik
        '48': '14',  # Pernik
        '49': '15',  # Pleven
        '50': '16',  # Plovdiv
        '51': '17',  # Razgrad
        '52': '18',  # Ruse
        '53': '27',  # Shumen
        '54': '19',  # Silistra
        '55': '20',  # Sliven
        '56': '21',  # Smolyan
        '57': '23',  # Sofia (Sofiya-Grad)
        '58': '22',  # Sofia Province (Sofiya)
        '59': '24',  # Stara Zagora
        '60': '25',  # Targovishte
        '61': '03',  # Varna
        '62': '04',  # Veliko Tarnovo
        '63': '05',  # Vidin
        '64': '06',  # Vratsa
        '65': '28',  # Yambol
    },
    # Switzerland: identity map — GeoNames already uses the ISO 3166-2:CH
    # canton codes as admin1 codes.
    'CH': {
        'AG': 'AG', 'AI': 'AI', 'AR': 'AR', 'BE': 'BE', 'BL': 'BL',
        'BS': 'BS', 'FR': 'FR', 'GE': 'GE', 'GL': 'GL', 'GR': 'GR',
        'JU': 'JU', 'LU': 'LU', 'NE': 'NE', 'NW': 'NW', 'OW': 'OW',
        'SG': 'SG', 'SH': 'SH', 'SO': 'SO', 'SZ': 'SZ', 'TG': 'TG',
        'TI': 'TI', 'UR': 'UR', 'VD': 'VD', 'VS': 'VS', 'ZG': 'ZG',
        'ZH': 'ZH',
    },
    # Vietnam: 2-digit GeoNames admin1 code → letter abbreviation.
    # NOTE(review): an earlier comment here stated that the GeoNames codes
    # match ISO 3166-2:VN and can be used directly, yet this table remaps them
    # to letter codes — confirm which scheme GHCIDs are supposed to carry.
    # Only the provinces listed below are mapped.
    'VN': {
        '01': 'HN',  # Hanoi (Ha Noi)
        '31': 'HP',  # Hai Phong
        '48': 'DN',  # Da Nang (Đà Nẵng)
        '79': 'SG',  # Ho Chi Minh City (Saigon)
        '92': 'CT',  # Can Tho
        '75': 'DNa',  # Dong Nai
        '24': 'BN',  # Bac Ninh
        '22': 'QN',  # Quang Ninh (Quảng Ninh)
        '38': 'TH',  # Thanh Hoa (Thanh Hóa)
        '46': 'TTH',  # Thua Thien-Hue (Thừa Thiên Huế)
        '40': 'NA',  # Nghe An (Nghệ An)
        '04': 'CB',  # Cao Bang
        '37': 'NB',  # Ninh Binh
        '56': 'KH',  # Khanh Hoa
        '66': 'DLK',  # Dak Lak
        '68': 'LDG',  # Lam Dong
        '91': 'AG',  # An Giang
        '86': 'VL',  # Vinh Long
        '82': 'DTP',  # Dong Thap
        '80': 'TNi',  # Tay Ninh
        '96': 'CMa',  # Ca Mau
        '51': 'QNg',  # Quang Ngai
        '52': 'GL',  # Gia Lai
        '19': 'TN',  # Thai Nguyen
        '25': 'PT',  # Phu Tho
    },
    # Japan: GeoNames admin1 → ISO 3166-2:JP (2-digit prefecture codes)
    # See: https://en.wikipedia.org/wiki/ISO_3166-2:JP
    'JP': {
        '01': '23',  # Aichi
        '02': '05',  # Akita
        '03': '02',  # Aomori
        '04': '12',  # Chiba
        '05': '38',  # Ehime
        '06': '18',  # Fukui
        '07': '40',  # Fukuoka
        '08': '07',  # Fukushima
        '09': '21',  # Gifu
        '10': '10',  # Gunma
        '11': '34',  # Hiroshima
        '12': '01',  # Hokkaido
        '13': '28',  # Hyogo
        '14': '08',  # Ibaraki
        '15': '17',  # Ishikawa
        '16': '03',  # Iwate
        '17': '37',  # Kagawa
        '18': '46',  # Kagoshima
        '19': '14',  # Kanagawa
        '20': '39',  # Kochi
        '21': '43',  # Kumamoto
        '22': '26',  # Kyoto
        '23': '24',  # Mie
        '24': '04',  # Miyagi
        '25': '45',  # Miyazaki
        '26': '20',  # Nagano
        '27': '42',  # Nagasaki
        '28': '29',  # Nara
        '29': '15',  # Niigata
        '30': '44',  # Oita
        '31': '33',  # Okayama
        '32': '27',  # Osaka
        '33': '41',  # Saga
        '34': '11',  # Saitama
        '35': '25',  # Shiga
        '36': '32',  # Shimane
        '37': '22',  # Shizuoka
        '38': '09',  # Tochigi
        '39': '36',  # Tokushima
        '40': '13',  # Tokyo
        '41': '31',  # Tottori
        '42': '16',  # Toyama
        '43': '30',  # Wakayama
        '44': '06',  # Yamagata
        '45': '35',  # Yamaguchi
        '46': '19',  # Yamanashi
        '47': '47',  # Okinawa
    },
    # Egypt: GeoNames admin1 → ISO 3166-2:EG
    # See: https://en.wikipedia.org/wiki/ISO_3166-2:EG
    # Note: there is no entry for admin1 code '25' in this table.
    'EG': {
        '01': 'DK',  # Dakahlia
        '02': 'BA',  # Red Sea (Al Bahr al Ahmar)
        '03': 'BH',  # Beheira
        '04': 'FYM',  # Faiyum
        '05': 'GH',  # Gharbia
        '06': 'ALX',  # Alexandria
        '07': 'IS',  # Ismailia
        '08': 'GZ',  # Giza
        '09': 'MNF',  # Monufia
        '10': 'MN',  # Minya
        '11': 'C',  # Cairo
        '12': 'KB',  # Qalyubia
        '13': 'WAD',  # New Valley (Al Wadi al Jadid)
        '14': 'SHR',  # Sharqia
        '15': 'SUZ',  # Suez
        '16': 'ASN',  # Aswan
        '17': 'AST',  # Asyut
        '18': 'BNS',  # Beni Suweif
        '19': 'PTS',  # Port Said
        '20': 'DT',  # Damietta
        '21': 'KFS',  # Kafr el-Sheikh
        '22': 'MT',  # Matruh
        '23': 'KN',  # Qena
        '24': 'SHG',  # Sohag
        '26': 'JS',  # South Sinai
        '27': 'SIN',  # North Sinai
        '28': 'LX',  # Luxor
    },
}
# City name translations (native name → name to search for in GeoNames).
# Many cities in GeoNames use English/anglicized names.
#
# Keys are matched against the output of normalize_city_name(), so they must
# be lowercase with diacritics already stripped (e.g. 'plzen', not 'Plzeň').
# Values are compared with LOWER(ascii_name) in the GeoNames table, so they
# must be lowercase ASCII as well. When the translated name finds nothing, the
# lookup retries with the untranslated normalized name, so a wrong value here
# costs one query rather than a missed match.
CITY_NAME_TRANSLATIONS = {
    # German → English
    'wien': 'vienna',
    'munchen': 'munich',
    'koln': 'cologne',
    'nurnberg': 'nuremberg',
    'braunschweig': 'brunswick',
    # Czech → GeoNames (use normalized/ASCII keys)
    'praha': 'prague',
    'plzen': 'pilsen',  # Plzeň → plzen after normalization
    # Identity entries: no translation happens, listed for completeness
    'brno': 'brno',
    'ostrava': 'ostrava',
    # Swiss cities ('zurich' is an identity entry)
    'geneve': 'geneva',
    'zurich': 'zurich',
    'bern': 'berne',
    'basel': 'basle',
    # Italian cities
    'roma': 'rome',
    'milano': 'milan',
    'napoli': 'naples',
    'firenze': 'florence',
    'venezia': 'venice',
    'torino': 'turin',
    # Austrian special cases (use normalized keys after diacritics removal)
    # GeoNames uses 'oe' for ö, so 'Sankt Poelten'
    'st. polten': 'sankt poelten',
    'st polten': 'sankt poelten',
    'sankt polten': 'sankt poelten',
    # Japanese cities - complex administrative format to GeoNames
    # Format: "District Gun City Machi/Cho" → just the city name
    'haga gun motegi machi': 'motegi',
    'motegi machi': 'motegi',
    # Egyptian landmarks → Cairo
    'nile corniche': 'cairo',
}
def normalize_city_name(name: str) -> str:
    """Return *name* in the canonical form used for city matching.

    The text is decomposed with Unicode NFD so accented letters split into a
    base letter followed by combining marks; every character whose Unicode
    category is ``Mn`` (non-spacing mark) is then dropped. The result is
    lowercased and stripped of surrounding whitespace.

    Args:
        name: City name as found in the source data.

    Returns:
        The accent-free, lowercase, trimmed name (e.g. ``'Plzeň'`` →
        ``'plzen'``).
    """
    decomposed = unicodedata.normalize('NFD', name)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).lower().strip()
def clean_city_name(city: str) -> str:
    """Reduce a decorated locality string to its base city name.

    Three suffix patterns are removed, in this order:

    1. whitespace followed by digits and everything after it
       (``'Praha 9 - Běchovice'`` → ``'Praha'``);
    2. a dash and everything after it, with or without surrounding spaces
       (``'Zlín - Louky'`` → ``'Zlín'``; note that hyphenated names such as
       ``'Frýdek-Místek'`` are cut to ``'Frýdek'`` as well);
    3. a trailing postal-code-like ``NNN NN`` group.

    Args:
        city: Raw city string from a custodian record.

    Returns:
        The remaining text with surrounding whitespace stripped.
    """
    suffix_patterns = (
        r'\s+\d+.*$',            # district numbers: "Praha 1"
        r'\s*-\s*.*$',           # anything after a dash
        # NOTE(review): appears redundant after the first pattern, which
        # already removes any whitespace+digits suffix — confirm.
        r'\s+\d{3}\s*\d{2}.*$',  # postal codes
    )
    for pattern in suffix_patterns:
        city = re.sub(pattern, '', city)
    return city.strip()
def lookup_city_region(city_name: str, country: str, conn: sqlite3.Connection) -> Optional[Dict]:
    """Look up a city in the GeoNames ``cities`` table and return its record.

    The name is first reduced with ``clean_city_name`` and
    ``normalize_city_name``. Matching then proceeds in three steps, each
    restricted to ``country`` and to ``SETTLEMENT_FEATURE_CODES`` and each
    preferring the most populous candidate:

    1. exact match on the translated name from ``CITY_NAME_TRANSLATIONS``;
    2. exact match on the normalized name (skipped when identical to step 1);
    3. prefix match (``LIKE 'name%'``) on the normalized name.

    Args:
        city_name: City string as found in the custodian file.
        country: Country code compared against ``cities.country_code``.
        conn: Open connection to the GeoNames SQLite database.

    Returns:
        A dict with the keys ``geonames_id``, ``name``, ``ascii_name``,
        ``admin1_code``, ``admin2_code``, ``latitude``, ``longitude``,
        ``feature_code`` and ``population``, or ``None`` when nothing matches
        or the cleaned name is empty.
    """
    columns = (
        'geonames_id', 'name', 'ascii_name', 'admin1_code', 'admin2_code',
        'latitude', 'longitude', 'feature_code', 'population',
    )

    normalized = normalize_city_name(clean_city_name(city_name))
    # An empty name would turn the prefix search into LIKE '%', which matches
    # every settlement and would return the country's largest city.
    if not normalized:
        return None

    translated = CITY_NAME_TRANSLATIONS.get(normalized, normalized)

    # Bind the feature codes as parameters instead of splicing the Python
    # tuple repr into the SQL text.
    feature_codes = tuple(SETTLEMENT_FEATURE_CODES)
    placeholders = ', '.join('?' * len(feature_codes))
    column_sql = ', '.join(columns)
    select_sql = (
        f"SELECT {column_sql} FROM cities "
        f"WHERE country_code = ? AND feature_code IN ({placeholders}) "
    )
    order_sql = " ORDER BY population DESC LIMIT 1"
    exact_sql = select_sql + "AND LOWER(ascii_name) = ?" + order_sql
    prefix_sql = select_sql + "AND LOWER(ascii_name) LIKE ? ESCAPE '\\'" + order_sql

    row = None
    # dict.fromkeys de-duplicates while keeping order, so the same query is
    # not issued twice when no translation applies.
    for search_name in dict.fromkeys((translated, normalized)):
        row = conn.execute(exact_sql, (country, *feature_codes, search_name)).fetchone()
        if row:
            break

    if not row:
        # Escape LIKE wildcards so characters in the name are matched literally.
        like_prefix = (
            normalized.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
        )
        row = conn.execute(
            prefix_sql, (country, *feature_codes, f'{like_prefix}%')
        ).fetchone()

    if not row:
        return None
    return dict(zip(columns, row))
def get_region_code(country: str, admin1_code: Optional[str], admin2_code: Optional[str] = None) -> str:
    """Translate GeoNames admin codes into the region code used in GHCIDs.

    Args:
        country: Country code used to select a table in
            ``COUNTRY_ADMIN_MAPS``.
        admin1_code: GeoNames first-level admin code, if known.
        admin2_code: GeoNames second-level admin code; only consulted for
            ``'BE'``.

    Returns:
        The mapped region code. Countries without a table, and codes missing
        from a table, fall back to the raw ``admin1_code``; ``'XX'`` is
        returned when no usable code is available.
    """
    country_map = COUNTRY_ADMIN_MAPS.get(country)

    # No table for this country: pass the GeoNames code through unchanged.
    if country_map is None:
        return admin1_code or 'XX'

    # The Belgian table is keyed by the admin2 code.
    if country == 'BE' and admin2_code:
        return country_map.get(admin2_code, admin1_code or 'XX')

    if not admin1_code:
        return 'XX'
    return country_map.get(admin1_code, admin1_code)
def _as_dict(value: Any) -> Dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def find_city_in_file(data: Dict) -> Optional[Tuple[str, str]]:
    """Find the city name and country code recorded in a custodian file.

    The country is taken from ``ghcid.location_resolution.country_code``. The
    city is the first non-empty ``city`` found in
    ``original_entry.locations``, or failing that in the top-level
    ``locations``. When no country is known yet, the ``country`` of the
    location entry that supplied the city is used.

    Keys that are missing, explicitly null in the YAML, or of an unexpected
    type are treated as empty instead of raising, so one malformed file does
    not abort a batch run.

    Args:
        data: Parsed YAML document of a custodian file.

    Returns:
        ``(city, country)`` when both are available, otherwise ``None``.
    """
    ghcid = _as_dict(data.get('ghcid'))
    loc_res = _as_dict(ghcid.get('location_resolution'))
    country = loc_res.get('country_code')
    city = None

    # Sources in priority order; the second is only consulted when the first
    # yields no city at all.
    location_sources = (
        _as_dict(data.get('original_entry')).get('locations'),
        data.get('locations'),
    )
    for locations in location_sources:
        if not isinstance(locations, list):
            continue
        for loc in locations:
            if not isinstance(loc, dict) or not loc.get('city'):
                continue
            city = loc['city']
            if not country and 'country' in loc:
                country = loc['country']
            break
        if city:
            break

    if city and country:
        return (city, country)
    return None
def process_file(filepath: Path, conn: sqlite3.Connection, apply: bool) -> bool:
    """Resolve the ``XX`` region of one custodian file from its city name.

    The file is parsed with ``yaml.safe_load``. If its region is still
    unresolved and a city/country pair can be found, the city is looked up in
    GeoNames and the resulting region code replaces ``XX`` in the GHCID, in
    ``ghcid.location_resolution`` and in the file name. A new entry is
    appended to ``ghcid.ghcid_history``; existing entries are kept.

    All checks (GHCID shape, current region still ``XX``, rename target not
    already present) run before the dry-run exit, so a dry run reports the
    same outcome an applying run would produce.

    Args:
        filepath: Path of the custodian YAML file.
        conn: Open connection to the GeoNames SQLite database.
        apply: When false, nothing is written or renamed.

    Returns:
        ``True`` if the region was (or, in a dry run, would be) resolved,
        ``False`` otherwise.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {filepath}: {e}")
        return False
    # Empty documents and non-mapping documents carry nothing to resolve.
    if not data or not isinstance(data, dict):
        return False

    # `or {}` also covers keys that are present but null in the YAML.
    ghcid = data.get('ghcid') or {}
    loc_res = ghcid.get('location_resolution') or {}
    # A null region_code counts as unresolved, same as a missing one.
    if (loc_res.get('region_code') or 'XX') != 'XX':
        return False

    city_info = find_city_in_file(data)
    if not city_info:
        return False
    city_name, country = city_info
    print(f" City: {city_name} ({country})")

    city_data = lookup_city_region(city_name, country, conn)
    if not city_data:
        print(f" No GeoNames match for '{city_name}'")
        return False

    region_code = get_region_code(country, city_data['admin1_code'], city_data.get('admin2_code'))
    if region_code == 'XX':
        print(f" Could not determine region for admin1={city_data['admin1_code']}")
        return False
    print(f" Found: {city_data['name']} -> Region {region_code}")

    # Validate the GHCID; its second dash-separated part is the region.
    current = ghcid.get('ghcid_current') or ''
    parts = current.split('-')
    if len(parts) < 5:
        print(f" Invalid GHCID format: {current}")
        return False
    old_region = parts[1]
    if old_region != 'XX':
        print(f" Region already set: {old_region}")
        return False
    parts[1] = region_code
    new_ghcid = '-'.join(parts)

    old_name = filepath.name
    new_name = old_name.replace(f'{country}-XX-', f'{country}-{region_code}-')
    new_path = filepath.parent / new_name
    # Path.rename can silently replace an existing file; refuse rather than
    # destroy another record (additive-only rule).
    if new_path != filepath and new_path.exists():
        print(f" Target file already exists, skipping: {new_name}")
        return False

    if not apply:
        return True

    # One timestamp for both the resolution record and the history entry.
    timestamp = datetime.now(timezone.utc).isoformat()

    ghcid['ghcid_current'] = new_ghcid
    loc_res['region_code'] = region_code
    loc_res['region_name'] = f"{country}-{region_code}"
    loc_res['geonames_id'] = city_data['geonames_id']
    loc_res['method'] = 'GEONAMES_CITY_LOOKUP'
    loc_res['resolution_timestamp'] = timestamp
    ghcid['location_resolution'] = loc_res

    history = ghcid.get('ghcid_history') or []
    history.append({
        'ghcid': new_ghcid,
        'valid_from': timestamp,
        'reason': f'Region resolved via GeoNames city lookup: XX->{region_code} ({city_data["name"]})'
    })
    ghcid['ghcid_history'] = history
    data['ghcid'] = ghcid

    # Write the updated document in place, then move it to its new name.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    if new_path != filepath:
        filepath.rename(new_path)
        print(f" Renamed: {old_name} -> {new_name}")
    return True
def main():
    """Command-line entry point: resolve ``XX`` regions for custodian files.

    Scans ``CUSTODIAN_DIR`` for ``*.yaml`` files whose name contains
    ``-XX-`` (optionally restricted to one country prefix), keeps those whose
    text contains ``city:``, and runs ``process_file`` on at most ``--limit``
    of them. Without ``--apply`` the run is a dry run. A summary is printed
    at the end.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Resolve XX region codes using city names in files')
    parser.add_argument('--limit', type=int, default=100, help='Max files to process')
    parser.add_argument('--apply', action='store_true', help='Apply changes (default: dry run)')
    parser.add_argument('--country', help='Filter by country code')
    args = parser.parse_args()

    print("=" * 70)
    print("REGION RESOLUTION FROM FILE CITY NAMES")
    print("=" * 70)
    print(f"Mode: {'APPLYING CHANGES' if args.apply else 'DRY RUN'}")
    print()

    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    # Sorted so that --limit selects the same files on every run.
    xx_files = sorted(
        f for f in CUSTODIAN_DIR.glob('*.yaml')
        if '-XX-' in f.name
        and (not args.country or f.name.startswith(f'{args.country}-'))
    )
    print(f"Found {len(xx_files)} files with XX region codes")

    # Cheap text pre-filter before the files are parsed as YAML.
    files_with_cities = []
    for f in xx_files:
        try:
            content = f.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            # Still best-effort, but report the file instead of hiding it.
            print(f" Skipping unreadable file {f.name}: {e}")
            continue
        if 'city:' in content:
            files_with_cities.append(f)

    batch = files_with_cities[:args.limit]
    print(f"Processing {len(batch)} files with city names")
    print()

    resolved = 0
    renamed = 0
    conn = sqlite3.connect(str(GEONAMES_DB))
    try:
        for f in batch:
            print(f"Processing {f.name}...")
            if process_file(f, conn, args.apply):
                resolved += 1
                # After a rename the original path no longer exists, so this
                # counts actual renames rather than every resolved file.
                if args.apply and not f.exists():
                    renamed += 1
    finally:
        conn.close()

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {len(batch)}")
    print(f"Resolved: {resolved}")
    print(f"Renamed: {renamed}")
if __name__ == '__main__':
main()