#!/usr/bin/env python3 """ Enrich Bulgarian custodian files with proper city codes from GeoNames. Maps Cyrillic city names to ASCII equivalents and resolves admin1 regions. """ import os import re import sqlite3 from pathlib import Path from datetime import datetime, timezone import yaml # Bulgarian Cyrillic to ASCII city name mapping # Based on standard transliteration CYRILLIC_TO_ASCII = { # Major cities found in XXX files 'Самоков': 'Samokov', 'Асеновград': 'Asenovgrad', 'Казанлък': 'Kazanlak', 'Карлово': 'Karlovo', 'Котел': 'Kotel', 'Димитровград': 'Dimitrovgrad', 'Исперих': 'Isperih', 'Панагюрище': 'Panagyurishte', 'Раднево': 'Radnevo', 'Белица': 'Belitsa', 'Гоце Делчев': 'Gotse Delchev', 'Горна Оряховица': 'Gorna Oryahovitsa', 'Якоруда': 'Yakoruda', 'Хаджидимово': 'Hadzhidimovo', 'Генерал Тодоров': 'General Todorov', 'Черноморец': 'Chernomorets', 'Плоски': 'Ploski', 'Плетена': 'Pletena', 'Дюлево': 'Dyulevo', 'Левуново': 'Levunovo', 'Гълъбово': 'Galabovo', 'Абланица': 'Ablanitsa', # Additional common cities 'София': 'Sofia', 'Пловдив': 'Plovdiv', 'Варна': 'Varna', 'Бургас': 'Burgas', 'Русе': 'Ruse', 'Стара Загора': 'Stara Zagora', 'Плевен': 'Pleven', 'Сливен': 'Sliven', 'Добрич': 'Dobrich', 'Шумен': 'Shumen', 'Перник': 'Pernik', 'Хасково': 'Haskovo', 'Благоевград': 'Blagoevgrad', 'Велико Търново': 'Veliko Tarnovo', 'Враца': 'Vratsa', 'Габрово': 'Gabrovo', 'Пазарджик': 'Pazardzhik', 'Ямбол': 'Yambol', 'Кърджали': 'Kardzhali', 'Монтана': 'Montana', 'Разград': 'Razgrad', 'Силистра': 'Silistra', 'Смолян': 'Smolyan', 'Търговище': 'Targovishte', 'Кюстендил': 'Kyustendil', 'Ловеч': 'Lovech', 'Видин': 'Vidin', } # Bulgarian admin1 GeoNames code to ISO 3166-2:BG mapping ADMIN1_TO_ISO = { '38': 'BLG', # Blagoevgrad '39': 'BGS', # Burgas '40': 'DOB', # Dobrich '41': 'GAB', # Gabrovo '42': 'SOF', # Sofia-Capital (also SFO for city) '43': 'KHO', # Haskovo (officially HKV but using KHO) '44': 'KRZ', # Kardzhali '45': 'KNL', # Kyustendil '46': 'LOV', # Lovech '47': 'MON', # Montana '48': 'PAZ', # Pazardzhik '49': 'PER', # Pernik '50': 'PVN', # Pleven '51': 'PDV', # Plovdiv '52': 'RAZ', # Razgrad '53': 'RSE', # Ruse '54': 'SHU', # Shumen '55': 'SLS', # Silistra '56': 'SLV', # Sliven '57': 'SML', # Smolyan '58': 'SFO', # Sofia (Province) '59': 'SZR', # Stara Zagora '60': 'TGV', # Targovishte '61': 'VAR', # Varna '62': 'VTR', # Veliko Tarnovo '63': 'VID', # Vidin '64': 'VRC', # Vratsa '65': 'JAM', # Yambol } def get_city_code(city_name: str) -> str: """Generate 3-letter city code from city name.""" # Clean the name name = city_name.strip() words = name.split() if len(words) == 1: # Single word: first 3 letters return name[:3].upper() elif len(words) == 2: # Two words: first letter of each + first letter of second word return (words[0][0] + words[1][:2]).upper() else: # Multiple words: first letter of each (up to 3) return ''.join(w[0] for w in words[:3]).upper() def transliterate_cyrillic(text: str) -> str: """Basic Cyrillic to Latin transliteration.""" # Check direct mapping first if text in CYRILLIC_TO_ASCII: return CYRILLIC_TO_ASCII[text] # Basic character-by-character transliteration cyrillic_map = { 'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh', 'щ': 'sht', 'ъ': 'a', 'ь': '', 'ю': 'yu', 'я': 'ya', 'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E', 'Ж': 'Zh', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K', 'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R', 'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'H', 'Ц': 'Ts', 'Ч': 'Ch', 'Ш': 'Sh', 'Щ': 'Sht', 'Ъ': 'A', 'Ь': '', 'Ю': 'Yu', 'Я': 'Ya', } result = [] for char in text: if char in cyrillic_map: result.append(cyrillic_map[char]) else: result.append(char) return ''.join(result) def lookup_city_in_geonames(conn: sqlite3.Connection, city_name: str) -> dict | None: """Look up city in GeoNames database.""" cursor = conn.cursor() # First try direct ASCII lookup ascii_name = CYRILLIC_TO_ASCII.get(city_name) or transliterate_cyrillic(city_name) # Try exact match first cursor.execute(""" SELECT name, ascii_name, admin1_code, admin1_name, geonames_id, latitude, longitude, population, feature_code FROM cities WHERE country_code='BG' AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') AND (ascii_name = ? OR name = ?) ORDER BY population DESC LIMIT 1 """, (ascii_name, ascii_name)) row = cursor.fetchone() if row: return { 'name': row[0], 'ascii_name': row[1], 'admin1_code': row[2], 'admin1_name': row[3], 'geonames_id': row[4], 'latitude': row[5], 'longitude': row[6], 'population': row[7], 'feature_code': row[8], } # Try fuzzy match with LIKE cursor.execute(""" SELECT name, ascii_name, admin1_code, admin1_name, geonames_id, latitude, longitude, population, feature_code FROM cities WHERE country_code='BG' AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') AND (ascii_name LIKE ? OR name LIKE ?) ORDER BY population DESC LIMIT 1 """, (f'{ascii_name}%', f'{ascii_name}%')) row = cursor.fetchone() if row: return { 'name': row[0], 'ascii_name': row[1], 'admin1_code': row[2], 'admin1_name': row[3], 'geonames_id': row[4], 'latitude': row[5], 'longitude': row[6], 'population': row[7], 'feature_code': row[8], } return None def process_file(filepath: Path, conn: sqlite3.Connection, dry_run: bool = False) -> dict: """Process a single Bulgarian custodian file.""" result = { 'file': str(filepath), 'status': 'skipped', 'old_ghcid': None, 'new_ghcid': None, 'city_cyrillic': None, 'city_ascii': None, 'error': None, } try: with open(filepath, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) except Exception as e: result['status'] = 'error' result['error'] = f'Failed to load YAML: {e}' return result if not data: result['status'] = 'error' result['error'] = 'Empty YAML file' return result # Get current GHCID ghcid_data = data.get('ghcid', {}) old_ghcid = ghcid_data.get('ghcid_current', '') result['old_ghcid'] = old_ghcid # Check if it's a BG-XX-XXX file if not old_ghcid.startswith('BG-XX-XXX-'): result['status'] = 'skipped' result['error'] = 'Not a BG-XX-XXX file' return result # Extract city from original_entry or locations city_cyrillic = None if 'original_entry' in data and 'locations' in data['original_entry']: locations = data['original_entry']['locations'] if locations and isinstance(locations, list) and len(locations) > 0: city_cyrillic = locations[0].get('city') if not city_cyrillic: result['status'] = 'error' result['error'] = 'No city found in original_entry' return result result['city_cyrillic'] = city_cyrillic # Look up city in GeoNames city_info = lookup_city_in_geonames(conn, city_cyrillic) if not city_info: result['status'] = 'error' result['error'] = f'City not found in GeoNames: {city_cyrillic}' return result result['city_ascii'] = city_info['ascii_name'] # Get region code admin1_code = city_info['admin1_code'] region_code = ADMIN1_TO_ISO.get(admin1_code, 'XX') # Generate city code city_code = get_city_code(city_info['ascii_name']) # Build new GHCID # Extract type and abbreviation from old GHCID # Format: BG-XX-XXX-{type}-{abbrev} parts = old_ghcid.split('-') if len(parts) >= 5: inst_type = parts[3] abbreviation = '-'.join(parts[4:]) # May contain hyphens else: result['status'] = 'error' result['error'] = f'Invalid GHCID format: {old_ghcid}' return result new_ghcid = f'BG-{region_code}-{city_code}-{inst_type}-{abbreviation}' result['new_ghcid'] = new_ghcid if dry_run: result['status'] = 'would_update' return result # Update the GHCID data timestamp = datetime.now(timezone.utc).isoformat() # Update ghcid section data['ghcid']['ghcid_current'] = new_ghcid data['ghcid']['location_resolution'] = { 'method': 'GEONAMES_LOOKUP', 'country_code': 'BG', 'region_code': region_code, 'region_name': city_info['admin1_name'], 'city_code': city_code, 'city_name': city_info['ascii_name'], 'city_name_cyrillic': city_cyrillic, 'geonames_id': city_info['geonames_id'], 'feature_code': city_info['feature_code'], 'resolution_date': timestamp, } # Add to GHCID history if 'ghcid_history' not in data['ghcid']: data['ghcid']['ghcid_history'] = [] # Mark old GHCID as ended for entry in data['ghcid']['ghcid_history']: if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'): entry['valid_to'] = timestamp # Add new GHCID entry data['ghcid']['ghcid_history'].append({ 'ghcid': new_ghcid, 'ghcid_numeric': data['ghcid'].get('ghcid_numeric'), 'valid_from': timestamp, 'reason': f'City resolved via GeoNames: {city_cyrillic} → {city_info["ascii_name"]} ({region_code})', }) # Update identifiers if 'identifiers' in data: for identifier in data['identifiers']: if identifier.get('identifier_scheme') == 'GHCID': identifier['identifier_value'] = new_ghcid # Calculate new file path new_filename = f'{new_ghcid}.yaml' new_filepath = filepath.parent / new_filename # Write updated data with open(filepath, 'w', encoding='utf-8') as f: yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False) # Rename file if filepath != new_filepath and not new_filepath.exists(): filepath.rename(new_filepath) result['new_file'] = str(new_filepath) elif new_filepath.exists() and filepath != new_filepath: result['status'] = 'collision' result['error'] = f'Target file already exists: {new_filepath}' return result result['status'] = 'updated' return result def main(): import argparse parser = argparse.ArgumentParser(description='Enrich Bulgarian custodian files with GeoNames data') parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes') parser.add_argument('--limit', type=int, help='Limit number of files to process') args = parser.parse_args() # Find all Bulgarian XXX files custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian') geonames_db = Path('/Users/kempersc/apps/glam/data/reference/geonames.db') if not geonames_db.exists(): print(f'ERROR: GeoNames database not found: {geonames_db}') return files = sorted(custodian_dir.glob('BG-XX-XXX-*.yaml')) if args.limit: files = files[:args.limit] print(f'Found {len(files)} Bulgarian XXX files') print(f'Dry run: {args.dry_run}') print() # Connect to GeoNames database conn = sqlite3.connect(str(geonames_db)) stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0} errors = [] for filepath in files: result = process_file(filepath, conn, dry_run=args.dry_run) stats[result['status']] = stats.get(result['status'], 0) + 1 if result['status'] == 'updated' or result['status'] == 'would_update': print(f"✓ {result['city_cyrillic']} → {result['city_ascii']}: {result['old_ghcid']} → {result['new_ghcid']}") elif result['status'] == 'error': print(f"✗ {filepath.name}: {result['error']}") errors.append(result) elif result['status'] == 'collision': print(f"⚠ {filepath.name}: {result['error']}") conn.close() print() print('=' * 60) print('Summary:') print(f" Updated: {stats.get('updated', 0)}") print(f" Would update: {stats.get('would_update', 0)}") print(f" Errors: {stats.get('error', 0)}") print(f" Collisions: {stats.get('collision', 0)}") print(f" Skipped: {stats.get('skipped', 0)}") if errors: print() print('Errors:') for err in errors: print(f" - {err['file']}: {err['error']}") if __name__ == '__main__': main()