glam/scripts/enrich_bulgarian_cities.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

424 lines
14 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Enrich Bulgarian custodian files with proper city codes from GeoNames.
Maps Cyrillic city names to ASCII equivalents and resolves admin1 regions.
"""
import os
import re
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
import yaml
# Bulgarian Cyrillic city name -> ASCII city name lookup table.
# transliterate_cyrillic() and lookup_city_in_geonames() consult this table
# before falling back to character-by-character transliteration, so an entry
# here overrides the mechanical result (e.g. 'София' maps to 'Sofia', whereas
# the per-character map would yield 'Sofiya').
CYRILLIC_TO_ASCII = {
    # Major cities found in XXX files
    # (presumably the unresolved BG-XX-XXX custodian files - confirm)
    'Самоков': 'Samokov',
    'Асеновград': 'Asenovgrad',
    'Казанлък': 'Kazanlak',
    'Карлово': 'Karlovo',
    'Котел': 'Kotel',
    'Димитровград': 'Dimitrovgrad',
    'Исперих': 'Isperih',
    'Панагюрище': 'Panagyurishte',
    'Раднево': 'Radnevo',
    'Белица': 'Belitsa',
    'Гоце Делчев': 'Gotse Delchev',
    'Горна Оряховица': 'Gorna Oryahovitsa',
    'Якоруда': 'Yakoruda',
    'Хаджидимово': 'Hadzhidimovo',
    'Генерал Тодоров': 'General Todorov',
    'Черноморец': 'Chernomorets',
    'Плоски': 'Ploski',
    'Плетена': 'Pletena',
    'Дюлево': 'Dyulevo',
    'Левуново': 'Levunovo',
    'Гълъбово': 'Galabovo',
    'Абланица': 'Ablanitsa',
    # Additional common cities
    'София': 'Sofia',
    'Пловдив': 'Plovdiv',
    'Варна': 'Varna',
    'Бургас': 'Burgas',
    'Русе': 'Ruse',
    'Стара Загора': 'Stara Zagora',
    'Плевен': 'Pleven',
    'Сливен': 'Sliven',
    'Добрич': 'Dobrich',
    'Шумен': 'Shumen',
    'Перник': 'Pernik',
    'Хасково': 'Haskovo',
    'Благоевград': 'Blagoevgrad',
    'Велико Търново': 'Veliko Tarnovo',
    'Враца': 'Vratsa',
    'Габрово': 'Gabrovo',
    'Пазарджик': 'Pazardzhik',
    'Ямбол': 'Yambol',
    'Кърджали': 'Kardzhali',
    'Монтана': 'Montana',
    'Разград': 'Razgrad',
    'Силистра': 'Silistra',
    'Смолян': 'Smolyan',
    'Търговище': 'Targovishte',
    'Кюстендил': 'Kyustendil',
    'Ловеч': 'Lovech',
    'Видин': 'Vidin',
}
# GeoNames admin1 code for Bulgaria -> three-letter region code.
# process_file() uses the value as the region segment of the rebuilt GHCID and
# falls back to 'XX' when an admin1 code is missing from this table.
# NOTE(review): the name says "ISO", but official ISO 3166-2:BG subdivision
# codes are two-digit numbers (BG-01 ... BG-28); these letter codes look like a
# project-specific convention - confirm before describing them as ISO codes.
ADMIN1_TO_ISO = {
    '38': 'BLG',  # Blagoevgrad
    '39': 'BGS',  # Burgas
    '40': 'DOB',  # Dobrich
    '41': 'GAB',  # Gabrovo
    '42': 'SOF',  # Sofia-Capital (also SFO for city)
    '43': 'KHO',  # Haskovo (officially HKV but using KHO)
    '44': 'KRZ',  # Kardzhali
    '45': 'KNL',  # Kyustendil
    '46': 'LOV',  # Lovech
    '47': 'MON',  # Montana
    '48': 'PAZ',  # Pazardzhik
    '49': 'PER',  # Pernik
    '50': 'PVN',  # Pleven
    '51': 'PDV',  # Plovdiv
    '52': 'RAZ',  # Razgrad
    '53': 'RSE',  # Ruse
    '54': 'SHU',  # Shumen
    '55': 'SLS',  # Silistra
    '56': 'SLV',  # Sliven
    '57': 'SML',  # Smolyan
    '58': 'SFO',  # Sofia (Province)
    '59': 'SZR',  # Stara Zagora
    '60': 'TGV',  # Targovishte
    '61': 'VAR',  # Varna
    '62': 'VTR',  # Veliko Tarnovo
    '63': 'VID',  # Vidin
    '64': 'VRC',  # Vratsa
    '65': 'JAM',  # Yambol
}
def get_city_code(city_name: str) -> str:
    """Derive an upper-case city code of at most three characters.

    The name is stripped and split on whitespace, then:

    * one word    -> its first three characters;
    * two words   -> first character of the first word followed by the
      first two characters of the second word;
    * otherwise   -> the initial of each of the first three words
      (an empty name therefore yields an empty string).
    """
    cleaned = city_name.strip()
    tokens = cleaned.split()
    token_count = len(tokens)

    if token_count == 1:
        code = cleaned[:3]
    elif token_count == 2:
        head, tail = tokens
        code = head[0] + tail[:2]
    else:
        code = ''.join(token[0] for token in tokens[:3])

    return code.upper()
def transliterate_cyrillic(text: str) -> str:
    """Convert Bulgarian Cyrillic text to a Latin-script equivalent.

    A whole-string hit in ``CYRILLIC_TO_ASCII`` wins outright. Otherwise each
    character is replaced individually using a fixed Bulgarian letter table;
    characters that are not in the table (Latin letters, digits, spaces,
    punctuation, ...) are copied through unchanged.
    """
    curated = CYRILLIC_TO_ASCII.get(text)
    if curated is not None:
        return curated

    lower_letters = {
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd',
        'е': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y',
        'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
        'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u',
        'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh',
        'щ': 'sht', 'ъ': 'a', 'ь': '', 'ю': 'yu', 'я': 'ya',
    }
    # Upper-case letters map to the capitalised form of the same Latin
    # sequence ('Ж' -> 'Zh', 'Щ' -> 'Sht', 'Ь' -> '').
    letter_table = dict(lower_letters)
    for cyrillic, latin in lower_letters.items():
        letter_table[cyrillic.upper()] = latin.capitalize()

    return ''.join(letter_table.get(symbol, symbol) for symbol in text)
def lookup_city_in_geonames(conn: sqlite3.Connection, city_name: str) -> dict | None:
    """Find the best GeoNames match for a Bulgarian city name.

    The name is trimmed, converted to ASCII (curated table first, then
    character-by-character transliteration) and searched in the ``cities``
    table, restricted to ``country_code='BG'`` and populated-place feature
    codes. Two attempts are made, in order:

    1. exact match on ``ascii_name`` or ``name``;
    2. prefix match (``LIKE 'value%'``) on the same two columns.

    Each attempt returns the most populous matching row.

    Args:
        conn: Open connection to the GeoNames SQLite database.
        city_name: City name, typically in Cyrillic.

    Returns:
        A dict with keys ``name``, ``ascii_name``, ``admin1_code``,
        ``admin1_name``, ``geonames_id``, ``latitude``, ``longitude``,
        ``population`` and ``feature_code``; or ``None`` when the name is
        empty or nothing matches.
    """
    columns = (
        'name', 'ascii_name', 'admin1_code', 'admin1_name', 'geonames_id',
        'latitude', 'longitude', 'population', 'feature_code',
    )

    cleaned = city_name.strip() if isinstance(city_name, str) else ''
    if not cleaned:
        # An empty name would turn the prefix query into LIKE '%', which
        # matches every row and would return the largest city in Bulgaria.
        return None

    ascii_name = CYRILLIC_TO_ASCII.get(cleaned) or transliterate_cyrillic(cleaned)
    if not ascii_name:
        return None

    # Escape LIKE metacharacters so the name is only ever treated as a
    # literal prefix.
    like_prefix = (
        ascii_name.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_') + '%'
    )

    # (predicate, bound value) pairs, tried in order; only constant SQL text
    # is interpolated, user data always goes through placeholders.
    attempts = (
        ("ascii_name = ? OR name = ?", ascii_name),
        ("ascii_name LIKE ? ESCAPE '\\' OR name LIKE ? ESCAPE '\\'", like_prefix),
    )
    query_template = (
        "SELECT " + ', '.join(columns) + " "
        "FROM cities "
        "WHERE country_code='BG' "
        "AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') "
        "AND ({predicate}) "
        "ORDER BY population DESC "
        "LIMIT 1"
    )

    cursor = conn.cursor()
    try:
        for predicate, value in attempts:
            cursor.execute(query_template.format(predicate=predicate), (value, value))
            row = cursor.fetchone()
            if row:
                return dict(zip(columns, row))
    finally:
        cursor.close()
    return None
def process_file(filepath: Path, conn: sqlite3.Connection, dry_run: bool = False) -> dict:
    """Resolve the city of one ``BG-XX-XXX-*`` custodian file and rewrite its GHCID.

    The city name is read from ``original_entry.locations[0].city``, looked up
    in GeoNames, and used to build a new GHCID of the form
    ``BG-{region}-{city}-{type}-{abbrev}``. Unless ``dry_run`` is set, the YAML
    is updated in place (current GHCID, ``location_resolution``, GHCID history,
    GHCID identifiers) and the file is renamed to ``{new_ghcid}.yaml``.

    The target-path collision check happens *before* anything is written, so a
    collision leaves the source file untouched and still eligible for a later
    run. A dry run reports collisions as well.

    Args:
        filepath: Path of the custodian YAML file.
        conn: Open connection to the GeoNames SQLite database.
        dry_run: When true, compute the outcome but do not modify any file.

    Returns:
        A dict with keys ``file``, ``status``, ``old_ghcid``, ``new_ghcid``,
        ``city_cyrillic``, ``city_ascii`` and ``error`` (plus ``new_file``
        after a rename). ``status`` is one of ``'skipped'``, ``'error'``,
        ``'collision'``, ``'would_update'`` or ``'updated'``.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city_cyrillic': None,
        'city_ascii': None,
        'error': None,
    }

    def finish(status: str, message: str) -> dict:
        """Record a terminal status and message on the result and return it."""
        result['status'] = status
        result['error'] = message
        return result

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Deliberately broad: this is the per-file boundary, and one
        # unreadable file must not abort the whole batch.
        return finish('error', f'Failed to load YAML: {e}')

    if not data:
        return finish('error', 'Empty YAML file')
    if not isinstance(data, dict):
        return finish('error', 'Top-level YAML value is not a mapping')

    # Get current GHCID (tolerating a missing or null 'ghcid' section).
    ghcid_data = data.get('ghcid')
    if not isinstance(ghcid_data, dict):
        ghcid_data = {}
    old_ghcid = ghcid_data.get('ghcid_current') or ''
    if not isinstance(old_ghcid, str):
        old_ghcid = str(old_ghcid)
    result['old_ghcid'] = old_ghcid

    # Only unresolved Bulgarian records are in scope.
    if not old_ghcid.startswith('BG-XX-XXX-'):
        return finish('skipped', 'Not a BG-XX-XXX file')

    # Extract the city from the first location of original_entry.
    city_cyrillic = None
    original_entry = data.get('original_entry')
    locations = original_entry.get('locations') if isinstance(original_entry, dict) else None
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        city_cyrillic = locations[0].get('city')
    if not city_cyrillic or not isinstance(city_cyrillic, str):
        return finish('error', 'No city found in original_entry')
    result['city_cyrillic'] = city_cyrillic

    # Look up city in GeoNames.
    city_info = lookup_city_in_geonames(conn, city_cyrillic)
    if not city_info:
        return finish('error', f'City not found in GeoNames: {city_cyrillic}')
    city_ascii = city_info['ascii_name']
    result['city_ascii'] = city_ascii

    # Unknown admin1 codes keep the 'XX' placeholder region.
    region_code = ADMIN1_TO_ISO.get(city_info['admin1_code'], 'XX')
    city_code = get_city_code(city_ascii)

    # Old format: BG-XX-XXX-{type}-{abbrev}; the abbreviation may itself
    # contain hyphens, so everything after the type is re-joined.
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        return finish('error', f'Invalid GHCID format: {old_ghcid}')
    inst_type = parts[3]
    abbreviation = '-'.join(parts[4:])

    new_ghcid = f'BG-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    # Check the rename target before touching the source file; otherwise a
    # collision would leave new content under the old filename.
    new_filepath = filepath.parent / f'{new_ghcid}.yaml'
    if new_filepath != filepath and new_filepath.exists():
        return finish('collision', f'Target file already exists: {new_filepath}')

    if dry_run:
        result['status'] = 'would_update'
        return result

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update ghcid section (ghcid_data is the same object as data['ghcid']).
    ghcid_data['ghcid_current'] = new_ghcid
    ghcid_data['location_resolution'] = {
        'method': 'GEONAMES_LOOKUP',
        'country_code': 'BG',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_ascii,
        'city_name_cyrillic': city_cyrillic,
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'resolution_date': timestamp,
    }

    # Close any open history entry for the old GHCID, then append the new one.
    history = ghcid_data.get('ghcid_history')
    if not isinstance(history, list):
        history = []
        ghcid_data['ghcid_history'] = history
    for entry in history:
        if isinstance(entry, dict) and entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp
    history.append({
        'ghcid': new_ghcid,
        'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'City resolved via GeoNames: {city_cyrillic} -> {city_ascii} ({region_code})',
    })

    # Keep GHCID-scheme identifiers in sync with the new value.
    for identifier in data.get('identifiers') or []:
        if isinstance(identifier, dict) and identifier.get('identifier_scheme') == 'GHCID':
            identifier['identifier_value'] = new_ghcid

    # Write updated data, then move the file to its new name.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if new_filepath != filepath:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)

    result['status'] = 'updated'
    return result
def main():
    """Command-line entry point: enrich every ``BG-XX-XXX-*.yaml`` custodian file.

    Options:
        --dry-run        Report what would change without modifying files.
        --limit N        Process at most N files (in sorted filename order).
        --custodian-dir  Directory holding the custodian YAML files.
        --geonames-db    Path to the GeoNames SQLite database.

    The two path options default to the locations that were previously
    hard-coded, so existing invocations behave as before. A per-file line is
    printed for updates, errors and collisions, followed by a summary and the
    collected error list.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Enrich Bulgarian custodian files with GeoNames data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing the custodian YAML files',
    )
    parser.add_argument(
        '--geonames-db',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/reference/geonames.db'),
        help='Path to the GeoNames SQLite database',
    )
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    geonames_db = args.geonames_db

    if not geonames_db.exists():
        print(f'ERROR: GeoNames database not found: {geonames_db}')
        return

    # Find all Bulgarian XXX files
    files = sorted(custodian_dir.glob('BG-XX-XXX-*.yaml'))
    if args.limit is not None:
        # "is not None" so that an explicit --limit 0 really means zero files.
        files = files[:args.limit]

    print(f'Found {len(files)} Bulgarian XXX files')
    print(f'Dry run: {args.dry_run}')
    print()

    stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    errors = []

    # Connect to GeoNames database; always release it, even if a file blows up.
    conn = sqlite3.connect(str(geonames_db))
    try:
        for filepath in files:
            result = process_file(filepath, conn, dry_run=args.dry_run)
            status = result['status']
            stats[status] = stats.get(status, 0) + 1

            if status in ('updated', 'would_update'):
                print(
                    f"{result['city_cyrillic']} -> {result['city_ascii']}: "
                    f"{result['old_ghcid']} -> {result['new_ghcid']}"
                )
            elif status == 'error':
                print(f"{filepath.name}: {result['error']}")
                errors.append(result)
            elif status == 'collision':
                print(f"{filepath.name}: {result['error']}")
    finally:
        conn.close()

    print()
    print('=' * 60)
    print('Summary:')
    print(f" Updated: {stats.get('updated', 0)}")
    print(f" Would update: {stats.get('would_update', 0)}")
    print(f" Errors: {stats.get('error', 0)}")
    print(f" Collisions: {stats.get('collision', 0)}")
    print(f" Skipped: {stats.get('skipped', 0)}")

    if errors:
        print()
        print('Errors:')
        for err in errors:
            print(f" - {err['file']}: {err['error']}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()