Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
424 lines
14 KiB
Python
Executable file
424 lines
14 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Enrich Bulgarian custodian files with proper city codes from GeoNames.
|
||
Maps Cyrillic city names to ASCII equivalents and resolves admin1 regions.
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
import sqlite3
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
|
||
import yaml
|
||
|
||
# Bulgarian Cyrillic to ASCII city name mapping, based on standard
# transliteration.  Serves as a curated fast path: exact-name hits here
# avoid the character-by-character fallback in transliterate_cyrillic().
CYRILLIC_TO_ASCII = {
    # Major cities found in XXX files
    'Самоков': 'Samokov',
    'Асеновград': 'Asenovgrad',
    'Казанлък': 'Kazanlak',
    'Карлово': 'Karlovo',
    'Котел': 'Kotel',
    'Димитровград': 'Dimitrovgrad',
    'Исперих': 'Isperih',
    'Панагюрище': 'Panagyurishte',
    'Раднево': 'Radnevo',
    'Белица': 'Belitsa',
    'Гоце Делчев': 'Gotse Delchev',
    'Горна Оряховица': 'Gorna Oryahovitsa',
    'Якоруда': 'Yakoruda',
    'Хаджидимово': 'Hadzhidimovo',
    'Генерал Тодоров': 'General Todorov',
    'Черноморец': 'Chernomorets',
    'Плоски': 'Ploski',
    'Плетена': 'Pletena',
    'Дюлево': 'Dyulevo',
    'Левуново': 'Levunovo',
    'Гълъбово': 'Galabovo',
    'Абланица': 'Ablanitsa',
    # Additional common cities
    'София': 'Sofia',
    'Пловдив': 'Plovdiv',
    'Варна': 'Varna',
    'Бургас': 'Burgas',
    'Русе': 'Ruse',
    'Стара Загора': 'Stara Zagora',
    'Плевен': 'Pleven',
    'Сливен': 'Sliven',
    'Добрич': 'Dobrich',
    'Шумен': 'Shumen',
    'Перник': 'Pernik',
    'Хасково': 'Haskovo',
    'Благоевград': 'Blagoevgrad',
    'Велико Търново': 'Veliko Tarnovo',
    'Враца': 'Vratsa',
    'Габрово': 'Gabrovo',
    'Пазарджик': 'Pazardzhik',
    'Ямбол': 'Yambol',
    'Кърджали': 'Kardzhali',
    'Монтана': 'Montana',
    'Разград': 'Razgrad',
    'Силистра': 'Silistra',
    'Смолян': 'Smolyan',
    'Търговище': 'Targovishte',
    'Кюстендил': 'Kyustendil',
    'Ловеч': 'Lovech',
    'Видин': 'Vidin',
}
|
||
|
||
# Bulgarian admin1 GeoNames code -> ISO 3166-2:BG subdivision code.
# GeoNames assigns numeric admin1 codes ('38'..'65') to the 28 Bulgarian
# provinces; these map onto the three-letter ISO subdivision codes used
# in GHCIDs.  Unknown codes fall back to 'XX' at the call site.
ADMIN1_TO_ISO = {
    '38': 'BLG',  # Blagoevgrad
    '39': 'BGS',  # Burgas
    '40': 'DOB',  # Dobrich
    '41': 'GAB',  # Gabrovo
    '42': 'SOF',  # Sofia-Capital (also SFO for city)
    '43': 'KHO',  # Haskovo (officially HKV but using KHO)
    '44': 'KRZ',  # Kardzhali
    '45': 'KNL',  # Kyustendil
    '46': 'LOV',  # Lovech
    '47': 'MON',  # Montana
    '48': 'PAZ',  # Pazardzhik
    '49': 'PER',  # Pernik
    '50': 'PVN',  # Pleven
    '51': 'PDV',  # Plovdiv
    '52': 'RAZ',  # Razgrad
    '53': 'RSE',  # Ruse
    '54': 'SHU',  # Shumen
    '55': 'SLS',  # Silistra
    '56': 'SLV',  # Sliven
    '57': 'SML',  # Smolyan
    '58': 'SFO',  # Sofia (Province)
    '59': 'SZR',  # Stara Zagora
    '60': 'TGV',  # Targovishte
    '61': 'VAR',  # Varna
    '62': 'VTR',  # Veliko Tarnovo
    '63': 'VID',  # Vidin
    '64': 'VRC',  # Vratsa
    '65': 'JAM',  # Yambol
}
|
||
|
||
|
||
def get_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from an ASCII city name.

    Rules (whitespace is stripped/normalized first):
      * one word:      first three letters          ("Sofia"        -> "SOF")
      * two words:     first letter of word 1 plus
                       first TWO letters of word 2  ("Stara Zagora" -> "SZA")
      * three or more: initial of each of the
                       first three words            ("A B C D"      -> "ABC")

    Returns '' for an empty/whitespace-only name.
    """
    words = city_name.strip().split()
    if not words:
        # Empty input yields an empty code (matches prior fall-through).
        return ''
    if len(words) == 1:
        return words[0][:3].upper()
    if len(words) == 2:
        # NOTE: first letter of word 1 + first two letters of word 2
        # (the previous comment incorrectly described this rule).
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
|
||
|
||
|
||
def transliterate_cyrillic(text: str) -> str:
    """Basic Cyrillic-to-Latin transliteration.

    Known full names are resolved through the curated CYRILLIC_TO_ASCII
    table; anything else falls back to per-character transliteration,
    with unmapped characters passed through unchanged.
    """
    known = CYRILLIC_TO_ASCII.get(text)
    if known is not None:
        return known

    char_map = {
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd',
        'е': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y',
        'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
        'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u',
        'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh',
        'щ': 'sht', 'ъ': 'a', 'ь': '', 'ю': 'yu', 'я': 'ya',
        'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D',
        'Е': 'E', 'Ж': 'Zh', 'З': 'Z', 'И': 'I', 'Й': 'Y',
        'К': 'K', 'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O',
        'П': 'P', 'Р': 'R', 'С': 'S', 'Т': 'T', 'У': 'U',
        'Ф': 'F', 'Х': 'H', 'Ц': 'Ts', 'Ч': 'Ch', 'Ш': 'Sh',
        'Щ': 'Sht', 'Ъ': 'A', 'Ь': '', 'Ю': 'Yu', 'Я': 'Ya',
    }
    return ''.join(char_map.get(ch, ch) for ch in text)
|
||
|
||
|
||
def lookup_city_in_geonames(conn: sqlite3.Connection, city_name: str) -> dict | None:
    """Look up a Bulgarian city in the GeoNames SQLite database.

    The (possibly Cyrillic) name is transliterated to ASCII, then matched
    against the `cities` table: an exact name/ascii_name match is tried
    first, then a prefix (LIKE) match as a fuzzy fallback.  When several
    rows match, the most populous populated place wins.

    Returns a dict with keys name, ascii_name, admin1_code, admin1_name,
    geonames_id, latitude, longitude, population, feature_code — or None
    when nothing matches.
    """
    ascii_name = CYRILLIC_TO_ASCII.get(city_name) or transliterate_cyrillic(city_name)

    # Shared query skeleton; only the name-matching clause differs between
    # the exact and the prefix attempt (clause text is code-controlled,
    # values are bound as parameters).
    query_template = """
        SELECT name, ascii_name, admin1_code, admin1_name, geonames_id,
               latitude, longitude, population, feature_code
        FROM cities
        WHERE country_code='BG'
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND ({clause})
        ORDER BY population DESC
        LIMIT 1
    """
    columns = ('name', 'ascii_name', 'admin1_code', 'admin1_name',
               'geonames_id', 'latitude', 'longitude', 'population',
               'feature_code')

    cursor = conn.cursor()
    attempts = (
        ('ascii_name = ? OR name = ?', ascii_name),
        ('ascii_name LIKE ? OR name LIKE ?', f'{ascii_name}%'),
    )
    for clause, param in attempts:
        cursor.execute(query_template.format(clause=clause), (param, param))
        row = cursor.fetchone()
        if row:
            return dict(zip(columns, row))

    return None
|
||
|
||
|
||
def process_file(filepath: Path, conn: sqlite3.Connection, dry_run: bool = False) -> dict:
    """Process a single Bulgarian custodian YAML file.

    Resolves the Cyrillic city name recorded in ``original_entry.locations``
    against GeoNames, replaces the placeholder ``BG-XX-XXX-...`` GHCID with
    real region/city codes, records the change in the GHCID history, and
    renames the file to match the new GHCID.

    Returns a result dict with keys: file, status ('updated', 'would_update',
    'skipped', 'error', 'collision'), old_ghcid, new_ghcid, city_cyrillic,
    city_ascii, error — plus new_file when the file was renamed.
    """
    result = {
        'file': str(filepath),
        'status': 'skipped',
        'old_ghcid': None,
        'new_ghcid': None,
        'city_cyrillic': None,
        'city_ascii': None,
        'error': None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        result['status'] = 'error'
        result['error'] = f'Failed to load YAML: {e}'
        return result

    if not data:
        result['status'] = 'error'
        result['error'] = 'Empty YAML file'
        return result

    # Only placeholder GHCIDs (unknown region/city) are eligible.
    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    result['old_ghcid'] = old_ghcid

    if not old_ghcid.startswith('BG-XX-XXX-'):
        result['status'] = 'skipped'
        result['error'] = 'Not a BG-XX-XXX file'
        return result

    # The Cyrillic city name lives in the imported original_entry record.
    city_cyrillic = None
    if 'original_entry' in data and 'locations' in data['original_entry']:
        locations = data['original_entry']['locations']
        if locations and isinstance(locations, list) and len(locations) > 0:
            city_cyrillic = locations[0].get('city')

    if not city_cyrillic:
        result['status'] = 'error'
        result['error'] = 'No city found in original_entry'
        return result

    result['city_cyrillic'] = city_cyrillic

    city_info = lookup_city_in_geonames(conn, city_cyrillic)
    if not city_info:
        result['status'] = 'error'
        result['error'] = f'City not found in GeoNames: {city_cyrillic}'
        return result

    result['city_ascii'] = city_info['ascii_name']

    # Map the GeoNames admin1 code to ISO 3166-2:BG; 'XX' when unknown.
    admin1_code = city_info['admin1_code']
    region_code = ADMIN1_TO_ISO.get(admin1_code, 'XX')
    city_code = get_city_code(city_info['ascii_name'])

    # GHCID format: BG-{region}-{city}-{type}-{abbrev}; the abbreviation
    # itself may contain hyphens, so rejoin everything past index 4.
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        result['status'] = 'error'
        result['error'] = f'Invalid GHCID format: {old_ghcid}'
        return result
    inst_type = parts[3]
    abbreviation = '-'.join(parts[4:])

    new_ghcid = f'BG-{region_code}-{city_code}-{inst_type}-{abbreviation}'
    result['new_ghcid'] = new_ghcid

    if dry_run:
        result['status'] = 'would_update'
        return result

    # BUGFIX: check for a filename collision BEFORE mutating the file.
    # Previously the YAML was rewritten with the new GHCID first and the
    # collision detected only at rename time, leaving the old file
    # half-migrated (new GHCID inside, old name outside).
    new_filepath = filepath.parent / f'{new_ghcid}.yaml'
    if filepath != new_filepath and new_filepath.exists():
        result['status'] = 'collision'
        result['error'] = f'Target file already exists: {new_filepath}'
        return result

    timestamp = datetime.now(timezone.utc).isoformat()

    # Update the ghcid section with the resolved location.
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'GEONAMES_LOOKUP',
        'country_code': 'BG',
        'region_code': region_code,
        'region_name': city_info['admin1_name'],
        'city_code': city_code,
        'city_name': city_info['ascii_name'],
        'city_name_cyrillic': city_cyrillic,
        'geonames_id': city_info['geonames_id'],
        'feature_code': city_info['feature_code'],
        'resolution_date': timestamp,
    }

    # Close the still-open history entry for the old GHCID, then append
    # a new entry for the resolved one.
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []
    for entry in data['ghcid']['ghcid_history']:
        if entry.get('ghcid') == old_ghcid and not entry.get('valid_to'):
            entry['valid_to'] = timestamp
    data['ghcid']['ghcid_history'].append({
        'ghcid': new_ghcid,
        'ghcid_numeric': data['ghcid'].get('ghcid_numeric'),
        'valid_from': timestamp,
        'reason': f'City resolved via GeoNames: {city_cyrillic} → {city_info["ascii_name"]} ({region_code})',
    })

    # Keep the GHCID identifier record in sync with ghcid_current.
    if 'identifiers' in data:
        for identifier in data['identifiers']:
            if identifier.get('identifier_scheme') == 'GHCID':
                identifier['identifier_value'] = new_ghcid

    # Persist the updated YAML, then rename to the new GHCID-based name.
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if filepath != new_filepath:
        filepath.rename(new_filepath)
        result['new_file'] = str(new_filepath)

    result['status'] = 'updated'
    return result
|
||
|
||
|
||
def main():
    """CLI entry point: enrich all BG-XX-XXX custodian files via GeoNames.

    Paths were previously hard-coded to one developer's machine; they are
    now exposed as --custodian-dir / --geonames-db with the same defaults,
    so existing invocations behave unchanged.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Enrich Bulgarian custodian files with GeoNames data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Directory containing custodian YAML files')
    parser.add_argument('--geonames-db', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/reference/geonames.db'),
                        help='Path to the GeoNames SQLite database')
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    geonames_db = args.geonames_db

    if not geonames_db.exists():
        print(f'ERROR: GeoNames database not found: {geonames_db}')
        return

    # Only placeholder-GHCID files are candidates for enrichment.
    files = sorted(custodian_dir.glob('BG-XX-XXX-*.yaml'))
    if args.limit:
        files = files[:args.limit]

    print(f'Found {len(files)} Bulgarian XXX files')
    print(f'Dry run: {args.dry_run}')
    print()

    conn = sqlite3.connect(str(geonames_db))

    stats = {'updated': 0, 'error': 0, 'skipped': 0, 'would_update': 0, 'collision': 0}
    errors = []

    try:
        for filepath in files:
            result = process_file(filepath, conn, dry_run=args.dry_run)
            stats[result['status']] = stats.get(result['status'], 0) + 1

            if result['status'] == 'updated' or result['status'] == 'would_update':
                print(f"✓ {result['city_cyrillic']} → {result['city_ascii']}: {result['old_ghcid']} → {result['new_ghcid']}")
            elif result['status'] == 'error':
                print(f"✗ {filepath.name}: {result['error']}")
                errors.append(result)
            elif result['status'] == 'collision':
                print(f"⚠ {filepath.name}: {result['error']}")
    finally:
        # Close the DB even if a file blows up mid-run.
        conn.close()

    print()
    print('=' * 60)
    print('Summary:')
    print(f"  Updated: {stats.get('updated', 0)}")
    print(f"  Would update: {stats.get('would_update', 0)}")
    print(f"  Errors: {stats.get('error', 0)}")
    print(f"  Collisions: {stats.get('collision', 0)}")
    print(f"  Skipped: {stats.get('skipped', 0)}")

    if errors:
        print()
        print('Errors:')
        for err in errors:
            print(f"  - {err['file']}: {err['error']}")


if __name__ == '__main__':
    main()
|