glam/scripts/enrich_bulgarian_regions.py
2025-11-19 23:25:22 +01:00

239 lines
8.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Add region information to geocoded Bulgarian institutions using reverse geocoding.
Uses Nominatim reverse geocoding to determine the Bulgarian oblast (region)
for institutions that have coordinates but no region information.
Rate limit: 1 request per second (Nominatim usage policy)
"""
import sys
import yaml
import time
import requests
from pathlib import Path
from typing import Optional, Dict, Any
from datetime import datetime, timezone
# Make the project root (the directory above this script's folder) importable
_script_dir = Path(__file__).parent
project_root = _script_dir.parent
sys.path.insert(0, str(project_root))

# Note: GHCID generation will be done by re-running the converter script

# Configuration
INPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries.yaml"
# Enriched records are written back over the file they were read from.
OUTPUT_FILE = INPUT_FILE
NOMINATIM_REVERSE_URL = "https://nominatim.openstreetmap.org/reverse"
# Sent as the User-Agent header on every Nominatim request.
USER_AGENT = "GLAM-DataExtraction/1.0 (heritage-data-extraction)"
# Bulgarian region code mapping (ISO 3166-2:BG)
# Keys are Bulgarian oblast names; 'code' is the ISO 3166-2 subdivision code
# and 'numeric' is the integer part of that code.
BULGARIAN_REGIONS = {
    'Благоевград': {'code': 'BG-01', 'numeric': 1},
    'Бургас': {'code': 'BG-02', 'numeric': 2},
    'Варна': {'code': 'BG-03', 'numeric': 3},
    'Велико Търново': {'code': 'BG-04', 'numeric': 4},
    'Видин': {'code': 'BG-05', 'numeric': 5},
    'Враца': {'code': 'BG-06', 'numeric': 6},
    'Габрово': {'code': 'BG-07', 'numeric': 7},
    'Добрич': {'code': 'BG-08', 'numeric': 8},
    'Кърджали': {'code': 'BG-09', 'numeric': 9},
    'Кюстендил': {'code': 'BG-10', 'numeric': 10},
    'Ловеч': {'code': 'BG-11', 'numeric': 11},
    'Монтана': {'code': 'BG-12', 'numeric': 12},
    'Пазарджик': {'code': 'BG-13', 'numeric': 13},
    'Перник': {'code': 'BG-14', 'numeric': 14},
    'Плевен': {'code': 'BG-15', 'numeric': 15},
    'Пловдив': {'code': 'BG-16', 'numeric': 16},
    'Разград': {'code': 'BG-17', 'numeric': 17},
    'Русе': {'code': 'BG-18', 'numeric': 18},
    'Силистра': {'code': 'BG-19', 'numeric': 19},
    'Сливен': {'code': 'BG-20', 'numeric': 20},
    'Смолян': {'code': 'BG-21', 'numeric': 21},
    # Sofia is split into two oblasts: the capital (BG-22) and the
    # surrounding province (BG-23). The bare name keeps its existing
    # meaning (the capital); the explicit names disambiguate the two.
    'София': {'code': 'BG-22', 'numeric': 22},
    'София-град': {'code': 'BG-22', 'numeric': 22},
    'Софийска област': {'code': 'BG-23', 'numeric': 23},
    'Стара Загора': {'code': 'BG-24', 'numeric': 24},
    'Търговище': {'code': 'BG-25', 'numeric': 25},
    'Хасково': {'code': 'BG-26', 'numeric': 26},
    'Шумен': {'code': 'BG-27', 'numeric': 27},
    'Ямбол': {'code': 'BG-28', 'numeric': 28},
}
def reverse_geocode(lat: float, lon: float, timeout: float = 10.0) -> Optional[Dict[str, Any]]:
    """
    Reverse geocode coordinates to get address details.

    Args:
        lat: Latitude
        lon: Longitude
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Dict with the keys 'region', 'country', 'country_code' and
        'display_name', or None if the request failed or the response
        contained no address. 'region' is None when the address has none of
        the 'state', 'county' or 'province' fields.
    """
    params = {
        'lat': lat,
        'lon': lon,
        'format': 'json',
        'addressdetails': 1,
        'zoom': 10,  # Administrative level
    }
    headers = {'User-Agent': USER_AGENT}

    # Only the network call and JSON decoding are expected to fail; keeping
    # the try body small avoids masking programming errors further down.
    try:
        response = requests.get(
            NOMINATIM_REVERSE_URL,
            params=params,
            headers=headers,
            timeout=timeout,  # without a timeout a stalled connection blocks forever
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f" ⚠️ Error reverse geocoding ({lat}, {lon}): {e}")
        return None

    address = result.get('address') if isinstance(result, dict) else None
    if not isinstance(address, dict):
        return None

    # Try to extract region (oblast)
    # OSM uses 'state' for Bulgarian oblasts
    region = address.get('state') or address.get('county') or address.get('province')
    return {
        'region': region,
        'country': address.get('country'),
        # `or ''` also covers an explicit null country_code in the response
        'country_code': (address.get('country_code') or '').upper(),
        'display_name': result.get('display_name'),
    }
def _split_header(content):
    """Split the file text into (leading header text, YAML list body).

    The body starts at the first line that begins with '- id:'. The header
    keeps its trailing newline so that it can be written back verbatim in
    front of the re-serialised YAML without merging into the first record.

    Raises:
        ValueError: if no line of ``content`` starts with '- id:'.
    """
    marker = '- id:'
    if content.startswith(marker):
        # No header at all: the list starts on the very first line.
        return '', content
    body_start = content.index('\n' + marker) + 1
    return content[:body_start], content[body_start:]


def _find_missing_region(institutions):
    """Return work items for institutions whose first location has coordinates but no region."""
    missing = []
    for inst in institutions:
        locations = inst.get('locations')
        if not locations:
            continue
        loc = locations[0]
        if loc.get('latitude') and loc.get('longitude') and not loc.get('region'):
            missing.append({
                'isil': inst['identifiers'][0]['identifier_value'],
                # `or` also covers an explicit null city, which would break
                # the fixed-width string formatting of the progress line.
                'city': loc.get('city') or 'N/A',
                'lat': loc['latitude'],
                'lon': loc['longitude'],
                # Reference to the dict inside `institutions`, so updating it
                # updates the loaded data in place (no index lookup needed).
                'location': loc,
            })
    return missing


def _write_output(header, institutions):
    """Write the header (with a refreshed '# Generated:' line) and the institutions to OUTPUT_FILE."""
    header_lines = header.split('\n')
    for line_no, line in enumerate(header_lines):
        if line.startswith('# Generated:'):
            header_lines[line_no] = f'# Generated: {datetime.now(timezone.utc).isoformat()}'
    updated_header = '\n'.join(header_lines)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(updated_header)
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)


def main():
    """Main region enrichment workflow.

    Loads the institutions from INPUT_FILE, reverse geocodes every institution
    whose first location has coordinates but no region, stores the returned
    region name on that location, prints coverage statistics, and writes the
    data to OUTPUT_FILE if at least one region was added.
    """
    print("=" * 80)
    print("Bulgarian ISIL Registry - Add Region Information via Reverse Geocoding")
    print("=" * 80)
    print()

    # Load institutions
    print(f"Loading institutions from {INPUT_FILE}...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        content = f.read()

    # Keep the text above the YAML list so it can be written back unchanged
    header, body = _split_header(content)
    institutions = yaml.safe_load(body)
    print(f"Loaded {len(institutions)} institutions")
    print()

    # Find institutions with coordinates but no region
    missing_region = _find_missing_region(institutions)
    print(f"Institutions missing region info: {len(missing_region)}")
    print()

    if not missing_region:
        print("✓ All institutions already have region information!")
        return

    # Reverse geocode to get regions
    print("Reverse geocoding to determine regions/oblasts...")
    print("(Rate limited to 1 request per second)")
    print("=" * 80)
    print()

    enriched_count = 0
    failed_count = 0
    for i, item in enumerate(missing_region, 1):
        city = item['city']
        isil = item['isil']
        print(f"{i:2d}/{len(missing_region)} | {isil} | {city:30s} ", end='', flush=True)

        result = reverse_geocode(item['lat'], item['lon'])
        if result and result.get('region'):
            region_name = result['region']
            item['location']['region'] = region_name
            print(f"{region_name}")
            enriched_count += 1
        else:
            print("✗ Region not found")
            failed_count += 1

        # Rate limit: 1 request per second (Nominatim usage policy);
        # no need to wait after the final request.
        if i < len(missing_region):
            time.sleep(1)

    print()
    print("=" * 80)
    print("✓ Region enrichment complete:")
    print(f" Regions added: {enriched_count}")
    print(f" Failed: {failed_count}")
    print()

    # Calculate new statistics. `total` is non-zero here: the early return
    # above has already handled the case with nothing to process.
    total = len(institutions)
    with_region = sum(
        1 for inst in institutions
        if inst.get('locations') and inst['locations'][0].get('region')
    )
    with_ghcid = sum(1 for inst in institutions if inst.get('ghcid_current'))
    print("New coverage:")
    print(f" Total institutions: {total}")
    print(f" With region info: {with_region} ({with_region/total*100:.1f}%)")
    print(f" With GHCIDs: {with_ghcid} ({with_ghcid/total*100:.1f}%)")
    print()

    # Export updated data
    if enriched_count > 0:
        print(f"Exporting updated data to {OUTPUT_FILE}...")
        _write_output(header, institutions)
        print(f"✓ Exported {len(institutions)} institutions")
        print()

    print("=" * 80)
    print("✓ Region enrichment workflow complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()