#!/usr/bin/env python3
"""
Add region information to geocoded Bulgarian institutions using reverse geocoding.

Uses Nominatim reverse geocoding to determine the Bulgarian oblast (region)
for institutions that have coordinates but no region information.

Rate limit: 1 request per second (Nominatim usage policy)
"""
|
||
|
||
import sys
|
||
import yaml
|
||
import time
|
||
import requests
|
||
from pathlib import Path
|
||
from typing import Optional, Dict, Any
|
||
from datetime import datetime, timezone
|
||
|
||
# Add project root to path
# The root is taken to be two directory levels above this script.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Note: GHCID generation will be done by re-running the converter script

# Configuration
# The registry file is enriched in place: the output path is the input path.
INPUT_FILE = project_root.joinpath("data/instances/bulgaria_isil_libraries.yaml")
OUTPUT_FILE = INPUT_FILE
# Nominatim reverse-geocoding endpoint and the User-Agent sent with each request.
NOMINATIM_REVERSE_URL = "https://nominatim.openstreetmap.org/reverse"
USER_AGENT = "GLAM-DataExtraction/1.0 (heritage-data-extraction)"
|
||
|
||
# Bulgarian region code mapping (ISO 3166-2:BG)
#
# Keys are Bulgarian (Cyrillic) oblast names; each value carries the
# ISO 3166-2 code and its numeric part.
#
# Sofia appears under several keys:
#   * 'София' and 'София-град' -> BG-22 (the capital, Sofia City). The bare
#     'София' key is kept exactly as before for backward compatibility.
#   * 'Софийска област'        -> BG-23 (Sofia Province, the oblast around
#     the capital), which was previously missing from the table.
BULGARIAN_REGIONS = {
    'Благоевград': {'code': 'BG-01', 'numeric': 1},
    'Бургас': {'code': 'BG-02', 'numeric': 2},
    'Варна': {'code': 'BG-03', 'numeric': 3},
    'Велико Търново': {'code': 'BG-04', 'numeric': 4},
    'Видин': {'code': 'BG-05', 'numeric': 5},
    'Враца': {'code': 'BG-06', 'numeric': 6},
    'Габрово': {'code': 'BG-07', 'numeric': 7},
    'Добрич': {'code': 'BG-08', 'numeric': 8},
    'Кърджали': {'code': 'BG-09', 'numeric': 9},
    'Кюстендил': {'code': 'BG-10', 'numeric': 10},
    'Ловеч': {'code': 'BG-11', 'numeric': 11},
    'Монтана': {'code': 'BG-12', 'numeric': 12},
    'Пазарджик': {'code': 'BG-13', 'numeric': 13},
    'Перник': {'code': 'BG-14', 'numeric': 14},
    'Плевен': {'code': 'BG-15', 'numeric': 15},
    'Пловдив': {'code': 'BG-16', 'numeric': 16},
    'Разград': {'code': 'BG-17', 'numeric': 17},
    'Русе': {'code': 'BG-18', 'numeric': 18},
    'Силистра': {'code': 'BG-19', 'numeric': 19},
    'Сливен': {'code': 'BG-20', 'numeric': 20},
    'Смолян': {'code': 'BG-21', 'numeric': 21},
    'София': {'code': 'BG-22', 'numeric': 22},
    'София-град': {'code': 'BG-22', 'numeric': 22},
    'Софийска област': {'code': 'BG-23', 'numeric': 23},
    'Стара Загора': {'code': 'BG-24', 'numeric': 24},
    'Търговище': {'code': 'BG-25', 'numeric': 25},
    'Хасково': {'code': 'BG-26', 'numeric': 26},
    'Шумен': {'code': 'BG-27', 'numeric': 27},
    'Ямбол': {'code': 'BG-28', 'numeric': 28},
}
|
||
|
||
|
||
def reverse_geocode(lat: float, lon: float, timeout: float = 10.0) -> Optional[Dict[str, Any]]:
    """
    Reverse geocode coordinates to get address details.

    Issues a single GET request to NOMINATIM_REVERSE_URL. This function does
    not rate-limit itself; the caller is responsible for spacing requests.

    Args:
        lat: Latitude
        lon: Longitude
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the run indefinitely.

    Returns:
        Dict with the keys 'region', 'country', 'country_code' and
        'display_name', or None if the request failed or the response
        contained no usable 'address' object. 'region' may itself be None
        when the address has no state/county/province entry.
    """
    params = {
        'lat': lat,
        'lon': lon,
        'format': 'json',
        'addressdetails': 1,
        'zoom': 10,  # Administrative level
    }
    headers = {'User-Agent': USER_AGENT}

    # Only the network call and JSON decoding sit inside the try block, so a
    # programming error in the parsing below is not silently swallowed.
    try:
        response = requests.get(
            NOMINATIM_REVERSE_URL,
            params=params,
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f" ⚠️ Error reverse geocoding ({lat}, {lon}): {e}")
        return None

    # Anything that is not an object with an 'address' object is treated as
    # "no result" rather than raising.
    if not isinstance(result, dict):
        return None
    address = result.get('address')
    if not isinstance(address, dict):
        return None

    # Try to extract region (oblast)
    # OSM uses 'state' for Bulgarian oblasts
    region = address.get('state') or address.get('county') or address.get('province')

    return {
        'region': region,
        'country': address.get('country'),
        # `or ''` guards against an explicit null country_code in the payload.
        'country_code': str(address.get('country_code') or '').upper(),
        'display_name': result.get('display_name'),
    }
|
||
|
||
|
||
def _split_header(content):
    """Split the file text into (header, yaml_body) at the first '- id:' list item.

    The newline that precedes the first record stays with the header, so
    writing ``header + dumped_yaml`` back out keeps the record on its own line.

    Raises:
        ValueError: if no line starts with '- id:'.
    """
    marker = '- id:'
    if content.startswith(marker):
        # No header at all: the whole file is YAML.
        return '', content
    pos = content.find('\n' + marker)
    if pos == -1:
        raise ValueError("No institution records ('- id:') found in input")
    split_at = pos + 1  # keep the newline on the header side
    return content[:split_at], content[split_at:]


def _find_missing_region(institutions):
    """Collect work items for institutions with coordinates but no region.

    Only the first entry of each institution's 'locations' list is examined.
    Latitude/longitude are tested for truthiness, so a value of 0 counts as
    missing. Each item keeps a reference to the location dict so the caller
    can update it in place.
    """
    missing = []
    for inst in institutions:
        locations = inst.get('locations')
        if not locations:
            continue
        loc = locations[0]
        if loc.get('latitude') and loc.get('longitude') and not loc.get('region'):
            missing.append({
                'isil': inst['identifiers'][0]['identifier_value'],
                'name': inst['name'],
                # str()/or: the progress line formats this with a string
                # width spec, which fails on None or non-string values.
                'city': str(loc.get('city') or 'N/A'),
                'lat': loc['latitude'],
                'lon': loc['longitude'],
                'location': loc,
            })
    return missing


def main():
    """Main region enrichment workflow.

    Steps:
      1. Read INPUT_FILE and separate the leading comment header from the
         YAML list of institutions.
      2. Find institutions whose first location has coordinates but no region.
      3. Reverse geocode each one (sleeping one second between requests) and
         store the returned region name on the location.
      4. Print coverage statistics and, if at least one region was added,
         write header plus YAML to OUTPUT_FILE with a refreshed
         '# Generated:' line.
    """
    print("=" * 80)
    print("Bulgarian ISIL Registry - Add Region Information via Reverse Geocoding")
    print("=" * 80)
    print()

    # Load institutions
    print(f"Loading institutions from {INPUT_FILE}...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract YAML header comments
    header, yaml_body = _split_header(content)
    institutions = yaml.safe_load(yaml_body)

    print(f"Loaded {len(institutions)} institutions")
    print()

    # Find institutions with coordinates but no region
    missing_region = _find_missing_region(institutions)

    print(f"Institutions missing region info: {len(missing_region)}")
    print()

    if not missing_region:
        print("✓ All institutions already have region information!")
        return

    # Reverse geocode to get regions
    print("Reverse geocoding to determine regions/oblasts...")
    print("(Rate limited to 1 request per second)")
    print("=" * 80)
    print()

    enriched_count = 0
    failed_count = 0
    pending = len(missing_region)

    for position, item in enumerate(missing_region, 1):
        city = item['city']
        isil = item['isil']

        print(f"{position:2d}/{pending} | {isil} | {city:30s} ", end='', flush=True)

        # Try reverse geocoding
        result = reverse_geocode(item['lat'], item['lon'])

        if result and result.get('region'):
            region_name = result['region']

            # Update institution location with region. The item holds the
            # location dict itself, so no index lookup is needed.
            item['location']['region'] = region_name

            print(f"✓ {region_name}")
            enriched_count += 1
        else:
            print("✗ Region not found")
            failed_count += 1

        # Rate limit: 1 request per second (Nominatim usage policy)
        if position < pending:
            time.sleep(1)

    print()
    print("=" * 80)
    print("✓ Region enrichment complete:")
    print(f" Regions added: {enriched_count}")
    print(f" Failed: {failed_count}")
    print()

    # Calculate new statistics
    # total is non-zero here: an empty list returns early above.
    total = len(institutions)
    with_region = sum(
        1 for inst in institutions
        if inst.get('locations') and inst['locations'][0].get('region')
    )
    with_ghcid = sum(1 for inst in institutions if inst.get('ghcid_current'))

    print("New coverage:")
    print(f" Total institutions: {total}")
    print(f" With region info: {with_region} ({with_region/total*100:.1f}%)")
    print(f" With GHCIDs: {with_ghcid} ({with_ghcid/total*100:.1f}%)")
    print()

    # Export updated data
    if enriched_count > 0:
        print(f"Exporting updated data to {OUTPUT_FILE}...")

        # Update header timestamp
        header_lines = header.split('\n')
        for line_no, line in enumerate(header_lines):
            if line.startswith('# Generated:'):
                header_lines[line_no] = f'# Generated: {datetime.now(timezone.utc).isoformat()}'

        updated_header = '\n'.join(header_lines)

        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            # The header ends with its own newline, so the first record
            # starts on a fresh line.
            f.write(updated_header)
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(f"✓ Exported {len(institutions)} institutions")
        print()

    print("=" * 80)
    print("✓ Region enrichment workflow complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()
|