#!/usr/bin/env python3
"""
Add region information to geocoded Bulgarian institutions using reverse geocoding.

Uses Nominatim reverse geocoding to determine the Bulgarian oblast (region) for
institutions that have coordinates but no region information.

Rate limit: 1 request per second (Nominatim usage policy)
"""

import sys
import yaml
import time
import requests
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
from datetime import datetime, timezone

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Note: GHCID generation will be done by re-running the converter script

# Configuration
INPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries.yaml"
OUTPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries.yaml"
NOMINATIM_REVERSE_URL = "https://nominatim.openstreetmap.org/reverse"
USER_AGENT = "GLAM-DataExtraction/1.0 (heritage-data-extraction)"

# Upper bound (seconds) on a single HTTP request so that a stalled connection
# cannot block the whole enrichment run forever.
REQUEST_TIMEOUT_SECONDS = 10

# Marker that starts every institution record in the YAML list.
_FIRST_RECORD_MARKER = '- id:'

# Bulgarian region code mapping (ISO 3166-2:BG)
# NOTE(review): there is no BG-23 entry, and this table is not referenced
# anywhere in this script as reviewed - confirm whether it is still needed and
# whether 'София' should be split into two entries (BG-22 / BG-23).
BULGARIAN_REGIONS = {
    'Благоевград': {'code': 'BG-01', 'numeric': 1},
    'Бургас': {'code': 'BG-02', 'numeric': 2},
    'Варна': {'code': 'BG-03', 'numeric': 3},
    'Велико Търново': {'code': 'BG-04', 'numeric': 4},
    'Видин': {'code': 'BG-05', 'numeric': 5},
    'Враца': {'code': 'BG-06', 'numeric': 6},
    'Габрово': {'code': 'BG-07', 'numeric': 7},
    'Добрич': {'code': 'BG-08', 'numeric': 8},
    'Кърджали': {'code': 'BG-09', 'numeric': 9},
    'Кюстендил': {'code': 'BG-10', 'numeric': 10},
    'Ловеч': {'code': 'BG-11', 'numeric': 11},
    'Монтана': {'code': 'BG-12', 'numeric': 12},
    'Пазарджик': {'code': 'BG-13', 'numeric': 13},
    'Перник': {'code': 'BG-14', 'numeric': 14},
    'Плевен': {'code': 'BG-15', 'numeric': 15},
    'Пловдив': {'code': 'BG-16', 'numeric': 16},
    'Разград': {'code': 'BG-17', 'numeric': 17},
    'Русе': {'code': 'BG-18', 'numeric': 18},
    'Силистра': {'code': 'BG-19', 'numeric': 19},
    'Сливен': {'code': 'BG-20', 'numeric': 20},
    'Смолян': {'code': 'BG-21', 'numeric': 21},
    'София': {'code': 'BG-22', 'numeric': 22},
    'Стара Загора': {'code': 'BG-24', 'numeric': 24},
    'Търговище': {'code': 'BG-25', 'numeric': 25},
    'Хасково': {'code': 'BG-26', 'numeric': 26},
    'Шумен': {'code': 'BG-27', 'numeric': 27},
    'Ямбол': {'code': 'BG-28', 'numeric': 28}
}


def reverse_geocode(lat: float, lon: float) -> Optional[Dict[str, Any]]:
    """
    Reverse geocode coordinates to get address details.

    Args:
        lat: Latitude
        lon: Longitude

    Returns:
        Dict with keys 'region', 'country', 'country_code' and 'display_name',
        or None if the request failed or the response carried no address.
        'region' itself may be None when the address has no state, county or
        province field.
    """
    params = {
        'lat': lat,
        'lon': lon,
        'format': 'json',
        'addressdetails': 1,
        'zoom': 10  # Administrative level
    }
    headers = {
        'User-Agent': USER_AGENT
    }

    # Only the network call and JSON decoding are guarded: those are the steps
    # that fail for reasons outside this script's control.  A failure is
    # reported and treated as "no result" so the batch keeps going.
    try:
        response = requests.get(
            NOMINATIM_REVERSE_URL,
            params=params,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f" ⚠️ Error reverse geocoding ({lat}, {lon}): {e}")
        return None

    # Error payloads and unexpected shapes carry no usable 'address' mapping.
    if not isinstance(result, dict):
        return None
    address = result.get('address')
    if not isinstance(address, dict):
        return None

    # Try to extract region (oblast)
    # OSM uses 'state' for Bulgarian oblasts
    region = address.get('state') or address.get('county') or address.get('province')

    return {
        'region': region,
        'country': address.get('country'),
        'country_code': (address.get('country_code') or '').upper(),
        'display_name': result.get('display_name')
    }


def _split_header(content: str) -> Tuple[str, str]:
    """Split file text into (comment header, YAML list text).

    The header keeps its trailing newline so that writing ``header + yaml``
    back out reproduces a valid file; dropping that newline would glue the
    first record onto the last comment line.

    Raises:
        ValueError: if no line starting with '- id:' exists.
    """
    if content.startswith(_FIRST_RECORD_MARKER):
        return '', content

    newline_pos = content.find('\n' + _FIRST_RECORD_MARKER)
    if newline_pos == -1:
        raise ValueError(f"No '{_FIRST_RECORD_MARKER}' record found in {INPUT_FILE}")

    split_at = newline_pos + 1  # keep the newline with the header
    return content[:split_at], content[split_at:]


def _find_missing_region(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Collect work items for institutions that have coordinates but no region.

    Only the first location of each institution is considered.  Each work item
    holds a reference to that location dict so the region can be written back
    without searching the list again.
    """
    pending = []
    for inst in institutions:
        locations = inst.get('locations')
        if not locations:
            continue

        loc = locations[0]
        if not (loc.get('latitude') and loc.get('longitude')) or loc.get('region'):
            continue

        identifiers = inst.get('identifiers') or [{}]
        pending.append({
            'isil': identifiers[0].get('identifier_value') or 'N/A',
            'city': loc.get('city') or 'N/A',
            'lat': loc['latitude'],
            'lon': loc['longitude'],
            'location': loc,
        })
    return pending


def _export(header: str, institutions: List[Dict[str, Any]]) -> None:
    """Write the header (with refreshed '# Generated:' line) and data to OUTPUT_FILE."""
    timestamp = datetime.now(timezone.utc).isoformat()
    header_lines = header.split('\n')
    for line_no, line in enumerate(header_lines):
        if line.startswith('# Generated:'):
            header_lines[line_no] = f'# Generated: {timestamp}'
    updated_header = '\n'.join(header_lines)

    # Serialize before opening the file: OUTPUT_FILE is also the input, so a
    # failure while dumping must not leave a truncated data file behind.
    yaml_text = yaml.dump(
        institutions,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False,
    )

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(updated_header)
        f.write(yaml_text)


def main():
    """Main region enrichment workflow.

    Loads the institution list, reverse geocodes every institution whose first
    location has coordinates but no region, stores the region found, prints
    coverage statistics and rewrites the data file when anything changed.
    """
    print("=" * 80)
    print("Bulgarian ISIL Registry - Add Region Information via Reverse Geocoding")
    print("=" * 80)
    print()

    # Load institutions
    print(f"Loading institutions from {INPUT_FILE}...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        content = f.read()

    # Separate the YAML header comments from the data
    header, yaml_text = _split_header(content)
    institutions = yaml.safe_load(yaml_text)

    print(f"Loaded {len(institutions)} institutions")
    print()

    # Find institutions with coordinates but no region
    missing_region = _find_missing_region(institutions)

    print(f"Institutions missing region info: {len(missing_region)}")
    print()

    if not missing_region:
        print("✓ All institutions already have region information!")
        return

    # Reverse geocode to get regions
    print("Reverse geocoding to determine regions/oblasts...")
    print("(Rate limited to 1 request per second)")
    print("=" * 80)
    print()

    enriched_count = 0
    failed_count = 0
    pending_total = len(missing_region)

    for position, item in enumerate(missing_region, 1):
        city = item['city']
        isil = item['isil']

        print(f"{position:2d}/{pending_total} | {isil} | {city:30s} ", end='', flush=True)

        # Try reverse geocoding
        result = reverse_geocode(item['lat'], item['lon'])

        if result and result.get('region'):
            region_name = result['region']
            # Update institution location with region
            item['location']['region'] = region_name
            print(f"✓ {region_name}")
            enriched_count += 1
        else:
            print("✗ Region not found")
            failed_count += 1

        # Rate limit: 1 request per second (Nominatim usage policy)
        if position < pending_total:
            time.sleep(1)

    print()
    print("=" * 80)
    print("✓ Region enrichment complete:")
    print(f" Regions added: {enriched_count}")
    print(f" Failed: {failed_count}")
    print()

    # Calculate new statistics
    total = len(institutions)
    with_region = sum(
        1 for inst in institutions
        if inst.get('locations') and inst['locations'][0].get('region')
    )
    with_ghcid = sum(1 for inst in institutions if inst.get('ghcid_current'))

    print("New coverage:")
    print(f" Total institutions: {total}")
    print(f" With region info: {with_region} ({with_region/total*100:.1f}%)")
    print(f" With GHCIDs: {with_ghcid} ({with_ghcid/total*100:.1f}%)")
    print()

    # Export updated data
    if enriched_count > 0:
        print(f"Exporting updated data to {OUTPUT_FILE}...")
        _export(header, institutions)
        print(f"✓ Exported {len(institutions)} institutions")
        print()

    print("=" * 80)
    print("✓ Region enrichment workflow complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()