#!/usr/bin/env python3
"""
Geocode missing Bulgarian institutions using Nominatim API.

Uses OpenStreetMap Nominatim API to find coordinates for Bulgarian cities
that were not found in the GeoNames database.

Rate limit: 1 request per second (Nominatim usage policy)
"""

import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests
import yaml

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Configuration
INPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries.yaml"
OUTPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries.yaml"
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = "GLAM-DataExtraction/1.0 (heritage-data-extraction)"
# Seconds to wait for Nominatim before giving up on a single request.
REQUEST_TIMEOUT = 30
# Minimum pause between two consecutive requests.
RATE_LIMIT_SECONDS = 1


def geocode_nominatim(
    city_name: str,
    country: str = "Bulgaria",
    timeout: float = REQUEST_TIMEOUT,
) -> Optional[Dict[str, Any]]:
    """
    Geocode a city using Nominatim API.

    Args:
        city_name: Name of the city (can be Cyrillic)
        country: Country name (default: Bulgaria)
        timeout: Seconds to wait for the HTTP response before giving up

    Returns:
        Dict with latitude, longitude, display_name, osm_type and osm_id,
        or None if nothing was found or the request failed
    """
    params = {
        'q': f"{city_name}, {country}",
        'format': 'json',
        'limit': 1,
        'addressdetails': 1,
    }
    headers = {'User-Agent': USER_AGENT}

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(
            NOMINATIM_URL, params=params, headers=headers, timeout=timeout
        )
        response.raise_for_status()
        results = response.json()

        if not results:
            return None

        result = results[0]
        return {
            'latitude': float(result['lat']),
            'longitude': float(result['lon']),
            'display_name': result['display_name'],
            'osm_type': result.get('osm_type'),
            'osm_id': result.get('osm_id'),
        }
    except (
        requests.RequestException,
        ValueError,
        KeyError,
        IndexError,
        TypeError,
    ) as e:
        # Best effort: network errors and malformed payloads are reported
        # and treated as "not found" so the remaining cities are still tried.
        print(f" ⚠️ Error geocoding {city_name}: {e}")
        return None


def _split_header(content: str) -> Tuple[str, str]:
    """Split file content into the leading comment header and the YAML list.

    The returned header keeps its trailing newline, so it can be written
    back directly in front of the dumped YAML without gluing the first
    record onto the last header line.

    Raises:
        ValueError: if no line starting with '- id:' exists.
    """
    if content.startswith('- id:'):
        # No header at all: the first record starts on the first line.
        return '', content

    marker = content.find('\n- id:')
    if marker == -1:
        raise ValueError("No '- id:' entry found in input file")

    split_at = marker + 1  # keep the newline with the header
    return content[:split_at], content[split_at:]


def _has_coordinates(location: Dict[str, Any]) -> bool:
    """Return True when the location has both a latitude and a longitude."""
    # Truthiness on purpose: None, missing and 0 all count as "not geocoded".
    return bool(location.get('latitude')) and bool(location.get('longitude'))


def _find_missing(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Collect institutions whose first location has no coordinates yet."""
    missing = []
    for inst in institutions:
        locations = inst.get('locations')
        if not locations:
            continue
        loc = locations[0]
        if _has_coordinates(loc):
            continue
        missing.append({
            'isil': inst['identifiers'][0]['identifier_value'],
            'name': inst['name'],
            # Empty string when the city is absent or null.
            'city': loc.get('city') or '',
            # Same dict object as in `institutions`; updated in place.
            'location': loc,
        })
    return missing


def _write_output(header: str, institutions: List[Dict[str, Any]]) -> None:
    """Write header plus institutions to OUTPUT_FILE via a temporary file."""
    stamp = f'# Generated: {datetime.now(timezone.utc).isoformat()}'
    header_lines = [
        stamp if line.startswith('# Generated:') else line
        for line in header.split('\n')
    ]
    updated_header = '\n'.join(header_lines)

    # INPUT_FILE and OUTPUT_FILE are the same path, so write to a sibling
    # temp file first: a crash mid-write must not destroy the source data.
    tmp_file = OUTPUT_FILE.with_name(OUTPUT_FILE.name + '.tmp')
    with open(tmp_file, 'w', encoding='utf-8') as f:
        f.write(updated_header)
        yaml.dump(
            institutions,
            f,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
    tmp_file.replace(OUTPUT_FILE)


def main():
    """Main geocoding workflow.

    Loads the institutions file, geocodes the first location of every
    institution that lacks coordinates, writes the file back when at least
    one lookup succeeded, and prints a report of the failures.
    """
    print("=" * 80)
    print("Bulgarian ISIL Registry - Geocoding Missing Institutions")
    print("=" * 80)
    print()

    # Load institutions
    print(f"Loading institutions from {INPUT_FILE}...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        content = f.read()

    # Separate the YAML header comments from the data
    header, body = _split_header(content)
    institutions = yaml.safe_load(body)

    print(f"Loaded {len(institutions)} institutions")
    print()

    # Find institutions without geocoding
    missing_geocoding = _find_missing(institutions)

    print(f"Institutions missing geocoding: {len(missing_geocoding)}")
    print()

    if not missing_geocoding:
        print("✓ All institutions already geocoded!")
        return

    # Geocode missing institutions
    print("Geocoding institutions using Nominatim API...")
    print("(Rate limited to 1 request per second)")
    print("=" * 80)
    print()

    geocoded_count = 0
    failed = []
    requests_made = 0
    total_missing = len(missing_geocoding)

    for i, item in enumerate(missing_geocoding, 1):
        city = item['city']
        isil = item['isil']
        label = city or 'N/A'

        print(f"{i:2d}/{total_missing} | {isil} | {label:30s} ", end='', flush=True)

        if not city:
            # Querying "N/A, Bulgaria" could match something unrelated.
            print("✗ Not found (no city in source data)")
            failed.append(item)
            continue

        # Rate limit: 1 request per second (Nominatim usage policy)
        if requests_made:
            time.sleep(RATE_LIMIT_SECONDS)
        result = geocode_nominatim(city)
        requests_made += 1

        if result:
            # Update institution location
            item['location']['latitude'] = result['latitude']
            item['location']['longitude'] = result['longitude']
            print(f"✓ ({result['latitude']:.5f}, {result['longitude']:.5f})")
            geocoded_count += 1
        else:
            print("✗ Not found")
            failed.append(item)

    failed_count = len(failed)
    total = len(institutions)
    # Count only institutions that really have coordinates; records without
    # any location are not geocoded either.
    total_geocoded = sum(
        1
        for inst in institutions
        if inst.get('locations') and _has_coordinates(inst['locations'][0])
    )

    print()
    print("=" * 80)
    print("✓ Geocoding complete:")
    print(f" Successfully geocoded: {geocoded_count}")
    print(f" Failed to geocode: {failed_count}")
    print(f" Total geocoded: {total_geocoded}/{total} ({total_geocoded / total * 100:.1f}%)")
    print()

    # Export updated data
    if geocoded_count > 0:
        print(f"Exporting updated data to {OUTPUT_FILE}...")
        _write_output(header, institutions)
        print(f"✓ Exported {len(institutions)} institutions")
        print()

    # Show failed institutions
    if failed:
        print("=" * 80)
        print("⚠️ Institutions that could not be geocoded:")
        print("=" * 80)
        print()

        for item in failed:
            label = item['city'] or 'N/A'
            print(f" {item['isil']} | {label:30s} | {item['name'][:50]}")

        print()
        print("Note: These may be:")
        print(" - Very small villages not in OpenStreetMap")
        print(" - Transliteration issues (Cyrillic → Latin)")
        print(" - Misspellings in source data")
        print()
        print("Manual geocoding may be required for these institutions.")
        print()

    print("=" * 80)
    print("✓ Geocoding workflow complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()