glam/scripts/geocode_bulgarian_missing.py
2025-11-19 23:25:22 +01:00

210 lines
6.5 KiB
Python

#!/usr/bin/env python3
"""
Geocode missing Bulgarian institutions using Nominatim API.
Uses OpenStreetMap Nominatim API to find coordinates for Bulgarian cities
that were not found in the GeoNames database.
Rate limit: 1 request per second (Nominatim usage policy)
"""
import sys
import yaml
import time
import requests
from pathlib import Path
from typing import Optional, Dict, Any
from datetime import datetime, timezone
# Make the project root (two levels up from this script) importable.
_script_dir = Path(__file__).parent
project_root = _script_dir.parent
sys.path.insert(0, str(project_root))

# Configuration
# Input and output point at the same YAML file: the registry is updated in place.
INPUT_FILE = project_root.joinpath("data/instances/bulgaria_isil_libraries.yaml")
OUTPUT_FILE = INPUT_FILE
# Nominatim search endpoint and the User-Agent header sent with each request.
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = "GLAM-DataExtraction/1.0 (heritage-data-extraction)"
def geocode_nominatim(
    city_name: str,
    country: str = "Bulgaria",
    timeout: float = 10.0,
) -> Optional[Dict[str, Any]]:
    """
    Geocode a city using Nominatim API.

    Args:
        city_name: Name of the city (can be Cyrillic)
        country: Country name (default: Bulgaria)
        timeout: Seconds to wait for the HTTP request before giving up
            (default: 10). Without a timeout a stalled connection would
            block the whole batch indefinitely.

    Returns:
        Dict with latitude, longitude, display_name, osm_type and osm_id,
        or None if the city name is empty, nothing was found, or the
        request/response could not be processed.
    """
    # An empty name would turn the query into ", <country>", which can match
    # the country itself and yield misleading coordinates.
    if not city_name or not str(city_name).strip():
        return None

    params = {
        'q': f"{city_name}, {country}",
        'format': 'json',
        'limit': 1,
        'addressdetails': 1,
    }
    headers = {'User-Agent': USER_AGENT}

    # Network, HTTP-status and JSON-decoding failures.
    try:
        response = requests.get(
            NOMINATIM_URL,
            params=params,
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f" ⚠️ Error geocoding {city_name}: {e}")
        return None

    if not results:
        return None

    # Malformed payloads (unexpected shape, missing or non-numeric fields).
    try:
        result = results[0]
        return {
            'latitude': float(result['lat']),
            'longitude': float(result['lon']),
            'display_name': result['display_name'],
            'osm_type': result.get('osm_type'),
            'osm_id': result.get('osm_id'),
        }
    except (KeyError, IndexError, TypeError, ValueError, AttributeError) as e:
        print(f" ⚠️ Error geocoding {city_name}: {e}")
        return None
def _split_header(content: str) -> tuple:
    """Split file text into (leading header, YAML list body starting at '- id:')."""
    marker = '- id:'
    if content.startswith(marker):
        # No header at all: the list starts on the very first line.
        return '', content
    # +1 keeps the newline with the header so that header + dumped YAML
    # never ends up glued together on one line when the file is rewritten.
    body_start = content.index('\n' + marker) + 1
    return content[:body_start], content[body_start:]


def _has_coordinates(inst: Dict[str, Any]) -> bool:
    """Return True if the institution's first location has latitude and longitude."""
    locations = inst.get('locations')
    if not locations:
        return False
    loc = locations[0]
    return bool(loc.get('latitude')) and bool(loc.get('longitude'))


def _find_missing_geocoding(institutions: list) -> list:
    """Collect institutions that have a first location but no coordinates on it."""
    missing = []
    # enumerate gives the true position; list.index() would be O(n) per
    # lookup and would return the first *equal* record, not this one.
    for index, inst in enumerate(institutions):
        if not inst.get('locations') or _has_coordinates(inst):
            continue
        loc = inst['locations'][0]
        missing.append({
            'index': index,
            'isil': inst['identifiers'][0]['identifier_value'],
            'name': inst['name'],
            'city': loc.get('city'),
            'institution': inst,
        })
    return missing


def _refresh_header_timestamp(header: str) -> str:
    """Replace every '# Generated:' header line with the current UTC timestamp."""
    stamp = f'# Generated: {datetime.now(timezone.utc).isoformat()}'
    return '\n'.join(
        stamp if line.startswith('# Generated:') else line
        for line in header.split('\n')
    )


def main():
    """
    Main geocoding workflow.

    Reads INPUT_FILE, finds institutions whose first location has no
    coordinates, looks each city up via geocode_nominatim() (pausing one
    second between requests), prints a summary, and, if at least one
    lookup succeeded, writes the updated list to OUTPUT_FILE with the
    original header (its '# Generated:' line refreshed).

    Raises:
        ValueError: if the input file contains no '- id:' list item.
    """
    print("=" * 80)
    print("Bulgarian ISIL Registry - Geocoding Missing Institutions")
    print("=" * 80)
    print()

    # Load institutions
    print(f"Loading institutions from {INPUT_FILE}...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        content = f.read()

    # Keep the header text aside; safe_load would discard it.
    header, yaml_body = _split_header(content)
    institutions = yaml.safe_load(yaml_body)
    print(f"Loaded {len(institutions)} institutions")
    print()

    # Find institutions without geocoding
    missing_geocoding = _find_missing_geocoding(institutions)
    print(f"Institutions missing geocoding: {len(missing_geocoding)}")
    print()
    if not missing_geocoding:
        print("✓ All institutions already geocoded!")
        return

    # Geocode missing institutions
    print("Geocoding institutions using Nominatim API...")
    print("(Rate limited to 1 request per second)")
    print("=" * 80)
    print()

    geocoded_count = 0
    failed_items = []
    total_missing = len(missing_geocoding)
    for position, item in enumerate(missing_geocoding, 1):
        city = item['city']
        # A missing/None city is shown as N/A; formatting None with ':30s'
        # would raise TypeError.
        city_label = str(city) if city else 'N/A'
        print(
            f"{position:2d}/{total_missing} | {item['isil']} | {city_label:30s} ",
            end='',
            flush=True,
        )

        if not city:
            # Nothing to look up; do not send a placeholder to the geocoder.
            print("✗ Not found")
            failed_items.append(item)
            continue

        result = geocode_nominatim(city)
        if result:
            # Update institution location
            location = institutions[item['index']]['locations'][0]
            location['latitude'] = result['latitude']
            location['longitude'] = result['longitude']
            print(f"✓ ({result['latitude']:.5f}, {result['longitude']:.5f})")
            geocoded_count += 1
        else:
            print("✗ Not found")
            failed_items.append(item)

        # Rate limit: 1 request per second (Nominatim usage policy)
        if position < total_missing:
            time.sleep(1)

    failed_count = len(failed_items)
    # Count records that really carry coordinates now, so institutions with
    # no location entry at all are not reported as geocoded.
    total_geocoded = sum(1 for inst in institutions if _has_coordinates(inst))

    print()
    print("=" * 80)
    print("✓ Geocoding complete:")
    print(f" Successfully geocoded: {geocoded_count}")
    print(f" Failed to geocode: {failed_count}")
    print(
        f" Total geocoded: {total_geocoded}/{len(institutions)} "
        f"({total_geocoded / len(institutions) * 100:.1f}%)"
    )
    print()

    # Export updated data
    if geocoded_count > 0:
        print(f"Exporting updated data to {OUTPUT_FILE}...")
        updated_header = _refresh_header_timestamp(header)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.write(updated_header)
            yaml.dump(
                institutions,
                f,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
            )
        print(f"✓ Exported {len(institutions)} institutions")
        print()

    # Show failed institutions
    if failed_items:
        print("=" * 80)
        print("⚠️ Institutions that could not be geocoded:")
        print("=" * 80)
        print()
        for item in failed_items:
            city_label = str(item['city']) if item['city'] else 'N/A'
            print(f" {item['isil']} | {city_label:30s} | {item['name'][:50]}")
        print()
        print("Note: These may be:")
        print(" - Very small villages not in OpenStreetMap")
        print(" - Transliteration issues (Cyrillic → Latin)")
        print(" - Misspellings in source data")
        print()
        print("Manual geocoding may be required for these institutions.")
        print()

    print("=" * 80)
    print("✓ Geocoding workflow complete!")
    print("=" * 80)
# Run the geocoding workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()