210 lines
6.5 KiB
Python
210 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Geocode missing Bulgarian institutions using Nominatim API.
|
|
|
|
Uses OpenStreetMap Nominatim API to find coordinates for Bulgarian cities
|
|
that were not found in the GeoNames database.
|
|
|
|
Rate limit: 1 request per second (Nominatim usage policy)
|
|
"""
|
|
|
|
import sys
|
|
import yaml
|
|
import time
|
|
import requests
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any
|
|
from datetime import datetime, timezone
|
|
|
|
# Put the directory two levels above this file at the front of the import
# path so project modules resolve when the script is run directly.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# --- Configuration ---------------------------------------------------------
# Input and output name the same file: the registry is updated in place.
INPUT_FILE = project_root.joinpath("data/instances/bulgaria_isil_libraries.yaml")
OUTPUT_FILE = project_root.joinpath("data/instances/bulgaria_isil_libraries.yaml")

# Nominatim search endpoint and the User-Agent sent with every request.
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = "GLAM-DataExtraction/1.0 (heritage-data-extraction)"
|
|
|
|
|
|
def geocode_nominatim(
    city_name: str,
    country: str = "Bulgaria",
    *,
    timeout: float = 10.0,
) -> Optional[Dict[str, Any]]:
    """
    Geocode a city using the Nominatim search API.

    Args:
        city_name: Name of the city (can be Cyrillic)
        country: Country name appended to the query (default: Bulgaria)
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        Dict with latitude, longitude, display_name, osm_type and osm_id of
        the first match, or None if the city name is blank, nothing was
        found, or the request/response could not be processed (the error is
        printed, never raised).
    """
    # A blank name would turn the query into just ", <country>", which could
    # match the country itself and yield misleading coordinates.
    if not city_name or not city_name.strip():
        return None

    params = {
        'q': f"{city_name}, {country}",
        'format': 'json',
        'limit': 1,
        'addressdetails': 1,
    }
    headers = {'User-Agent': USER_AGENT}

    try:
        response = requests.get(
            NOMINATIM_URL,
            params=params,
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()

        results = response.json()
        if not results:
            return None

        result = results[0]
        return {
            'latitude': float(result['lat']),
            'longitude': float(result['lon']),
            'display_name': result['display_name'],
            'osm_type': result.get('osm_type'),
            'osm_id': result.get('osm_id'),
        }
    except (
        requests.RequestException,  # network failure, timeout, HTTP error status
        ValueError,                 # body is not JSON / lat or lon not numeric
        KeyError,                   # expected field missing (or payload is a dict)
        IndexError,
        TypeError,                  # payload has an unexpected shape
    ) as e:
        # Best effort: report the problem and let the caller count a failure.
        print(f" ⚠️ Error geocoding {city_name}: {e}")
        return None
|
|
|
|
|
|
def _is_blank(value: Any) -> bool:
    """Return True when a coordinate value is absent (None or empty string)."""
    return value is None or value == ''


def _has_coordinates(loc: Dict[str, Any]) -> bool:
    """Return True when a location has both latitude and longitude (0.0 counts)."""
    return not _is_blank(loc.get('latitude')) and not _is_blank(loc.get('longitude'))


def _split_header(content: str) -> tuple:
    """Split raw file text into (leading header text, YAML list text)."""
    marker = '- id:'
    if content.startswith(marker):
        # No header at all: the list starts on the very first line.
        return '', content

    pos = content.find('\n' + marker)
    if pos == -1:
        raise ValueError(f"No '{marker}' list entry found in {INPUT_FILE}")

    # Split AFTER the newline so the header keeps its line terminator.
    # Otherwise the rewritten file would glue the last header line and the
    # first "- id:" record onto one line and corrupt the YAML.
    return content[:pos + 1], content[pos + 1:]


def _find_missing(institutions: list) -> list:
    """Collect the institutions whose first location lacks coordinates."""
    missing = []
    # enumerate() supplies the index directly; list.index() would be O(n)
    # per lookup and returns the first *equal* record, not necessarily this one.
    for index, inst in enumerate(institutions):
        locations = inst.get('locations')
        if not locations:
            continue

        loc = locations[0]
        if _has_coordinates(loc):
            continue

        identifiers = inst.get('identifiers') or [{}]
        city = loc.get('city')
        missing.append({
            'index': index,
            'isil': identifiers[0].get('identifier_value') or 'N/A',
            'name': str(inst.get('name') or ''),
            # Empty string means "no city recorded"; shown as N/A, never queried.
            'city': str(city) if city else '',
        })
    return missing


def _write_institutions(header: str, institutions: list) -> None:
    """Write the header (with a refreshed timestamp) and institutions to OUTPUT_FILE."""
    stamp = f'# Generated: {datetime.now(timezone.utc).isoformat()}'
    header_lines = [
        stamp if line.startswith('# Generated:') else line
        for line in header.split('\n')
    ]

    # OUTPUT_FILE is also the input file, so write to a sibling temp file and
    # swap it in afterwards: a failure mid-dump then cannot truncate the data.
    tmp_path = OUTPUT_FILE.with_name(OUTPUT_FILE.name + '.tmp')
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(header_lines))
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    tmp_path.replace(OUTPUT_FILE)


def main():
    """
    Main geocoding workflow.

    Loads the institutions from INPUT_FILE, looks up coordinates via
    geocode_nominatim() for every institution whose first location has none,
    writes the updated list back to OUTPUT_FILE when at least one lookup
    succeeded, and prints a progress log and summary to stdout.

    Raises:
        ValueError: if the input file contains no '- id:' list entry.
    """
    print("=" * 80)
    print("Bulgarian ISIL Registry - Geocoding Missing Institutions")
    print("=" * 80)
    print()

    # Load institutions
    print(f"Loading institutions from {INPUT_FILE}...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        content = f.read()

    # Keep the text before the list separately so it can be written back verbatim
    header, body = _split_header(content)

    # safe_load() returns None for an empty document
    institutions = yaml.safe_load(body) or []

    print(f"Loaded {len(institutions)} institutions")
    print()

    # Find institutions without geocoding
    missing_geocoding = _find_missing(institutions)

    print(f"Institutions missing geocoding: {len(missing_geocoding)}")
    print()

    if not missing_geocoding:
        print("✓ All institutions already geocoded!")
        return

    # Geocode missing institutions
    print("Geocoding institutions using Nominatim API...")
    print("(Rate limited to 1 request per second)")
    print("=" * 80)
    print()

    geocoded_count = 0
    failed = []
    total_missing = len(missing_geocoding)
    # Successful lookups keyed by city, so each city is requested only once.
    # Failures are not cached, so a transient error gets another chance.
    resolved = {}
    requests_made = 0

    for position, item in enumerate(missing_geocoding, 1):
        city = item['city']
        isil = item['isil']
        city_label = city or 'N/A'

        print(f"{position:2d}/{total_missing} | {isil} | {city_label:30s} ", end='', flush=True)

        if not city:
            # No city recorded: there is nothing meaningful to look up.
            result = None
        elif city in resolved:
            result = resolved[city]
        else:
            # Rate limit: 1 request per second (Nominatim usage policy)
            if requests_made:
                time.sleep(1)
            result = geocode_nominatim(city)
            requests_made += 1
            if result:
                resolved[city] = result

        if result:
            # Update institution location
            loc = institutions[item['index']]['locations'][0]
            loc['latitude'] = result['latitude']
            loc['longitude'] = result['longitude']

            print(f"✓ ({result['latitude']:.5f}, {result['longitude']:.5f})")
            geocoded_count += 1
        else:
            print("✗ Not found")
            failed.append(item)

    failed_count = len(failed)
    total = len(institutions)  # > 0 here, because missing_geocoding is non-empty
    # Count institutions that really have coordinates now; "total - failed"
    # would also count institutions that have no location entry at all.
    with_coordinates = sum(
        1
        for inst in institutions
        if inst.get('locations') and _has_coordinates(inst['locations'][0])
    )

    print()
    print("=" * 80)
    print("✓ Geocoding complete:")
    print(f" Successfully geocoded: {geocoded_count}")
    print(f" Failed to geocode: {failed_count}")
    print(f" Total geocoded: {with_coordinates}/{total} ({with_coordinates / total * 100:.1f}%)")
    print()

    # Export updated data
    if geocoded_count > 0:
        print(f"Exporting updated data to {OUTPUT_FILE}...")
        _write_institutions(header, institutions)
        print(f"✓ Exported {len(institutions)} institutions")
        print()

    # Show failed institutions
    if failed:
        print("=" * 80)
        print("⚠️ Institutions that could not be geocoded:")
        print("=" * 80)
        print()

        for item in failed:
            city_label = item['city'] or 'N/A'
            print(f" {item['isil']} | {city_label:30s} | {item['name'][:50]}")

        print()
        print("Note: These may be:")
        print(" - Very small villages not in OpenStreetMap")
        print(" - Transliteration issues (Cyrillic → Latin)")
        print(" - Misspellings in source data")
        print()
        print("Manual geocoding may be required for these institutions.")
        print()

    print("=" * 80)
    print("✓ Geocoding workflow complete!")
    print("=" * 80)
|
|
|
|
|
|
# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|