glam/scripts/scrapers/geocode_json_harvest.py
2025-11-19 23:25:22 +01:00

124 lines
3.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Quick geocoder for JSON harvest files.
Uses Nominatim API with rate limiting.
"""
import json
import time
import requests
from pathlib import Path
from datetime import datetime, timezone
import sys
def geocode_location(city: str, region: str, country: str) -> tuple:
    """Geocode a place using the Nominatim search API.

    Builds a free-form query from the non-empty parts of *city*, *region*
    and *country*, waits one second (rate limiting), then sends a single
    GET request and reads the first result.

    Args:
        city: City name.
        region: Region/state name; may be empty or None, in which case it
            is left out of the query.
        country: Country name or code; may be empty or None, in which case
            it is left out of the query.

    Returns:
        A ``(latitude, longitude, display_name)`` tuple with the coordinates
        as floats, or ``(None, None, None)`` when the query is empty, the
        service returns no results, or the request/parsing fails. Failures
        are reported on stdout rather than raised.
    """
    # Skip empty/None components so a missing region does not produce a
    # malformed query such as "Berlin, , DE" or "Berlin, None, DE".
    query = ", ".join(str(part) for part in (city, region, country) if part)
    if not query:
        # Nothing to search for; avoid a pointless request.
        return None, None, None

    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': query,
        'format': 'json',
        'limit': 1,
        'addressdetails': 1,
    }
    headers = {
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)',
    }

    time.sleep(1)  # Rate limit: 1 req/sec
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        results = response.json()
        if results:
            result = results[0]
            return (
                float(result['lat']),
                float(result['lon']),
                result.get('display_name', ''),
            )
    except (
        requests.RequestException,  # connection errors, timeouts, HTTP errors
        ValueError,                 # invalid JSON body or non-numeric lat/lon
        KeyError,                   # result without 'lat'/'lon', or dict payload
        IndexError,
        TypeError,
        AttributeError,             # payload that is not a list of dicts
    ) as e:
        # Best effort: report the problem and let the caller count a failure.
        print(f" ⚠️ Geocoding failed: {e}")
    return None, None, None
def _needs_geocoding(archive: dict) -> bool:
    """Return True when *archive* has a city but no stored latitude."""
    if not archive.get('city'):
        return False
    # Compare against None/"" instead of relying on truthiness: 0.0 is a
    # valid latitude and must not be treated as "missing".
    return archive.get('latitude') in (None, '')


def main():
    """Geocode the archives of a JSON harvest file given on the command line.

    Reads the JSON file named by ``sys.argv[1]``, which must contain an
    ``'archives'`` list of dicts. Every archive that has a ``'city'`` but no
    ``'latitude'`` is passed to ``geocode_location`` (using its ``'region'``
    and ``'country'``, the latter defaulting to ``'DE'``). Successful results
    are stored in ``'latitude'``, ``'longitude'`` and
    ``'geocoded_display_name'``.

    The updated data, plus a ``'geocoding_metadata'`` summary, is written to
    ``<input stem>_geocoded.json`` next to the input file. Nothing is written
    when no archive needs geocoding.

    Raises:
        SystemExit: With status 1 when no input file argument is given.
        KeyError: When the input JSON has no ``'archives'`` key.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/scrapers/geocode_json_harvest.py <input.json>")
        sys.exit(1)

    input_file = Path(sys.argv[1])
    print("🌍 Geocoding JSON Harvest")
    print(f"📁 Input: {input_file}")
    print()

    # Load data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    archives = data['archives']

    # Collect archives that have a city but no coordinates yet
    need_geocoding = [a for a in archives if _needs_geocoding(a)]
    total = len(need_geocoding)

    print("📊 Statistics:")
    print(f" Total archives: {len(archives)}")
    print(f" With cities: {sum(1 for a in archives if a.get('city'))}")
    print(f" Need geocoding: {total}")
    print()

    if not need_geocoding:
        print("✅ All archives already geocoded!")
        return

    print(f"🚀 Starting geocoding of {total} archives...")
    print()

    success = 0
    failed = 0
    for i, archive in enumerate(need_geocoding, 1):
        city = archive['city']
        # `or` also covers JSON nulls, which .get() would return as None.
        region = archive.get('region') or ''
        country = archive.get('country') or 'DE'

        # A missing name must not abort the whole (slow, rate-limited) run.
        print(f"[{i}/{total}] {archive.get('name', '(unnamed)')}")
        print(f" 📍 {city}, {region}, {country}")

        lat, lon, display = geocode_location(city, region, country)
        # Explicit None checks: 0.0 is a legitimate coordinate value.
        if lat is not None and lon is not None:
            archive['latitude'] = lat
            archive['longitude'] = lon
            archive['geocoded_display_name'] = display
            success += 1
            print(f"{lat:.6f}, {lon:.6f}")
        else:
            failed += 1
            print(" ❌ Failed")
        print()

    # Save updated data
    output_file = input_file.parent / f"{input_file.stem}_geocoded.json"
    data['geocoding_metadata'] = {
        'geocoded_at': datetime.now(timezone.utc).isoformat(),
        'success_count': success,
        'failed_count': failed,
        'total_processed': total,
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print("\n✅ Geocoding complete!")
    # total > 0 here because of the early return above.
    print(f" Success: {success}/{total} ({success/total*100:.1f}%)")
    print(f" Failed: {failed}")
    print(f" Output: {output_file}")
# Script entry point: run the geocoder only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()