124 lines
3.5 KiB
Python
Executable file
124 lines
3.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Quick geocoder for JSON harvest files.
|
|
Uses Nominatim API with rate limiting.
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import requests
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
import sys
|
|
|
|
def geocode_location(city: str, region: str, country: str) -> tuple:
    """Look up coordinates for a place via the Nominatim search API.

    The non-empty components of ``city``, ``region`` and ``country`` are
    joined with ", " into a free-text query. Blank or ``None`` components are
    skipped so that a missing region does not produce a malformed query such
    as ``"Berlin, , DE"``.

    Args:
        city: City name.
        region: Region/state name; may be empty.
        country: Country name or code.

    Returns:
        ``(latitude, longitude, display_name)`` taken from the first search
        hit, with the coordinates converted to ``float``; or
        ``(None, None, None)`` when there is nothing to query, the request
        fails, or the response contains no usable result.
    """
    no_result = (None, None, None)

    # Drop missing/blank components instead of sending "City, , Country".
    parts = [str(part).strip() for part in (city, region, country) if part is not None]
    parts = [part for part in parts if part]
    if not parts:
        # Nothing to search for: skip the rate-limit sleep and the request.
        return no_result
    query = ", ".join(parts)

    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': query,
        'format': 'json',
        'limit': 1,
        'addressdetails': 1,
    }
    headers = {
        'User-Agent': 'GLAM-Data-Project/1.0 (heritage institution research)',
    }

    time.sleep(1)  # Rate limit: 1 req/sec

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older requests versions.
        print(f" ⚠️ Geocoding failed: {e}")
        return no_result

    # An empty list means "no match"; anything that is not a list (e.g. an
    # error object) is treated the same way rather than being indexed.
    if not isinstance(results, list) or not results:
        return no_result

    try:
        result = results[0]
        return (
            float(result['lat']),
            float(result['lon']),
            result.get('display_name', ''),
        )
    except (KeyError, TypeError, ValueError, AttributeError) as e:
        # The first hit is missing coordinates or has an unexpected shape.
        print(f" ⚠️ Geocoding failed: {e}")
        return no_result
|
|
|
|
def main():
    """Geocode archives in a JSON harvest file and write a ``*_geocoded.json`` copy.

    Reads the file named by ``sys.argv[1]``, which must contain a JSON object
    with an ``archives`` list. Every archive that has a truthy ``city`` and no
    truthy ``latitude`` is passed to ``geocode_location``; on success its
    ``latitude``, ``longitude`` and ``geocoded_display_name`` keys are set.
    The updated data, plus a ``geocoding_metadata`` summary, is written next
    to the input as ``<input stem>_geocoded.json``.

    Exits with status 1 when no input path is given or when the file does not
    contain an ``archives`` list. Returns without writing anything when no
    archive needs geocoding.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/scrapers/geocode_json_harvest.py <input.json>")
        sys.exit(1)

    input_file = Path(sys.argv[1])

    print("🌍 Geocoding JSON Harvest")
    print(f"📁 Input: {input_file}")
    print()

    # Load data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Fail with a clear message instead of a KeyError/TypeError traceback
    # when the file does not have the expected top-level shape.
    archives = data.get('archives') if isinstance(data, dict) else None
    if not isinstance(archives, list):
        print("❌ Input JSON must be an object with an 'archives' list")
        sys.exit(1)

    # Archives that have a city but no (truthy) latitude yet
    need_geocoding = [a for a in archives if a.get('city') and not a.get('latitude')]
    total = len(need_geocoding)

    print("📊 Statistics:")
    print(f" Total archives: {len(archives)}")
    print(f" With cities: {sum(1 for a in archives if a.get('city'))}")
    print(f" Need geocoding: {total}")
    print()

    if not need_geocoding:
        print("✅ All archives already geocoded!")
        return

    print(f"🚀 Starting geocoding of {total} archives...")
    print()

    success = 0
    failed = 0

    for i, archive in enumerate(need_geocoding, 1):
        city = archive['city']
        # `or` also covers explicit JSON nulls, which would otherwise be
        # printed and queried as the string "None".
        region = archive.get('region') or ''
        country = archive.get('country') or 'DE'

        # A record without a name must not abort the whole batch.
        print(f"[{i}/{total}] {archive.get('name', '<unnamed>')}")
        print(f" 📍 {city}, {region}, {country}")

        lat, lon, display = geocode_location(city, region, country)

        # Compare against None: 0.0 is a valid coordinate but is falsy.
        if lat is not None and lon is not None:
            archive['latitude'] = lat
            archive['longitude'] = lon
            archive['geocoded_display_name'] = display
            success += 1
            print(f" ✅ {lat:.6f}, {lon:.6f}")
        else:
            failed += 1
            print(" ❌ Failed")

        print()

    # Save updated data alongside the input file
    output_file = input_file.parent / f"{input_file.stem}_geocoded.json"

    data['geocoding_metadata'] = {
        'geocoded_at': datetime.now(timezone.utc).isoformat(),
        'success_count': success,
        'failed_count': failed,
        'total_processed': total,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print("\n✅ Geocoding complete!")
    # total is non-zero here because of the early return above.
    print(f" Success: {success}/{total} ({success/total*100:.1f}%)")
    print(f" Failed: {failed}")
    print(f" Output: {output_file}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|