#!/usr/bin/env python3
"""
Quick geocoder for JSON harvest files.
Uses Nominatim API with rate limiting.
"""

import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

import requests

NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = 'GLAM-Data-Project/1.0 (heritage institution research)'
REQUEST_DELAY_SECONDS = 1  # Rate limit: 1 req/sec
REQUEST_TIMEOUT_SECONDS = 10

# NOTE(review): the status glyphs in the printed messages below look
# mis-encoded (mojibake). They are kept verbatim here; confirm the intended
# emoji before changing them.


def _build_query(city: str, region: str, country: str) -> str:
    """Join the non-empty location parts into a free-text search query."""
    # Skipping blank/None parts avoids queries such as "Berlin, , DE".
    return ", ".join(part for part in (city, region, country) if part)


def _needs_geocoding(archive: dict) -> bool:
    """Return True when the archive has a city but no stored latitude."""
    # Compare against None/'' rather than relying on truthiness: a latitude
    # of 0.0 is a valid coordinate and must not be treated as missing.
    return bool(archive.get('city')) and archive.get('latitude') in (None, '')


def geocode_location(
    city: str, region: str, country: str
) -> Tuple[Optional[float], Optional[float], Optional[str]]:
    """Geocode using Nominatim API.

    Sleeps for ``REQUEST_DELAY_SECONDS`` before every request so that
    consecutive calls stay within one request per second.

    Args:
        city: City name; the first component of the search query.
        region: Region name; omitted from the query when empty.
        country: Country name or code; omitted from the query when empty.

    Returns:
        ``(latitude, longitude, display_name)`` taken from the first search
        result, or ``(None, None, None)`` when there is no result or the
        request/response handling fails (the failure is printed, not raised).
    """
    params = {
        'q': _build_query(city, region, country),
        'format': 'json',
        'limit': 1,
        'addressdetails': 1,
    }
    headers = {'User-Agent': USER_AGENT}

    try:
        time.sleep(REQUEST_DELAY_SECONDS)
        response = requests.get(
            NOMINATIM_URL,
            params=params,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()

        results = response.json()
        if results:
            first = results[0]
            return (
                float(first['lat']),
                float(first['lon']),
                first.get('display_name', ''),
            )
    except (
        requests.RequestException,  # network / HTTP status errors
        ValueError,                 # invalid JSON or non-numeric lat/lon
        KeyError,                   # unexpected payload shape
        IndexError,
        TypeError,
        AttributeError,
    ) as e:
        # Best effort: report the problem and let the caller count a failure.
        print(f" āš ļø Geocoding failed: {e}")

    return None, None, None


def main():
    """Geocode every archive in a JSON harvest file that lacks coordinates.

    Reads the file named by the first command-line argument, looks up each
    archive that has a ``city`` but no ``latitude``, stores the resulting
    ``latitude``, ``longitude`` and ``geocoded_display_name`` on the archive,
    and writes the updated data plus a ``geocoding_metadata`` summary to
    ``<input stem>_geocoded.json`` next to the input file. Nothing is written
    when no archive needs geocoding.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/scrapers/geocode_json_harvest.py <input_file.json>")
        sys.exit(1)

    input_file = Path(sys.argv[1])

    print("šŸŒ Geocoding JSON Harvest")
    print(f"šŸ“ Input: {input_file}")
    print()

    # Load data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    archives = data['archives']

    # Count archives without coordinates
    need_geocoding = [a for a in archives if _needs_geocoding(a)]
    total = len(need_geocoding)

    print("šŸ“Š Statistics:")
    print(f" Total archives: {len(archives)}")
    print(f" With cities: {sum(1 for a in archives if a.get('city'))}")
    print(f" Need geocoding: {total}")
    print()

    if not need_geocoding:
        print("āœ… All archives already geocoded!")
        return

    print(f"šŸš€ Starting geocoding of {total} archives...")
    print()

    success = 0
    failed = 0

    for i, archive in enumerate(need_geocoding, 1):
        city = archive['city']
        region = archive.get('region', '')
        country = archive.get('country', 'DE')

        # A missing name should not abort the whole run.
        print(f"[{i}/{total}] {archive.get('name', '(unnamed)')}")
        print(f" šŸ“ {city}, {region}, {country}")

        lat, lon, display = geocode_location(city, region, country)

        # Explicit None checks: 0.0 is a legitimate latitude/longitude.
        if lat is not None and lon is not None:
            archive['latitude'] = lat
            archive['longitude'] = lon
            archive['geocoded_display_name'] = display
            success += 1
            print(f" āœ… {lat:.6f}, {lon:.6f}")
        else:
            failed += 1
            print(" āŒ Failed")
        print()

    # Save updated data
    output_file = input_file.parent / f"{input_file.stem}_geocoded.json"

    data['geocoding_metadata'] = {
        'geocoded_at': datetime.now(timezone.utc).isoformat(),
        'success_count': success,
        'failed_count': failed,
        'total_processed': total,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print("\nāœ… Geocoding complete!")
    # total is non-zero here because of the early return above.
    print(f" Success: {success}/{total} ({success/total*100:.1f}%)")
    print(f" Failed: {failed}")
    print(f" Output: {output_file}")


if __name__ == '__main__':
    main()