#!/usr/bin/env python3
"""
OpenStreetMap Enrichment Script - BATCHED VERSION

Saves progress every 20 institutions to avoid data loss on timeout.
"""

import json
import math
import re
import time
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests
import yaml

# Configuration
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
OVERPASS_TIMEOUT = 30
RATE_LIMIT_DELAY = 2.0
MAX_RETRIES = 3
RETRY_DELAY = 5.0
BATCH_SIZE = 20  # Save progress every 20 institutions

OVERPASS_MIRRORS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://overpass.openstreetmap.ru/cgi/interpreter"
]

# OSM identifiers look like "node/123", "way/456" or "relation/789".
OSM_ID_PATTERN = re.compile(r'(node|way|relation)/(\d+)')

# Existing coordinates are only replaced when OSM disagrees by more than this.
COORDINATE_UPGRADE_THRESHOLD_M = 100

# OSM tag keys that are harvested as alternative names, in priority order.
ALT_NAME_TAGS = ('alt_name', 'official_name', 'name:en', 'name:es', 'name:pt')


class OSMEnricher:
    """Enriches heritage institution records using OpenStreetMap data.

    Attributes:
        input_file: YAML file holding a list of institution dicts.
        output_file: YAML file the enriched list is written to.
        institutions: The loaded institution dicts (mutated in place).
        enrichment_stats: Counters keyed by statistic name.
        enrichment_details: One log entry per enriched institution.
        current_mirror: Index into OVERPASS_MIRRORS, advanced on server errors.
    """

    def __init__(self, input_file: Path, output_file: Path):
        self.input_file = input_file
        self.output_file = output_file
        self.institutions: List[Dict[str, Any]] = []
        self.enrichment_stats = defaultdict(int)
        self.enrichment_details: List[Dict[str, Any]] = []
        self.current_mirror = 0

    def load_institutions(self):
        """Load institutions from YAML file.

        Raises:
            ValueError: If the YAML document is not a list.
        """
        print(f"Loading institutions from {self.input_file}")

        with open(self.input_file, 'r', encoding='utf-8') as f:
            self.institutions = yaml.safe_load(f)

        if not isinstance(self.institutions, list):
            raise ValueError("Expected list of institutions in YAML file")

        self.enrichment_stats['total_institutions'] = len(self.institutions)
        print(f"✅ Loaded {len(self.institutions)} institutions\n")

    def get_overpass_url(self):
        """Get current Overpass API URL (with failover)."""
        return OVERPASS_MIRRORS[self.current_mirror % len(OVERPASS_MIRRORS)]

    def fetch_osm_element(self, osm_id: str, retry_count: int = 0) -> Optional[Dict]:
        """Fetch OSM element data via Overpass API.

        Args:
            osm_id: Identifier such as "node/123", "way/456" or "relation/789".
            retry_count: Number of retries already consumed.

        Returns:
            The element dict returned by Overpass (for ways and relations it
            carries a ``center`` object with ``lat``/``lon``), or None when the
            identifier is malformed, nothing was found, or the request failed.
        """
        # str() keeps a non-string identifier (e.g. a bare YAML integer) from
        # raising inside the regex; it is simply reported as invalid.
        match = OSM_ID_PATTERN.match(str(osm_id))
        if not match:
            print(f" ⚠️ Invalid OSM ID format: {osm_id}")
            return None
        element_type, element_id = match.groups()

        # "center" makes Overpass attach a representative coordinate to ways
        # and relations, which have no lat/lon of their own. Member nodes are
        # not requested because only the element itself is used.
        query = (
            f"[out:json][timeout:{OVERPASS_TIMEOUT}];\n"
            f"{element_type}({element_id});\n"
            "out body center;\n"
        )

        attempt = retry_count
        while True:
            try:
                response = requests.post(
                    self.get_overpass_url(),
                    data={'data': query},
                    timeout=OVERPASS_TIMEOUT + 5,
                )
                status = response.status_code

                if status == 200:
                    elements = response.json().get('elements')
                    if elements:
                        return elements[0]
                    print(f" ⚠️ No data returned for OSM {osm_id}")
                    return None

                if status == 429:
                    # Rate limit - wait and retry
                    if attempt >= MAX_RETRIES:
                        print(f" ❌ Rate limit exceeded after {MAX_RETRIES} retries")
                        return None
                    wait_time = RETRY_DELAY * 2
                    print(f" ⚠️ Rate limited (429). Waiting {wait_time}s before retry {attempt+1}/{MAX_RETRIES}")
                    time.sleep(wait_time)
                elif status in (502, 503, 504):
                    # Server error - retry with different mirror
                    if attempt >= MAX_RETRIES:
                        print(f" ❌ Server error after {MAX_RETRIES} retries")
                        return None
                    self.current_mirror += 1
                    new_url = self.get_overpass_url()
                    print(f" ⚠️ Server error ({status}). Switching to {new_url}")
                    time.sleep(RETRY_DELAY)
                else:
                    print(f" ❌ HTTP {status}: {response.text[:100]}")
                    return None
            except requests.Timeout:
                if attempt >= MAX_RETRIES:
                    print(f" ❌ Timeout after {MAX_RETRIES} retries")
                    return None
                print(f" ⚠️ Timeout. Retry {attempt+1}/{MAX_RETRIES}")
                time.sleep(RETRY_DELAY)
            except Exception as e:
                # Deliberate best-effort boundary: one failing lookup must not
                # abort the whole batch run.
                print(f" ❌ Error fetching OSM data: {e}")
                return None
            attempt += 1

    def calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
        """Calculate distance in meters between two coordinates (Haversine formula)."""
        earth_radius_m = 6371000

        lat1_rad = math.radians(lat1)
        lat2_rad = math.radians(lat2)
        dlat = math.radians(lat2 - lat1)
        dlon = math.radians(lon2 - lon1)

        a = (math.sin(dlat / 2) ** 2
             + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2) ** 2)
        # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
        # points, which would make sqrt(1 - a) raise.
        a = min(1.0, max(0.0, a))
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

        return earth_radius_m * c

    def enrich_institution(self, institution: Dict) -> bool:
        """Enrich a single institution from OSM data.

        The institution dict is mutated in place. Statistics and the
        per-institution log on this enricher are updated as a side effect.

        Args:
            institution: Institution record; only records carrying an
                identifier with scheme "OpenStreetMap" are looked up.

        Returns:
            True when at least one improvement was applied, otherwise False.
        """
        # Check for OSM identifier ("or []" also covers an explicit null)
        identifiers = institution.get('identifiers') or []
        osm_ids = [
            ident for ident in identifiers
            if ident.get('identifier_scheme') == 'OpenStreetMap'
        ]
        if not osm_ids:
            return False

        self.enrichment_stats['osm_ids_found'] += 1
        osm_id = osm_ids[0].get('identifier_value')
        inst_name = institution.get('name', 'Unknown')

        print(f"\n[{self.enrichment_stats['osm_ids_found']}] {inst_name}")
        print(f" OSM ID: {osm_id}")

        # Fetch OSM data
        osm_data = self.fetch_osm_element(osm_id)
        if not osm_data:
            self.enrichment_stats['osm_fetch_errors'] += 1
            return False
        self.enrichment_stats['osm_records_fetched'] += 1

        tags = osm_data.get('tags', {})
        improvements: List[str] = []

        self._upgrade_coordinates(institution, osm_data, improvements)
        self._improve_address(institution, tags, improvements)
        self._add_contact_info(institution, tags, improvements)
        self._add_website(institution, tags, improvements)
        self._add_opening_hours(institution, tags, improvements)
        self._add_alternative_names(institution, tags, improvements)

        if not improvements:
            return False

        self.enrichment_stats['institutions_enriched'] += 1
        self.enrichment_details.append({
            'institution_name': inst_name,
            'osm_id': osm_id,
            'improvements': improvements,
        })
        self._append_provenance_note(institution, osm_id, improvements)
        return True

    @staticmethod
    def _extract_coordinates(osm_data: Dict) -> Optional[Tuple[float, float]]:
        """Return (lat, lon) from a node, or from the center of a way/relation."""
        if 'lat' in osm_data and 'lon' in osm_data:
            source = osm_data
        else:
            source = osm_data.get('center') or {}
        try:
            return float(source['lat']), float(source['lon'])
        except (KeyError, TypeError, ValueError):
            return None

    def _upgrade_coordinates(self, institution: Dict, osm_data: Dict, improvements: List[str]) -> None:
        """Replace missing or clearly differing coordinates of the first location."""
        osm_coords = self._extract_coordinates(osm_data)
        locations = institution.get('locations') or []
        if osm_coords is None or not locations:
            return

        osm_lat, osm_lon = osm_coords
        location = locations[0]

        # float() instead of a truthiness test: 0.0 is a valid coordinate, and
        # values stored as strings are still usable. None/'' count as missing.
        try:
            current = (float(location.get('latitude')), float(location.get('longitude')))
        except (TypeError, ValueError):
            current = None

        if current is None:
            message = "Added precise coordinates from OSM"
        else:
            distance = self.calculate_distance(current[0], current[1], osm_lat, osm_lon)
            if distance <= COORDINATE_UPGRADE_THRESHOLD_M:
                return
            message = f"Upgraded coordinates (precision improved by {int(distance)}m)"

        location['latitude'] = osm_lat
        location['longitude'] = osm_lon
        self.enrichment_stats['coordinates_upgraded'] += 1
        improvements.append(message)

    def _improve_address(self, institution: Dict, tags: Dict, improvements: List[str]) -> None:
        """Fill street address and postal code of the first location from addr:* tags."""
        addr_street = tags.get('addr:street')
        addr_housenumber = tags.get('addr:housenumber')
        addr_postcode = tags.get('addr:postcode')
        locations = institution.get('locations') or []
        if not locations or not (addr_street or addr_housenumber or addr_postcode):
            return

        location = locations[0]

        if addr_street and addr_housenumber:
            full_address = f"{addr_street} {addr_housenumber}"
            existing_address = location.get('street_address') or ''
            # A longer OSM address is taken to be the more complete one.
            if not existing_address or len(full_address) > len(existing_address):
                location['street_address'] = full_address
                self.enrichment_stats['addresses_improved'] += 1
                improvements.append(f"Added street address: {full_address}")

        if addr_postcode and not location.get('postal_code'):
            location['postal_code'] = addr_postcode
            improvements.append(f"Added postal code: {addr_postcode}")

    def _add_contact_info(self, institution: Dict, tags: Dict, improvements: List[str]) -> None:
        """Append phone/email to the description (no dedicated contact field in schema)."""
        phone = tags.get('phone') or tags.get('contact:phone')
        email = tags.get('email') or tags.get('contact:email')

        contact_info = []
        if phone:
            contact_info.append(f"Phone: {phone}")
        if email:
            contact_info.append(f"Email: {email}")
        if not contact_info:
            return

        contact_text = " | ".join(contact_info)
        current_desc = institution.get('description') or ''
        if contact_text in current_desc:
            return

        institution['description'] = (current_desc + f"\n\nContact: {contact_text}").strip()
        self.enrichment_stats['contact_info_added'] += 1
        improvements.append(f"Added contact info: {contact_text}")

    def _add_website(self, institution: Dict, tags: Dict, improvements: List[str]) -> None:
        """Add a Website identifier when the institution has none yet."""
        website = tags.get('website') or tags.get('url') or tags.get('contact:website')
        if not website:
            return

        identifiers = institution.get('identifiers') or []
        if any(ident.get('identifier_scheme') == 'Website' for ident in identifiers):
            return

        identifiers.append({
            'identifier_scheme': 'Website',
            'identifier_value': website,
            'identifier_url': website
        })
        institution['identifiers'] = identifiers
        self.enrichment_stats['websites_added'] += 1
        improvements.append(f"Added website: {website}")

    def _add_opening_hours(self, institution: Dict, tags: Dict, improvements: List[str]) -> None:
        """Append the opening_hours tag to the description unless already present."""
        opening_hours = tags.get('opening_hours')
        if not opening_hours:
            return

        current_desc = institution.get('description') or ''
        hours_text = f"Opening hours: {opening_hours}"
        if hours_text in current_desc:
            return

        institution['description'] = (current_desc + f"\n\n{hours_text}").strip()
        self.enrichment_stats['opening_hours_added'] += 1
        improvements.append(f"Added opening hours: {opening_hours}")

    def _add_alternative_names(self, institution: Dict, tags: Dict, improvements: List[str]) -> None:
        """Add OSM name variants that are neither the main name nor already listed."""
        primary_name = institution.get('name')
        existing_alt_names = institution.get('alternative_names') or []

        new_alt_names: List[str] = []
        for key in ALT_NAME_TAGS:
            candidate = tags.get(key)
            if not candidate or candidate == primary_name:
                continue
            # Several tags often carry the same value; keep each name once.
            if candidate in existing_alt_names or candidate in new_alt_names:
                continue
            new_alt_names.append(candidate)

        if not new_alt_names:
            return

        institution['alternative_names'] = list(existing_alt_names) + new_alt_names
        self.enrichment_stats['alternative_names_added'] += len(new_alt_names)
        improvements.append(f"Added {len(new_alt_names)} alternative names")

    @staticmethod
    def _append_provenance_note(institution: Dict, osm_id: Any, improvements: List[str]) -> None:
        """Record the enrichment (dated today, UTC) in the provenance notes, if any."""
        provenance = institution.get('provenance')
        if not isinstance(provenance, dict):
            return

        existing_notes = provenance.get('notes') or ''
        today = datetime.now(timezone.utc).date().isoformat()
        osm_note = f"\nOpenStreetMap enrichment ({today}): Fetched OSM element {osm_id}. "
        osm_note += f"Improvements: {', '.join(improvements[:3])}."
        provenance['notes'] = (existing_notes + osm_note).strip()

    def save_progress(self, batch_num: int):
        """Save current progress to output file.

        The data is written to a sibling ``.tmp`` file first and then renamed
        over the output file, so an interruption during the write cannot leave
        a truncated checkpoint behind.

        Args:
            batch_num: Sequence number recorded in the file header.
        """
        print(f"\n💾 Saving progress (batch {batch_num})...")

        output_path = Path(self.output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = output_path.with_name(output_path.name + '.tmp')

        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
            f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Batch: {batch_num}\n")
            f.write("#\n")
            f.write("# OpenStreetMap Enrichment Summary (partial):\n")
            for key, value in self.enrichment_stats.items():
                f.write(f"# - {key}: {value}\n")
            f.write("\n")

            yaml.dump(self.institutions, f, allow_unicode=True, sort_keys=False)

        tmp_path.replace(output_path)

        print(f"✅ Saved progress: {self.enrichment_stats['institutions_enriched']} institutions enriched so far\n")

    def process_all_institutions(self):
        """Process all institutions with batched progress saving."""
        print(f"\n{'='*70}")
        print("OpenStreetMap Enrichment Process (BATCHED)")
        print(f"{'='*70}\n")

        total = len(self.institutions)
        batch_num = 0

        for idx, institution in enumerate(self.institutions, 1):
            lookups_before = self.enrichment_stats.get('osm_ids_found', 0)

            if self.enrich_institution(institution):
                print(" ✅ Enrichment successful")

            attempted_lookup = self.enrichment_stats.get('osm_ids_found', 0) != lookups_before

            # Save progress every BATCH_SIZE institutions
            if idx % BATCH_SIZE == 0:
                batch_num += 1
                self.save_progress(batch_num)

            # Rate limiting: only needed after an OSM lookup was attempted;
            # records without an OSM identifier cost no API call.
            if attempted_lookup and idx < total:
                time.sleep(RATE_LIMIT_DELAY)

        # Save final state if not already saved (an empty input still yields
        # an output file, so the run always leaves a result behind).
        if total == 0 or total % BATCH_SIZE != 0:
            batch_num += 1
            self.save_progress(batch_num)

        print(f"\n{'='*70}")
        print("OpenStreetMap Enrichment Complete")
        print(f"{'='*70}\n")

    def generate_report(self):
        """Generate enrichment report."""
        stats = self.enrichment_stats

        print("\n" + "="*70)
        print("OPENSTREETMAP ENRICHMENT REPORT")
        print("="*70 + "\n")

        print(f"Total institutions processed: {stats['total_institutions']}")
        print(f"Institutions with OSM IDs: {stats['osm_ids_found']}")
        print(f"OSM records successfully fetched: {stats['osm_records_fetched']}")
        print(f"OSM fetch errors: {stats['osm_fetch_errors']}")

        print("\nEnrichment Results:")
        print(f" Coordinates upgraded: {stats['coordinates_upgraded']}")
        print(f" Addresses improved: {stats['addresses_improved']}")
        print(f" Contact info added: {stats['contact_info_added']}")
        print(f" Opening hours added: {stats['opening_hours_added']}")
        print(f" Alternative names added: {stats['alternative_names_added']}")
        print(f" Websites added: {stats['websites_added']}")
        print(f" Institutions enriched: {stats['institutions_enriched']}")

        if self.enrichment_details:
            print("\nDetailed Enrichment Log (showing first 20):")
            for detail in self.enrichment_details[:20]:
                print(f"\n {detail['institution_name']} (OSM {detail['osm_id']})")
                for improvement in detail['improvements'][:5]:
                    print(f" + {improvement}")

        print("\n" + "="*70 + "\n")


def main():
    """Main execution.

    Returns:
        Process exit code: 0 on success, 1 when the input file is missing.
    """
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "latin_american_institutions_documented.yaml"
    output_file = base_dir / "data" / "instances" / "latin_american_institutions_osm_enriched.yaml"

    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        return 1

    enricher = OSMEnricher(input_file, output_file)
    enricher.load_institutions()
    enricher.process_all_institutions()
    enricher.generate_report()

    print("\n✅ Enrichment complete! Output saved to:")
    print(f" {output_file}")

    return 0


if __name__ == "__main__":
    # SystemExit instead of exit(): the latter is injected by the `site`
    # module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(main())