#!/usr/bin/env python3
"""
OpenStreetMap Enrichment Script for Latin American Institutions

Purpose: Fetch OpenStreetMap data for institutions with OSM identifiers and extract:
- Precise building-level coordinates (upgrade from city-level)
- Contact information (phone, email, website if tagged)
- Opening hours (if tagged)
- Street addresses (if more detailed than current data)
- Additional names/tags

Strategy:
1. Load documented Latin American institutions dataset
2. Find all institutions with OpenStreetMap identifiers (currently 186)
3. Fetch OSM data via Overpass API for each OSM ID
4. Parse JSON to extract location and contact metadata
5. Update institution records with enriched data
6. Generate enrichment report

Author: Global GLAM Dataset Project
Date: 2025-11-06
"""

import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
import yaml

# Overpass API Configuration
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
OVERPASS_TIMEOUT = 30  # seconds
RATE_LIMIT_DELAY = 2.0  # seconds between requests (increased to avoid 429)
MAX_RETRIES = 3  # Retry failed requests up to 3 times
RETRY_DELAY = 5.0  # seconds to wait before retry

# Alternative Overpass instances (if main is down)
OVERPASS_MIRRORS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://overpass.openstreetmap.ru/cgi/interpreter",
]

# Coordinates differing by more than this (~100 m) count as a precision upgrade.
COORDINATE_UPGRADE_THRESHOLD = 0.001


class OSMEnricher:
    """Enriches heritage institution records using OpenStreetMap data."""

    def __init__(self, input_file: Path, output_file: Path):
        """
        Args:
            input_file: YAML file containing the institution records to enrich.
            output_file: Path where the enriched YAML dataset is written.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.institutions: List[Dict[str, Any]] = []
        self.overpass_url = OVERPASS_MIRRORS[0]
        # Counters reported by generate_report() and embedded in the output header.
        self.enrichment_stats = {
            'total_institutions': 0,
            'osm_ids_found': 0,
            'osm_records_fetched': 0,
            'osm_fetch_errors': 0,
            'coordinates_upgraded': 0,  # City-level -> Building-level
            'addresses_improved': 0,
            'contact_info_added': 0,
            'opening_hours_added': 0,
            'alternative_names_added': 0,
            'websites_added': 0,
            'institutions_enriched': 0,
        }
        # Per-institution log entries: {'institution_name', 'osm_id', 'improvements'}.
        self.enrichment_details: List[Dict[str, Any]] = []

    def load_institutions(self):
        """Load institutions from the YAML input file into self.institutions."""
        print(f"Loading institutions from {self.input_file}")
        with open(self.input_file, 'r', encoding='utf-8') as f:
            self.institutions = yaml.safe_load(f)
        self.enrichment_stats['total_institutions'] = len(self.institutions)
        print(f"Loaded {len(self.institutions)} institutions")

    def fetch_osm_data(self, osm_id: str) -> Optional[Dict[str, Any]]:
        """
        Fetch OSM data via Overpass API with retry logic.

        Args:
            osm_id: OpenStreetMap identifier (format: "way/123456" or
                "node/123456" or "relation/123456"; a bare number is
                treated as a node).

        Returns:
            OSM element data as a dictionary, or None if the fetch failed.
        """
        # Parse OSM ID format; maxsplit=1 tolerates stray slashes in the value.
        if '/' in osm_id:
            osm_type, osm_number = osm_id.split('/', 1)
        else:
            # Assume it's just a number, try as node first
            osm_type = 'node'
            osm_number = osm_id

        # Build Overpass QL query: element by ID, tags plus a computed center
        # point (ways/relations have no lat/lon of their own).
        overpass_query = f"""
        [out:json][timeout:{OVERPASS_TIMEOUT}];
        {osm_type}({osm_number});
        out center tags;
        """

        # Retry loop for transient failures (rate limits, 5xx, network errors).
        for attempt in range(MAX_RETRIES):
            try:
                if attempt > 0:
                    print(f"  Retry {attempt}/{MAX_RETRIES - 1}...")
                    time.sleep(RETRY_DELAY)

                response = requests.post(
                    self.overpass_url,
                    data={'data': overpass_query},
                    timeout=OVERPASS_TIMEOUT,
                )

                if response.status_code == 200:
                    data = response.json()
                    elements = data.get('elements', [])
                    if elements:
                        return elements[0]  # Return first element
                    print(f"  ⚠️ OSM element not found: {osm_type}/{osm_number}")
                    return None
                elif response.status_code == 429:
                    # Rate limit - wait longer and retry
                    if attempt < MAX_RETRIES - 1:
                        print(f"  ⚠️ Rate limited (429), waiting {RETRY_DELAY * 2}s...")
                        time.sleep(RETRY_DELAY * 2)
                        continue
                    print(f"  ⚠️ OSM fetch failed: HTTP 429 (rate limit)")
                    return None
                elif response.status_code in (502, 503, 504):
                    # Transient server error - retry
                    if attempt < MAX_RETRIES - 1:
                        print(f"  ⚠️ Server error ({response.status_code}), retrying...")
                        continue
                    print(f"  ⚠️ OSM fetch failed: HTTP {response.status_code}")
                    return None
                else:
                    print(f"  ⚠️ OSM fetch failed: HTTP {response.status_code}")
                    return None

            except requests.RequestException as e:
                if attempt < MAX_RETRIES - 1:
                    print(f"  ⚠️ Request error: {e}, retrying...")
                    continue
                print(f"  ❌ OSM fetch error: {e}")
                return None
            except json.JSONDecodeError as e:
                print(f"  ❌ JSON parse error: {e}")
                return None

        return None

    def extract_coordinates(self, osm_element: Dict[str, Any]) -> Optional[Dict[str, float]]:
        """
        Extract precise coordinates from an OSM element.

        Returns:
            {'latitude': float, 'longitude': float} or None if the element
            carries neither direct lat/lon nor a computed center.
        """
        # For nodes: lat/lon directly
        if 'lat' in osm_element and 'lon' in osm_element:
            return {
                'latitude': osm_element['lat'],
                'longitude': osm_element['lon'],
            }

        # For ways/relations: use center coordinates ("out center" in the query)
        center = osm_element.get('center', {})
        if 'lat' in center and 'lon' in center:
            return {
                'latitude': center['lat'],
                'longitude': center['lon'],
            }

        return None

    def extract_address(self, tags: Dict[str, str]) -> Dict[str, str]:
        """
        Extract address components from OSM tags.

        Returns:
            Dictionary with any of: street_address, city, postal_code,
            region, country (only keys present in the tags are included).
        """
        address = {}

        # OSM address tags: addr:street, addr:housenumber, addr:city, addr:postcode
        street = tags.get('addr:street', '')
        housenumber = tags.get('addr:housenumber', '')
        if street and housenumber:
            address['street_address'] = f"{street} {housenumber}".strip()
        elif street:
            address['street_address'] = street

        if 'addr:city' in tags:
            address['city'] = tags['addr:city']
        if 'addr:postcode' in tags:
            address['postal_code'] = tags['addr:postcode']
        if 'addr:state' in tags:
            address['region'] = tags['addr:state']
        if 'addr:country' in tags:
            address['country'] = tags['addr:country']

        return address

    def extract_contact_info(self, tags: Dict[str, str]) -> Dict[str, str]:
        """
        Extract contact information from OSM tags.

        Returns:
            Dictionary with any of: phone, email, website, opening_hours.
            For each field the first matching tag variant wins.
        """
        contact = {}

        # Phone numbers
        for key in ('phone', 'contact:phone', 'telephone'):
            if key in tags:
                contact['phone'] = tags[key]
                break

        # Email
        for key in ('email', 'contact:email'):
            if key in tags:
                contact['email'] = tags[key]
                break

        # Website (distinct from identifier URLs)
        for key in ('website', 'contact:website', 'url'):
            if key in tags:
                contact['website'] = tags[key]
                break

        # Opening hours
        if 'opening_hours' in tags:
            contact['opening_hours'] = tags['opening_hours']

        return contact

    def extract_alternative_names(self, tags: Dict[str, str]) -> List[str]:
        """Extract alternative name variants from OSM tags (deduplicated, in order)."""
        names: List[str] = []

        # Common name tags in OSM
        name_keys = [
            'alt_name',
            'official_name',
            'short_name',
            'old_name',
            'name:en', 'name:es', 'name:pt',  # Common languages for Latin America
        ]

        for key in name_keys:
            if key in tags and tags[key]:
                name = tags[key].strip()
                if name and name not in names:
                    names.append(name)

        return names

    def _find_osm_id(self, institution: Dict[str, Any]) -> Optional[str]:
        """Return the institution's OpenStreetMap identifier value, if any."""
        for ident in institution.get('identifiers', []):
            if ident.get('identifier_scheme') == 'OpenStreetMap':
                return ident.get('identifier_value')
        return None

    def _enrich_coordinates(self, institution: Dict[str, Any],
                            osm_element: Dict[str, Any],
                            log: Dict[str, Any]) -> bool:
        """Update the first location's coordinates from OSM. Returns True if changed."""
        coords = self.extract_coordinates(osm_element)
        if not coords:
            return False
        locations = institution.get('locations', [])
        if not locations:
            return False

        current_location = locations[0]
        current_lat = current_location.get('latitude')
        current_lon = current_location.get('longitude')

        # Use `is not None`: 0.0 is a valid coordinate (equator / prime meridian).
        if current_lat is not None and current_lon is not None:
            lat_diff = abs(coords['latitude'] - current_lat)
            lon_diff = abs(coords['longitude'] - current_lon)
            # If coordinates differ significantly (>0.001 degrees ~ 100m), it's an upgrade
            if lat_diff > COORDINATE_UPGRADE_THRESHOLD or lon_diff > COORDINATE_UPGRADE_THRESHOLD:
                print(f"  ✅ Upgraded coordinates: precision improved")
                current_location['latitude'] = coords['latitude']
                current_location['longitude'] = coords['longitude']
                self.enrichment_stats['coordinates_upgraded'] += 1
                log['improvements'].append('Coordinates upgraded to building-level precision')
                return True
            return False

        # No coordinates yet, add them
        print(f"  ✅ Added coordinates: {coords['latitude']}, {coords['longitude']}")
        current_location['latitude'] = coords['latitude']
        current_location['longitude'] = coords['longitude']
        self.enrichment_stats['coordinates_upgraded'] += 1
        log['improvements'].append('Added building coordinates')
        return True

    def _enrich_address(self, institution: Dict[str, Any],
                        tags: Dict[str, str],
                        log: Dict[str, Any]) -> bool:
        """Fill in missing street address / postal code. Returns True if changed."""
        address = self.extract_address(tags)
        if not address:
            return False
        locations = institution.get('locations', [])
        if not locations:
            return False

        current_location = locations[0]
        changed = False

        # Add street address only when none is recorded yet
        if 'street_address' in address and not current_location.get('street_address'):
            print(f"  ✅ Added street address: {address['street_address']}")
            current_location['street_address'] = address['street_address']
            self.enrichment_stats['addresses_improved'] += 1
            log['improvements'].append(f"Street address: {address['street_address']}")
            changed = True

        # Add postal code if missing
        if 'postal_code' in address and not current_location.get('postal_code'):
            print(f"  ✅ Added postal code: {address['postal_code']}")
            current_location['postal_code'] = address['postal_code']
            log['improvements'].append(f"Postal code: {address['postal_code']}")
            changed = True

        return changed

    def _enrich_contact(self, institution: Dict[str, Any],
                        tags: Dict[str, str],
                        log: Dict[str, Any]) -> bool:
        """Add phone/email/website identifiers and opening hours. Returns True if changed."""
        contact = self.extract_contact_info(tags)
        identifiers = institution.setdefault('identifiers', [])
        changed = False

        # Phone number -> identifier of scheme 'Phone'
        if 'phone' in contact:
            has_phone = any(
                ident.get('identifier_scheme') == 'Phone' for ident in identifiers
            )
            if not has_phone:
                print(f"  ✅ Added phone: {contact['phone']}")
                identifiers.append({
                    'identifier_scheme': 'Phone',
                    'identifier_value': contact['phone'],
                    'identifier_url': None,
                })
                self.enrichment_stats['contact_info_added'] += 1
                log['improvements'].append(f"Phone: {contact['phone']}")
                changed = True

        # Email -> identifier of scheme 'Email' with mailto: URL
        if 'email' in contact:
            has_email = any(
                ident.get('identifier_scheme') == 'Email' for ident in identifiers
            )
            if not has_email:
                print(f"  ✅ Added email: {contact['email']}")
                identifiers.append({
                    'identifier_scheme': 'Email',
                    'identifier_value': contact['email'],
                    'identifier_url': f"mailto:{contact['email']}",
                })
                self.enrichment_stats['contact_info_added'] += 1
                log['improvements'].append(f"Email: {contact['email']}")
                changed = True

        # Website -> identifier of scheme 'Website', only if not already recorded
        if 'website' in contact:
            existing_websites = [
                ident.get('identifier_value')
                for ident in identifiers
                if ident.get('identifier_scheme') == 'Website'
            ]
            if contact['website'] not in existing_websites:
                print(f"  ✅ Added website: {contact['website']}")
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': contact['website'],
                    'identifier_url': contact['website'],
                })
                self.enrichment_stats['websites_added'] += 1
                log['improvements'].append(f"Website: {contact['website']}")
                changed = True

        # Opening hours are appended to the free-text description
        if 'opening_hours' in contact:
            description = institution.get('description', '')
            hours_text = f"Opening hours: {contact['opening_hours']}"
            if hours_text not in description:
                print(f"  ✅ Added opening hours: {contact['opening_hours']}")
                if description:
                    institution['description'] = f"{description} {hours_text}"
                else:
                    institution['description'] = hours_text
                self.enrichment_stats['opening_hours_added'] += 1
                log['improvements'].append(f"Opening hours: {contact['opening_hours']}")
                changed = True

        return changed

    def _enrich_names(self, institution: Dict[str, Any],
                      tags: Dict[str, str],
                      log: Dict[str, Any]) -> bool:
        """Merge new alternative names from OSM tags. Returns True if changed."""
        alt_names = self.extract_alternative_names(tags)
        if not alt_names:
            return False

        existing_alt_names = institution.get('alternative_names', [])
        new_names = [name for name in alt_names if name not in existing_alt_names]
        if not new_names:
            return False

        print(f"  ✅ Found {len(new_names)} alternative names")
        institution['alternative_names'] = existing_alt_names + new_names
        self.enrichment_stats['alternative_names_added'] += len(new_names)
        log['improvements'].append(f"Alternative names: {', '.join(new_names[:3])}")
        return True

    def _update_provenance(self, institution: Dict[str, Any],
                           osm_id: str, log: Dict[str, Any]) -> None:
        """Append an enrichment note to the institution's provenance record."""
        if 'provenance' not in institution:
            return
        existing_notes = institution['provenance'].get('notes', '')
        today = datetime.now(timezone.utc).date().isoformat()
        osm_note = (
            f"\nOpenStreetMap enrichment ({today}): Fetched OSM element {osm_id}. "
            f"Improvements: {', '.join(log['improvements'][:3])}."
        )
        institution['provenance']['notes'] = (existing_notes + osm_note).strip()

    def enrich_institution(self, institution: Dict[str, Any]) -> bool:
        """
        Enrich a single institution with OSM data.

        Updates coordinates, address, contact identifiers, opening hours,
        alternative names, and provenance in place.

        Returns:
            True if any enrichment occurred, False otherwise.
        """
        osm_id = self._find_osm_id(institution)
        if not osm_id:
            return False

        self.enrichment_stats['osm_ids_found'] += 1
        print(f"\n🗺️ Enriching: {institution.get('name')} (OSM {osm_id})")

        osm_element = self.fetch_osm_data(osm_id)
        if osm_element is None:
            self.enrichment_stats['osm_fetch_errors'] += 1
            return False

        self.enrichment_stats['osm_records_fetched'] += 1

        enrichment_log = {
            'institution_name': institution.get('name'),
            'osm_id': osm_id,
            'improvements': [],
        }
        tags = osm_element.get('tags', {})

        # Each helper mutates the institution and returns whether it changed anything.
        enriched = False
        if self._enrich_coordinates(institution, osm_element, enrichment_log):
            enriched = True
        if self._enrich_address(institution, tags, enrichment_log):
            enriched = True
        if self._enrich_contact(institution, tags, enrichment_log):
            enriched = True
        if self._enrich_names(institution, tags, enrichment_log):
            enriched = True

        if enriched:
            self.enrichment_stats['institutions_enriched'] += 1
            self.enrichment_details.append(enrichment_log)
            self._update_provenance(institution, osm_id, enrichment_log)

        return enriched

    def process_all_institutions(self):
        """Process all institutions, enriching each from OpenStreetMap with rate limiting."""
        print(f"\n{'=' * 70}")
        print("OpenStreetMap Enrichment Process")
        print(f"{'=' * 70}\n")

        for idx, institution in enumerate(self.institutions, 1):
            enriched = self.enrich_institution(institution)
            if enriched:
                print(f"  ✅ Enrichment successful")
            # Rate limiting between requests; skip the pause after the last one
            if idx < len(self.institutions):
                time.sleep(RATE_LIMIT_DELAY)

        print(f"\n{'=' * 70}")
        print("OpenStreetMap Enrichment Complete")
        print(f"{'=' * 70}\n")

    def save_enriched_dataset(self):
        """Save enriched institutions to the output YAML file with a summary header."""
        print(f"Saving enriched dataset to {self.output_file}")

        with open(self.output_file, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
            f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write("#\n")
            f.write("# OpenStreetMap Enrichment Summary:\n")
            for key, value in self.enrichment_stats.items():
                f.write(f"# - {key}: {value}\n")
            f.write("\n")
            yaml.dump(self.institutions, f, allow_unicode=True, sort_keys=False)

        print(f"✅ Saved {len(self.institutions)} institutions")

    def generate_report(self):
        """Print an enrichment report with stats and a sample of detailed log entries."""
        print("\n" + "=" * 70)
        print("OPENSTREETMAP ENRICHMENT REPORT")
        print("=" * 70 + "\n")

        stats = self.enrichment_stats
        print(f"Total institutions processed: {stats['total_institutions']}")
        print(f"Institutions with OSM IDs: {stats['osm_ids_found']}")
        print(f"OSM records successfully fetched: {stats['osm_records_fetched']}")
        print(f"OSM fetch errors: {stats['osm_fetch_errors']}")
        print(f"\nEnrichment Results:")
        print(f"  Coordinates upgraded: {stats['coordinates_upgraded']}")
        print(f"  Addresses improved: {stats['addresses_improved']}")
        print(f"  Contact info added: {stats['contact_info_added']}")
        print(f"  Opening hours added: {stats['opening_hours_added']}")
        print(f"  Alternative names added: {stats['alternative_names_added']}")
        print(f"  Websites added: {stats['websites_added']}")
        print(f"  Institutions enriched: {stats['institutions_enriched']}")

        if self.enrichment_details:
            print(f"\nDetailed Enrichment Log (showing first 10):")
            for detail in self.enrichment_details[:10]:
                print(f"\n  {detail['institution_name']} (OSM {detail['osm_id']})")
                for improvement in detail['improvements'][:3]:
                    print(f"    + {improvement}")

        print("\n" + "=" * 70 + "\n")


def main():
    """Main execution: load, enrich, save, and report. Returns a process exit code."""
    # File paths relative to the repository layout (script lives one level below root)
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "latin_american_institutions_documented.yaml"
    output_file = base_dir / "data" / "instances" / "latin_american_institutions_osm_enriched.yaml"

    # Validate input file exists
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        print("   Please ensure the documented dataset exists.")
        return 1

    enricher = OSMEnricher(input_file, output_file)
    enricher.load_institutions()
    enricher.process_all_institutions()
    enricher.save_enriched_dataset()
    enricher.generate_report()

    print(f"✅ OpenStreetMap enrichment complete!")
    print(f"   Input: {input_file}")
    print(f"   Output: {output_file}")
    return 0


if __name__ == '__main__':
    sys.exit(main())