#!/usr/bin/env python3
"""
Resume OSM Enrichment - Process institutions 101-304

Continues an interrupted OpenStreetMap enrichment run over a YAML dataset of
Latin American GLAM institutions.  For every institution that carries an
OpenStreetMap identifier, the matching element is fetched via the Overpass
API and its coordinates, address, contact details, website, opening hours and
alternative names are merged into the record.  Progress is written back to
the same YAML file every BATCH_SIZE processed records so the run can be
resumed again after a crash or rate-limit ban.
"""
import yaml
import requests
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from collections import defaultdict
import time
import re

# Configuration
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
OVERPASS_TIMEOUT = 30
RATE_LIMIT_DELAY = 3.0  # Increased to 3 seconds
MAX_RETRIES = 3
RETRY_DELAY = 10.0  # Increased to 10 seconds
BATCH_SIZE = 20
START_INDEX = 100  # Resume from institution 101 (0-indexed = 100)

print(f"Resuming OSM enrichment from institution {START_INDEX + 1}")
print(f"Rate limit: {RATE_LIMIT_DELAY}s between requests")
print(f"Batch size: {BATCH_SIZE} institutions\n")

# Load partially enriched dataset
base_dir = Path(__file__).parent.parent
enriched_file = base_dir / "data" / "instances" / "latin_american_institutions_osm_enriched.yaml"

print(f"Loading {enriched_file}")
with open(enriched_file, 'r', encoding='utf-8') as f:
    content = f.read()

# Skip header: the file starts with a "#"-comment banner separated from the
# YAML body by a blank line (see save_dataset below, which writes it back).
yaml_content = content.split('\n\n', 1)[1] if '\n\n' in content else content
institutions = yaml.safe_load(yaml_content)

print(f"Loaded {len(institutions)} institutions")
print(f"Will process institutions {START_INDEX + 1} to {len(institutions)}\n")

# Stats
stats = defaultdict(int)
stats['total_institutions'] = len(institutions)
details = []


def fetch_osm_element(osm_id: str, retry_count: int = 0) -> Optional[Dict]:
    """Fetch OSM element data via Overpass API.

    Parameters
    ----------
    osm_id : str
        Identifier of the form ``node/123``, ``way/123`` or ``relation/123``.
    retry_count : int
        Internal recursion counter for the retry logic; callers leave it at 0.

    Returns
    -------
    Optional[Dict]
        The first element of the Overpass response (a dict with ``tags`` and,
        for nodes, ``lat``/``lon``), or ``None`` on malformed id, HTTP error,
        exhausted retries, or an empty result set.
    """
    match = re.match(r'(node|way|relation)/(\d+)', osm_id)
    if not match:
        return None
    element_type, element_id = match.groups()

    query = f"""
    [out:json][timeout:{OVERPASS_TIMEOUT}];
    {element_type}({element_id});
    out body;
    >;
    out skel qt;
    """
    try:
        response = requests.post(
            OVERPASS_URL,
            data={'data': query},
            timeout=OVERPASS_TIMEOUT + 5
        )
        if response.status_code == 200:
            data = response.json()
            # FIX: the original did data.get('elements', [None])[0], which
            # raises IndexError when Overpass returns an *empty* elements
            # list (the default only covered a missing key).
            elements = data.get('elements')
            return elements[0] if elements else None
        elif response.status_code == 429 and retry_count < MAX_RETRIES:
            # Overpass rate limiting — back off and retry.
            print(f" ⚠️ Rate limited. Waiting {RETRY_DELAY}s...")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        elif response.status_code in [502, 503, 504] and retry_count < MAX_RETRIES:
            # Transient server-side errors — retry after a delay.
            print(f" ⚠️ Server error {response.status_code}. Retry {retry_count+1}/{MAX_RETRIES}")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        else:
            print(f" ❌ HTTP {response.status_code}")
            return None
    except requests.Timeout:
        if retry_count < MAX_RETRIES:
            print(f" ⚠️ Timeout. Retry {retry_count+1}/{MAX_RETRIES}")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        return None
    except Exception as e:
        # Best-effort fetch: log and skip this element rather than abort the run.
        print(f" ❌ Error: {e}")
        return None


def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance in metres.

    Arguments are WGS84 latitudes/longitudes in decimal degrees.

    FIX: the original converted lat1/lat2 to radians and then applied
    ``radians()`` a second time to their difference, shrinking the
    north-south component of every distance by a factor of ~57 and
    corrupting the ``distance > 100`` coordinate-upgrade check below.
    """
    from math import radians, sin, cos, sqrt, atan2
    R = 6371000  # mean Earth radius in metres
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = radians(lat2 - lat1)      # delta computed from degrees, converted once
    dlambda = radians(lon2 - lon1)
    a = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlambda / 2) ** 2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))


def save_dataset(batch_number):
    """Write the (partially) enriched dataset back to ``enriched_file``.

    Emits the same comment banner the loader above skips (batch number and
    running stats), then the YAML body.  Factored out of the loop: the batch
    save and the final save previously duplicated this verbatim.
    """
    with open(enriched_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
        f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Batch: {batch_number}\n#\n")
        f.write("# OpenStreetMap Enrichment Summary:\n")
        for key, value in stats.items():
            f.write(f"# - {key}: {value}\n")
        f.write("\n")
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)


# Process institutions from START_INDEX onwards
batch_num = 5  # Continue from batch 5
processed_count = 0

for idx in range(START_INDEX, len(institutions)):
    institution = institutions[idx]

    # Check for OSM ID (note: 'ident' instead of shadowing the builtin 'id')
    identifiers = institution.get('identifiers', [])
    osm_ids = [ident for ident in identifiers
               if ident.get('identifier_scheme') == 'OpenStreetMap']
    if not osm_ids:
        continue

    stats['osm_ids_found'] += 1
    osm_id = osm_ids[0]['identifier_value']
    inst_name = institution.get('name', 'Unknown')

    print(f"\n[{idx + 1}/{len(institutions)}] {inst_name}")
    print(f" OSM ID: {osm_id}")

    # Fetch OSM data
    osm_data = fetch_osm_element(osm_id)
    if not osm_data:
        stats['osm_fetch_errors'] += 1
        processed_count += 1
        time.sleep(RATE_LIMIT_DELAY)
        continue

    stats['osm_records_fetched'] += 1
    tags = osm_data.get('tags', {})
    enriched = False
    improvements = []

    # 1. Coordinates — only OSM *nodes* carry lat/lon directly.
    if 'lat' in osm_data and 'lon' in osm_data:
        osm_lat, osm_lon = float(osm_data['lat']), float(osm_data['lon'])
        locations = institution.get('locations', [])
        if locations:
            # FIX: explicit None checks — a legitimate 0.0 latitude/longitude
            # (equator / prime meridian) is falsy and was treated as missing.
            if (locations[0].get('latitude') is not None
                    and locations[0].get('longitude') is not None):
                distance = calculate_distance(
                    locations[0]['latitude'], locations[0]['longitude'],
                    osm_lat, osm_lon
                )
                # Treat OSM as authoritative when the stored point is >100 m off.
                if distance > 100:
                    locations[0]['latitude'] = osm_lat
                    locations[0]['longitude'] = osm_lon
                    enriched = True
                    stats['coordinates_upgraded'] += 1
                    improvements.append(f"Coordinates upgraded ({int(distance)}m)")
            else:
                locations[0]['latitude'] = osm_lat
                locations[0]['longitude'] = osm_lon
                enriched = True
                stats['coordinates_upgraded'] += 1
                improvements.append("Added coordinates")

    # 2. Address
    addr_street = tags.get('addr:street')
    addr_num = tags.get('addr:housenumber')
    addr_postcode = tags.get('addr:postcode')
    if (addr_street or addr_postcode) and institution.get('locations'):
        location = institution['locations'][0]
        if addr_street and addr_num:
            full_address = f"{addr_street} {addr_num}"
            # Keep the longer (presumably more complete) address.
            if not location.get('street_address') or len(full_address) > len(location.get('street_address', '')):
                location['street_address'] = full_address
                enriched = True
                stats['addresses_improved'] += 1
                improvements.append(f"Address: {full_address}")
        if addr_postcode and not location.get('postal_code'):
            location['postal_code'] = addr_postcode
            enriched = True
            improvements.append(f"Postcode: {addr_postcode}")

    # 3. Contact info — appended to the free-text description (deduplicated).
    phone = tags.get('phone') or tags.get('contact:phone')
    email = tags.get('email') or tags.get('contact:email')
    if phone or email:
        contact_info = []
        if phone:
            contact_info.append(f"Phone: {phone}")
        if email:
            contact_info.append(f"Email: {email}")
        contact_text = " | ".join(contact_info)
        current_desc = institution.get('description', '')
        if contact_text not in current_desc:
            institution['description'] = (current_desc + f"\n\nContact: {contact_text}").strip()
            enriched = True
            stats['contact_info_added'] += 1
            improvements.append(f"Contact: {contact_text}")

    # 4. Website — added as a 'Website' identifier if none exists yet.
    website = tags.get('website') or tags.get('url') or tags.get('contact:website')
    if website:
        website_ids = [ident for ident in identifiers
                       if ident.get('identifier_scheme') == 'Website']
        if not website_ids:
            identifiers.append({
                'identifier_scheme': 'Website',
                'identifier_value': website,
                'identifier_url': website
            })
            institution['identifiers'] = identifiers
            enriched = True
            stats['websites_added'] += 1
            improvements.append(f"Website: {website}")

    # 5. Opening hours — appended to the description (deduplicated).
    opening_hours = tags.get('opening_hours')
    if opening_hours:
        current_desc = institution.get('description', '')
        hours_text = f"Opening hours: {opening_hours}"
        if hours_text not in current_desc:
            institution['description'] = (current_desc + f"\n\n{hours_text}").strip()
            enriched = True
            stats['opening_hours_added'] += 1
            improvements.append(f"Hours: {opening_hours}")

    # 6. Alternative names — localized/official variants not already recorded.
    alt_names = []
    for key in ['alt_name', 'official_name', 'name:en', 'name:es', 'name:pt']:
        if key in tags and tags[key] != institution.get('name'):
            alt_names.append(tags[key])
    if alt_names:
        existing = institution.get('alternative_names', [])
        new_names = [n for n in alt_names if n not in existing]
        if new_names:
            institution['alternative_names'] = existing + new_names
            enriched = True
            stats['alternative_names_added'] += len(new_names)
            improvements.append(f"{len(new_names)} alt names")

    # Update provenance
    if enriched:
        stats['institutions_enriched'] += 1
        details.append({'name': inst_name, 'osm_id': osm_id, 'improvements': improvements})
        if 'provenance' in institution:
            notes = institution['provenance'].get('notes', '')
            osm_note = f"\nOpenStreetMap enrichment (2025-11-06): {osm_id}. {', '.join(improvements[:3])}."
            institution['provenance']['notes'] = (notes + osm_note).strip()
        print(f" ✅ Enriched: {', '.join(improvements[:3])}")

    processed_count += 1

    # Save progress every BATCH_SIZE
    if processed_count % BATCH_SIZE == 0:
        batch_num += 1
        print(f"\n💾 Saving batch {batch_num}...")
        save_dataset(batch_num)
        print(f"✅ Saved. Enriched: {stats['institutions_enriched']}\n")

    # Rate limiting
    time.sleep(RATE_LIMIT_DELAY)

# Save final partial batch (anything not flushed by the modulo check above).
if processed_count % BATCH_SIZE != 0:
    batch_num += 1
    print(f"\n💾 Saving final batch {batch_num}...")
    save_dataset(batch_num)
    print(f"✅ Saved!\n")

# Report
print("\n" + "="*70)
print("OPENSTREETMAP ENRICHMENT REPORT (RESUMED)")
print("="*70)
print(f"\nProcessed: institutions {START_INDEX + 1} to {len(institutions)}")
print(f"OSM IDs found: {stats['osm_ids_found']}")
print(f"Records fetched: {stats['osm_records_fetched']}")
print(f"Fetch errors: {stats['osm_fetch_errors']}")
print(f"\nEnrichments:")
print(f" Coordinates: {stats['coordinates_upgraded']}")
print(f" Addresses: {stats['addresses_improved']}")
print(f" Contact: {stats['contact_info_added']}")
print(f" Hours: {stats['opening_hours_added']}")
print(f" Websites: {stats['websites_added']}")
print(f" Alt names: {stats['alternative_names_added']}")
print(f" Total enriched: {stats['institutions_enriched']}")
print("\n" + "="*70 + "\n")