glam/scripts/enrich_from_osm_batched.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

418 lines
17 KiB
Python
Executable file

#!/usr/bin/env python3
"""
OpenStreetMap Enrichment Script - BATCHED VERSION
Saves progress every 20 institutions to avoid data loss on timeout.
"""
import yaml
import requests
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from collections import defaultdict
import time
import re
# Configuration
# NOTE(review): OVERPASS_URL appears unused — get_overpass_url() reads
# OVERPASS_MIRRORS instead; confirm before removing.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# Per-query timeout (seconds) embedded in the Overpass query; the HTTP
# request uses this value + 5 as its own timeout.
OVERPASS_TIMEOUT = 30
# Pause between consecutive institutions, to stay polite to the public API.
RATE_LIMIT_DELAY = 2.0
# Retry policy for rate limits (429), server errors (5xx) and timeouts.
MAX_RETRIES = 3
RETRY_DELAY = 5.0
BATCH_SIZE = 20 # Save progress every 20 institutions
# Fallback Overpass endpoints; rotated on 502/503/504 responses.
OVERPASS_MIRRORS = [
"https://overpass-api.de/api/interpreter",
"https://overpass.kumi.systems/api/interpreter",
"https://overpass.openstreetmap.ru/cgi/interpreter"
]
class OSMEnricher:
    """Enriches heritage institution records using OpenStreetMap data.

    Workflow: load a YAML list of institution dicts, fetch each record's
    OpenStreetMap element via the Overpass API, merge selected OSM tags
    (coordinates, address, contact info, website, opening hours, alternative
    names) into the record in place, and save progress in batches so a
    timeout does not lose completed work.
    """

    def __init__(self, input_file: Path, output_file: Path):
        """
        Args:
            input_file: YAML file containing a list of institution dicts.
            output_file: Destination YAML file for the enriched records.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.institutions: List[Dict] = []
        # Per-metric counters; missing keys read as 0.
        self.enrichment_stats = defaultdict(int)
        # One entry per enriched institution: name, osm_id, improvements.
        self.enrichment_details: List[Dict] = []
        # Index into OVERPASS_MIRRORS, advanced on 502/503/504 for failover.
        self.current_mirror = 0

    def load_institutions(self):
        """Load institutions from the input YAML file.

        Raises:
            ValueError: If the YAML document is not a list.
        """
        print(f"Loading institutions from {self.input_file}")
        with open(self.input_file, 'r', encoding='utf-8') as f:
            self.institutions = yaml.safe_load(f)
        if not isinstance(self.institutions, list):
            raise ValueError("Expected list of institutions in YAML file")
        self.enrichment_stats['total_institutions'] = len(self.institutions)
        print(f"✅ Loaded {len(self.institutions)} institutions\n")

    def get_overpass_url(self):
        """Return the current Overpass API endpoint (round-robin failover)."""
        return OVERPASS_MIRRORS[self.current_mirror % len(OVERPASS_MIRRORS)]

    def fetch_osm_element(self, osm_id: str, retry_count: int = 0) -> Optional[Dict]:
        """Fetch a single OSM element via the Overpass API.

        Args:
            osm_id: Identifier in "node/123", "way/456" or "relation/789" form.
            retry_count: Current retry depth (internal; capped at MAX_RETRIES).

        Returns:
            The element dict (tags, plus lat/lon for nodes or a 'center'
            coordinate for ways/relations), or None on any failure.
        """
        match = re.match(r'(node|way|relation)/(\d+)', osm_id)
        if not match:
            print(f" ⚠️ Invalid OSM ID format: {osm_id}")
            return None
        element_type, element_id = match.groups()
        # BUG FIX: the previous query used "out body; >; out skel qt;", which
        # recursed into member nodes (never read by this script) and provided
        # no coordinates for way/relation elements, since only nodes carry a
        # top-level lat/lon. "out body center" asks Overpass to compute a
        # center point for ways/relations instead.
        query = f"""
        [out:json][timeout:{OVERPASS_TIMEOUT}];
        {element_type}({element_id});
        out body center;
        """
        try:
            url = self.get_overpass_url()
            response = requests.post(
                url,
                data={'data': query},
                timeout=OVERPASS_TIMEOUT + 5
            )
            if response.status_code == 200:
                data = response.json()
                if data.get('elements'):
                    # The query selects a single element, so the first (and
                    # only) entry is the requested one.
                    return data['elements'][0]
                print(f" ⚠️ No data returned for OSM {osm_id}")
                return None
            elif response.status_code == 429:
                # Rate limited: back off, then retry against the same mirror.
                if retry_count < MAX_RETRIES:
                    wait_time = RETRY_DELAY * 2
                    print(f" ⚠️ Rate limited (429). Waiting {wait_time}s before retry {retry_count+1}/{MAX_RETRIES}")
                    time.sleep(wait_time)
                    return self.fetch_osm_element(osm_id, retry_count + 1)
                print(f" ❌ Rate limit exceeded after {MAX_RETRIES} retries")
                return None
            elif response.status_code in (502, 503, 504):
                # Server-side error: fail over to the next mirror and retry.
                if retry_count < MAX_RETRIES:
                    self.current_mirror += 1
                    new_url = self.get_overpass_url()
                    print(f" ⚠️ Server error ({response.status_code}). Switching to {new_url}")
                    time.sleep(RETRY_DELAY)
                    return self.fetch_osm_element(osm_id, retry_count + 1)
                print(f" ❌ Server error after {MAX_RETRIES} retries")
                return None
            else:
                print(f" ❌ HTTP {response.status_code}: {response.text[:100]}")
                return None
        except requests.Timeout:
            if retry_count < MAX_RETRIES:
                print(f" ⚠️ Timeout. Retry {retry_count+1}/{MAX_RETRIES}")
                time.sleep(RETRY_DELAY)
                return self.fetch_osm_element(osm_id, retry_count + 1)
            print(f" ❌ Timeout after {MAX_RETRIES} retries")
            return None
        except Exception as e:
            # Best-effort: any other failure (connection error, bad JSON)
            # is logged and treated as "no data" for this institution.
            print(f" ❌ Error fetching OSM data: {e}")
            return None

    def calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
        """Return the great-circle distance in meters between two points.

        Uses the Haversine formula with a mean Earth radius of 6,371 km.
        """
        from math import radians, sin, cos, sqrt, atan2
        R = 6371000  # Earth radius in meters
        lat1_rad = radians(lat1)
        lat2_rad = radians(lat2)
        dlat = radians(lat2 - lat1)
        dlon = radians(lon2 - lon1)
        a = sin(dlat/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon/2)**2
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        return R * c

    def enrich_institution(self, institution: Dict) -> bool:
        """Enrich a single institution record in place from its OSM element.

        Args:
            institution: Mutable institution dict (schema: name, identifiers,
                locations, description, alternative_names, provenance).

        Returns:
            True if at least one field was improved; False when the record
            has no OpenStreetMap identifier, the fetch failed, or nothing
            new could be added.
        """
        identifiers = institution.get('identifiers', [])
        osm_ids = [id for id in identifiers if id.get('identifier_scheme') == 'OpenStreetMap']
        if not osm_ids:
            return False
        self.enrichment_stats['osm_ids_found'] += 1
        osm_id = osm_ids[0]['identifier_value']
        inst_name = institution.get('name', 'Unknown')
        print(f"\n[{self.enrichment_stats['osm_ids_found']}] {inst_name}")
        print(f" OSM ID: {osm_id}")
        # Fetch OSM data
        osm_data = self.fetch_osm_element(osm_id)
        if not osm_data:
            self.enrichment_stats['osm_fetch_errors'] += 1
            return False
        self.enrichment_stats['osm_records_fetched'] += 1
        tags = osm_data.get('tags', {})
        enriched = False
        enrichment_log = {
            'institution_name': inst_name,
            'osm_id': osm_id,
            'improvements': []
        }
        # 1. Coordinates upgrade. Nodes carry lat/lon directly; ways and
        # relations expose them via the 'center' member requested in the
        # Overpass query (previously those element types were silently
        # skipped here).
        center = osm_data.get('center', {})
        raw_lat = osm_data.get('lat', center.get('lat'))
        raw_lon = osm_data.get('lon', center.get('lon'))
        if raw_lat is not None and raw_lon is not None:
            osm_lat = float(raw_lat)
            osm_lon = float(raw_lon)
            locations = institution.get('locations', [])
            if locations and locations[0].get('latitude') and locations[0].get('longitude'):
                current_lat = locations[0]['latitude']
                current_lon = locations[0]['longitude']
                distance = self.calculate_distance(current_lat, current_lon, osm_lat, osm_lon)
                if distance > 100:  # more than 100m difference = upgrade
                    locations[0]['latitude'] = osm_lat
                    locations[0]['longitude'] = osm_lon
                    enriched = True
                    self.enrichment_stats['coordinates_upgraded'] += 1
                    enrichment_log['improvements'].append(
                        f"Upgraded coordinates (precision improved by {int(distance)}m)"
                    )
            elif locations:
                # Record has a location entry but no coordinates yet.
                locations[0]['latitude'] = osm_lat
                locations[0]['longitude'] = osm_lon
                enriched = True
                self.enrichment_stats['coordinates_upgraded'] += 1
                enrichment_log['improvements'].append("Added precise coordinates from OSM")
        # 2. Street address
        addr_street = tags.get('addr:street')
        addr_housenumber = tags.get('addr:housenumber')
        addr_postcode = tags.get('addr:postcode')
        if addr_street or addr_housenumber or addr_postcode:
            locations = institution.get('locations', [])
            if locations:
                location = locations[0]
                if addr_street and addr_housenumber:
                    full_address = f"{addr_street} {addr_housenumber}"
                    # Prefer the longer (presumably more complete) address.
                    if not location.get('street_address') or len(full_address) > len(location.get('street_address', '')):
                        location['street_address'] = full_address
                        enriched = True
                        self.enrichment_stats['addresses_improved'] += 1
                        enrichment_log['improvements'].append(f"Added street address: {full_address}")
                if addr_postcode and not location.get('postal_code'):
                    location['postal_code'] = addr_postcode
                    enriched = True
                    enrichment_log['improvements'].append(f"Added postal code: {addr_postcode}")
        # 3. Contact information. OSM uses both bare and "contact:"-prefixed
        # tag variants; either is accepted.
        phone = tags.get('phone') or tags.get('contact:phone')
        email = tags.get('email') or tags.get('contact:email')
        website = tags.get('website') or tags.get('url') or tags.get('contact:website')
        if phone or email:
            # Store in description for now (no dedicated contact field in schema)
            contact_info = []
            if phone:
                contact_info.append(f"Phone: {phone}")
            if email:
                contact_info.append(f"Email: {email}")
            if contact_info:
                contact_text = " | ".join(contact_info)
                current_desc = institution.get('description', '')
                # Guard against appending the same contact line twice on re-runs.
                if contact_text not in current_desc:
                    institution['description'] = (current_desc + f"\n\nContact: {contact_text}").strip()
                    enriched = True
                    self.enrichment_stats['contact_info_added'] += 1
                    enrichment_log['improvements'].append(f"Added contact info: {contact_text}")
        # 4. Website
        if website:
            identifiers = institution.get('identifiers', [])
            website_ids = [id for id in identifiers if id.get('identifier_scheme') == 'Website']
            if not website_ids:
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': website,
                    'identifier_url': website
                })
                institution['identifiers'] = identifiers
                enriched = True
                self.enrichment_stats['websites_added'] += 1
                enrichment_log['improvements'].append(f"Added website: {website}")
        # 5. Opening hours
        opening_hours = tags.get('opening_hours')
        if opening_hours:
            current_desc = institution.get('description', '')
            hours_text = f"Opening hours: {opening_hours}"
            if hours_text not in current_desc:
                institution['description'] = (current_desc + f"\n\n{hours_text}").strip()
                enriched = True
                self.enrichment_stats['opening_hours_added'] += 1
                enrichment_log['improvements'].append(f"Added opening hours: {opening_hours}")
        # 6. Alternative names
        alt_names = []
        for key in ['alt_name', 'official_name', 'name:en', 'name:es', 'name:pt']:
            if key in tags and tags[key] != institution.get('name'):
                alt_names.append(tags[key])
        if alt_names:
            existing_alt_names = institution.get('alternative_names', [])
            new_alt_names = [n for n in alt_names if n not in existing_alt_names]
            if new_alt_names:
                institution['alternative_names'] = existing_alt_names + new_alt_names
                enriched = True
                self.enrichment_stats['alternative_names_added'] += len(new_alt_names)
                enrichment_log['improvements'].append(
                    f"Added {len(new_alt_names)} alternative names"
                )
        # Update provenance
        if enriched:
            self.enrichment_stats['institutions_enriched'] += 1
            self.enrichment_details.append(enrichment_log)
            if 'provenance' in institution:
                existing_notes = institution['provenance'].get('notes', '')
                # BUG FIX: the enrichment date was hard-coded ("2025-11-06");
                # record the actual run date instead.
                run_date = datetime.now(timezone.utc).date().isoformat()
                osm_note = f"\nOpenStreetMap enrichment ({run_date}): Fetched OSM element {osm_id}. "
                osm_note += f"Improvements: {', '.join(enrichment_log['improvements'][:3])}."
                institution['provenance']['notes'] = (existing_notes + osm_note).strip()
        return enriched

    def save_progress(self, batch_num: int):
        """Write the full (partially enriched) dataset to the output file.

        Rewrites the whole file each time, prefixed with a comment header
        carrying the current enrichment statistics.

        Args:
            batch_num: 1-based batch counter recorded in the header.
        """
        print(f"\n💾 Saving progress (batch {batch_num})...")
        with open(self.output_file, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
            f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Batch: {batch_num}\n")
            f.write("#\n")
            f.write("# OpenStreetMap Enrichment Summary (partial):\n")
            for key, value in self.enrichment_stats.items():
                f.write(f"# - {key}: {value}\n")
            f.write("\n")
            yaml.dump(self.institutions, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Saved progress: {self.enrichment_stats['institutions_enriched']} institutions enriched so far\n")

    def process_all_institutions(self):
        """Enrich every loaded institution, saving progress every BATCH_SIZE."""
        print(f"\n{'='*70}")
        print("OpenStreetMap Enrichment Process (BATCHED)")
        print(f"{'='*70}\n")
        batch_num = 0
        total = len(self.institutions)
        for idx, institution in enumerate(self.institutions, 1):
            fetches_before = self.enrichment_stats['osm_ids_found']
            enriched = self.enrich_institution(institution)
            if enriched:
                print(f" ✅ Enrichment successful")
            # Save progress every BATCH_SIZE institutions
            if idx % BATCH_SIZE == 0:
                batch_num += 1
                self.save_progress(batch_num)
            # PERF FIX: rate-limit only when an Overpass request was actually
            # made; previously the script slept for every record, including
            # those without an OSM identifier.
            if idx < total and self.enrichment_stats['osm_ids_found'] > fetches_before:
                time.sleep(RATE_LIMIT_DELAY)
        # Flush the final partial batch (full batches were saved in-loop).
        if total % BATCH_SIZE != 0:
            batch_num += 1
            self.save_progress(batch_num)
        print(f"\n{'='*70}")
        print("OpenStreetMap Enrichment Complete")
        print(f"{'='*70}\n")

    def generate_report(self):
        """Print a summary of the enrichment statistics and a detail log."""
        print("\n" + "="*70)
        print("OPENSTREETMAP ENRICHMENT REPORT")
        print("="*70 + "\n")
        print(f"Total institutions processed: {self.enrichment_stats['total_institutions']}")
        print(f"Institutions with OSM IDs: {self.enrichment_stats['osm_ids_found']}")
        print(f"OSM records successfully fetched: {self.enrichment_stats['osm_records_fetched']}")
        print(f"OSM fetch errors: {self.enrichment_stats['osm_fetch_errors']}")
        print(f"\nEnrichment Results:")
        print(f" Coordinates upgraded: {self.enrichment_stats['coordinates_upgraded']}")
        print(f" Addresses improved: {self.enrichment_stats['addresses_improved']}")
        print(f" Contact info added: {self.enrichment_stats['contact_info_added']}")
        print(f" Opening hours added: {self.enrichment_stats['opening_hours_added']}")
        print(f" Alternative names added: {self.enrichment_stats['alternative_names_added']}")
        print(f" Websites added: {self.enrichment_stats['websites_added']}")
        print(f" Institutions enriched: {self.enrichment_stats['institutions_enriched']}")
        if self.enrichment_details:
            print(f"\nDetailed Enrichment Log (showing first 20):")
            for detail in self.enrichment_details[:20]:
                print(f"\n {detail['institution_name']} (OSM {detail['osm_id']})")
                for improvement in detail['improvements'][:5]:
                    print(f" + {improvement}")
        print("\n" + "="*70 + "\n")
def main():
    """Main execution: load, enrich, and report on the institutions file.

    Returns:
        0 on success, 1 when the input file is missing.
    """
    instances_dir = Path(__file__).parent.parent / "data" / "instances"
    source_path = instances_dir / "latin_american_institutions_documented.yaml"
    target_path = instances_dir / "latin_american_institutions_osm_enriched.yaml"
    if not source_path.exists():
        print(f"❌ Error: Input file not found: {source_path}")
        return 1
    enricher = OSMEnricher(source_path, target_path)
    # Run the pipeline stages in order.
    for stage in (enricher.load_institutions,
                  enricher.process_all_institutions,
                  enricher.generate_report):
        stage()
    print(f"\n✅ Enrichment complete! Output saved to:")
    print(f" {target_path}")
    return 0
if __name__ == "__main__":
    # Use SystemExit rather than the builtin exit(): exit() is injected by
    # the site module and is not guaranteed to exist (e.g. under python -S
    # or in frozen builds).
    raise SystemExit(main())