# --- Repository-browser metadata (residue from a web export; commented out
# --- so the file remains valid Python) ---------------------------------------
# glam/scripts/enrich_from_osm.py
# kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
# - Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
# - Added tests for extracted entities and result handling to validate the extraction process.
# - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
# - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
# - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
# 2025-11-19 23:20:47 +01:00
#
# 565 lines, 22 KiB, Python, Executable file
#!/usr/bin/env python3
"""
OpenStreetMap Enrichment Script for Latin American Institutions
Purpose: Fetch OpenStreetMap data for institutions with OSM identifiers and extract:
- Precise building-level coordinates (upgrade from city-level)
- Contact information (phone, email, website if tagged)
- Opening hours (if tagged)
- Street addresses (if more detailed than current data)
- Additional names/tags
Strategy:
1. Load documented Latin American institutions dataset
2. Find all institutions with OpenStreetMap identifiers (currently 186)
3. Fetch OSM data via Overpass API for each OSM ID
4. Parse JSON to extract location and contact metadata
5. Update institution records with enriched data
6. Generate enrichment report
Author: Global GLAM Dataset Project
Date: 2025-11-06
"""
import yaml
import requests
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from collections import defaultdict
import time
import re
# Overpass API Configuration
# NOTE(review): OVERPASS_URL appears unused — OSMEnricher reads
# OVERPASS_MIRRORS[0] instead; confirm before removing.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
OVERPASS_TIMEOUT = 30  # seconds (used both as HTTP timeout and Overpass [timeout:])
RATE_LIMIT_DELAY = 2.0  # seconds between requests (increased to avoid 429)
MAX_RETRIES = 3  # Retry failed requests up to 3 times
RETRY_DELAY = 5.0  # seconds to wait before retry (doubled on HTTP 429)
# Alternative Overpass instances (if main is down).
# Switching mirrors is manual: OSMEnricher.__init__ picks the first entry.
OVERPASS_MIRRORS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://overpass.openstreetmap.ru/cgi/interpreter"
]
class OSMEnricher:
    """Enriches heritage institution records using OpenStreetMap data"""

    def __init__(self, input_file: Path, output_file: Path):
        """Set up file paths, the Overpass endpoint, and run counters.

        Args:
            input_file: YAML file containing a list of institution dicts.
            output_file: Destination path for the enriched YAML dataset.
        """
        self.input_file = input_file
        self.output_file = output_file
        # Populated by load_institutions(); list of institution dicts.
        self.institutions: List[Dict[str, Any]] = []
        # Primary Overpass mirror; change manually if it is unavailable.
        self.overpass_url = OVERPASS_MIRRORS[0]
        # Counters printed by generate_report() and written into the
        # commented header of the output file.
        self.enrichment_stats = {
            'total_institutions': 0,
            'osm_ids_found': 0,
            'osm_records_fetched': 0,
            'osm_fetch_errors': 0,
            'coordinates_upgraded': 0,  # City-level → Building-level
            'addresses_improved': 0,
            'contact_info_added': 0,
            'opening_hours_added': 0,
            'alternative_names_added': 0,
            'websites_added': 0,
            'institutions_enriched': 0
        }
        # Per-institution log entries:
        # {'institution_name': ..., 'osm_id': ..., 'improvements': [...]}
        self.enrichment_details: List[Dict[str, Any]] = []
def load_institutions(self):
"""Load institutions from YAML file"""
print(f"Loading institutions from {self.input_file}")
with open(self.input_file, 'r', encoding='utf-8') as f:
self.institutions = yaml.safe_load(f)
self.enrichment_stats['total_institutions'] = len(self.institutions)
print(f"Loaded {len(self.institutions)} institutions")
    def fetch_osm_data(self, osm_id: str) -> Optional[Dict[str, Any]]:
        """
        Fetch OSM data via Overpass API with retry logic

        Args:
            osm_id: OpenStreetMap identifier (format: "way/123456" or "node/123456" or "relation/123456");
                a bare number is treated as a node ID

        Returns:
            OSM element data as dictionary or None if fetch failed
        """
        # Parse OSM ID format: split "type/number"; default type is node.
        if '/' in osm_id:
            osm_type, osm_number = osm_id.split('/')
        else:
            # Assume it's just a number, try as node first
            osm_type = 'node'
            osm_number = osm_id
        # Build Overpass QL query.
        # "out center tags;" returns the element's tags plus a computed
        # centroid for ways/relations (consumed by extract_coordinates).
        overpass_query = f"""
[out:json][timeout:{OVERPASS_TIMEOUT}];
{osm_type}({osm_number});
out center tags;
"""
        # Retry logic for transient failures (network errors, 429, 5xx).
        for attempt in range(MAX_RETRIES):
            try:
                if attempt > 0:
                    # Back off before every retry attempt.
                    print(f" Retry {attempt}/{MAX_RETRIES-1}...")
                    time.sleep(RETRY_DELAY)
                response = requests.post(
                    self.overpass_url,
                    data={'data': overpass_query},
                    timeout=OVERPASS_TIMEOUT
                )
                if response.status_code == 200:
                    data = response.json()
                    elements = data.get('elements', [])
                    if elements:
                        return elements[0]  # Return first element
                    else:
                        # Valid response but the ID matched nothing; no retry.
                        print(f" ⚠️ OSM element not found: {osm_type}/{osm_number}")
                        return None
                elif response.status_code == 429:
                    # Rate limit - wait longer (double delay) and retry
                    if attempt < MAX_RETRIES - 1:
                        print(f" ⚠️ Rate limited (429), waiting {RETRY_DELAY*2}s...")
                        time.sleep(RETRY_DELAY * 2)
                        continue
                    else:
                        print(f" ⚠️ OSM fetch failed: HTTP 429 (rate limit)")
                        return None
                elif response.status_code in [504, 503, 502]:
                    # Transient server error - retry (delay happens at loop top)
                    if attempt < MAX_RETRIES - 1:
                        print(f" ⚠️ Server error ({response.status_code}), retrying...")
                        continue
                    else:
                        print(f" ⚠️ OSM fetch failed: HTTP {response.status_code}")
                        return None
                else:
                    # Any other status (4xx etc.) is treated as permanent; no retry.
                    print(f" ⚠️ OSM fetch failed: HTTP {response.status_code}")
                    return None
            except requests.RequestException as e:
                # Connection/timeout problems are retried like server errors.
                if attempt < MAX_RETRIES - 1:
                    print(f" ⚠️ Request error: {e}, retrying...")
                    continue
                else:
                    print(f" ❌ OSM fetch error: {e}")
                    return None
            except json.JSONDecodeError as e:
                # Malformed body on an HTTP 200; retrying is unlikely to help.
                print(f" ❌ JSON parse error: {e}")
                return None
        # Defensive: loop exhausted without returning (should not happen).
        return None
def extract_coordinates(self, osm_element: Dict[str, Any]) -> Optional[Dict[str, float]]:
"""
Extract precise coordinates from OSM element
Returns:
{'latitude': float, 'longitude': float} or None
"""
# For nodes: lat/lon directly
if 'lat' in osm_element and 'lon' in osm_element:
return {
'latitude': osm_element['lat'],
'longitude': osm_element['lon']
}
# For ways/relations: use center coordinates
if 'center' in osm_element:
center = osm_element['center']
if 'lat' in center and 'lon' in center:
return {
'latitude': center['lat'],
'longitude': center['lon']
}
return None
def extract_address(self, tags: Dict[str, str]) -> Dict[str, str]:
"""
Extract address components from OSM tags
Returns:
Dictionary with address fields (street_address, city, postal_code, etc.)
"""
address = {}
# OSM address tags: addr:street, addr:housenumber, addr:city, addr:postcode
street = tags.get('addr:street', '')
housenumber = tags.get('addr:housenumber', '')
if street and housenumber:
address['street_address'] = f"{street} {housenumber}".strip()
elif street:
address['street_address'] = street
if 'addr:city' in tags:
address['city'] = tags['addr:city']
if 'addr:postcode' in tags:
address['postal_code'] = tags['addr:postcode']
if 'addr:state' in tags:
address['region'] = tags['addr:state']
if 'addr:country' in tags:
address['country'] = tags['addr:country']
return address
def extract_contact_info(self, tags: Dict[str, str]) -> Dict[str, str]:
"""
Extract contact information from OSM tags
Returns:
Dictionary with contact:phone, contact:email, contact:website, etc.
"""
contact = {}
# Phone numbers
for key in ['phone', 'contact:phone', 'telephone']:
if key in tags:
contact['phone'] = tags[key]
break
# Email
for key in ['email', 'contact:email']:
if key in tags:
contact['email'] = tags[key]
break
# Website (distinct from identifier URLs)
for key in ['website', 'contact:website', 'url']:
if key in tags:
contact['website'] = tags[key]
break
# Opening hours
if 'opening_hours' in tags:
contact['opening_hours'] = tags['opening_hours']
return contact
def extract_alternative_names(self, tags: Dict[str, str]) -> List[str]:
"""Extract alternative name variants from OSM tags"""
names = []
# Common name tags in OSM
name_keys = [
'alt_name', 'official_name', 'short_name', 'old_name',
'name:en', 'name:es', 'name:pt' # Common languages for Latin America
]
for key in name_keys:
if key in tags and tags[key]:
name = tags[key].strip()
if name and name not in names:
names.append(name)
return names
def enrich_institution(self, institution: Dict[str, Any]) -> bool:
"""
Enrich a single institution with OSM data
Returns:
True if enrichment occurred, False otherwise
"""
# Find OSM identifier
osm_id = None
identifiers = institution.get('identifiers', [])
for identifier in identifiers:
if identifier.get('identifier_scheme') == 'OpenStreetMap':
osm_id = identifier.get('identifier_value')
break
if not osm_id:
return False
self.enrichment_stats['osm_ids_found'] += 1
print(f"\n🗺️ Enriching: {institution.get('name')} (OSM {osm_id})")
# Fetch OSM data
osm_element = self.fetch_osm_data(osm_id)
if osm_element is None:
self.enrichment_stats['osm_fetch_errors'] += 1
return False
self.enrichment_stats['osm_records_fetched'] += 1
enriched = False
enrichment_log = {
'institution_name': institution.get('name'),
'osm_id': osm_id,
'improvements': []
}
tags = osm_element.get('tags', {})
# Extract and update coordinates
coords = self.extract_coordinates(osm_element)
if coords:
locations = institution.get('locations', [])
if locations:
# Check if we're upgrading from city-level to building-level
current_location = locations[0]
current_lat = current_location.get('latitude')
current_lon = current_location.get('longitude')
# If coordinates differ significantly (>0.001 degrees ≈ 100m), it's an upgrade
if current_lat and current_lon:
lat_diff = abs(coords['latitude'] - current_lat)
lon_diff = abs(coords['longitude'] - current_lon)
if lat_diff > 0.001 or lon_diff > 0.001:
print(f" ✅ Upgraded coordinates: precision improved")
current_location['latitude'] = coords['latitude']
current_location['longitude'] = coords['longitude']
self.enrichment_stats['coordinates_upgraded'] += 1
enrichment_log['improvements'].append('Coordinates upgraded to building-level precision')
enriched = True
else:
# No coordinates yet, add them
print(f" ✅ Added coordinates: {coords['latitude']}, {coords['longitude']}")
current_location['latitude'] = coords['latitude']
current_location['longitude'] = coords['longitude']
self.enrichment_stats['coordinates_upgraded'] += 1
enrichment_log['improvements'].append('Added building coordinates')
enriched = True
# Extract and update address
address = self.extract_address(tags)
if address:
locations = institution.get('locations', [])
if locations:
current_location = locations[0]
# Add street address if better than current
if 'street_address' in address and not current_location.get('street_address'):
print(f" ✅ Added street address: {address['street_address']}")
current_location['street_address'] = address['street_address']
self.enrichment_stats['addresses_improved'] += 1
enrichment_log['improvements'].append(f"Street address: {address['street_address']}")
enriched = True
# Add postal code if missing
if 'postal_code' in address and not current_location.get('postal_code'):
print(f" ✅ Added postal code: {address['postal_code']}")
current_location['postal_code'] = address['postal_code']
enrichment_log['improvements'].append(f"Postal code: {address['postal_code']}")
enriched = True
# Extract contact information
contact = self.extract_contact_info(tags)
# Add phone number as identifier
if 'phone' in contact:
has_phone = any(
id.get('identifier_scheme') == 'Phone'
for id in identifiers
)
if not has_phone:
print(f" ✅ Added phone: {contact['phone']}")
identifiers.append({
'identifier_scheme': 'Phone',
'identifier_value': contact['phone'],
'identifier_url': None
})
self.enrichment_stats['contact_info_added'] += 1
enrichment_log['improvements'].append(f"Phone: {contact['phone']}")
enriched = True
# Add email as identifier
if 'email' in contact:
has_email = any(
id.get('identifier_scheme') == 'Email'
for id in identifiers
)
if not has_email:
print(f" ✅ Added email: {contact['email']}")
identifiers.append({
'identifier_scheme': 'Email',
'identifier_value': contact['email'],
'identifier_url': f"mailto:{contact['email']}"
})
self.enrichment_stats['contact_info_added'] += 1
enrichment_log['improvements'].append(f"Email: {contact['email']}")
enriched = True
# Add website if different from existing
if 'website' in contact:
existing_websites = [
id.get('identifier_value')
for id in identifiers
if id.get('identifier_scheme') == 'Website'
]
if contact['website'] not in existing_websites:
print(f" ✅ Added website: {contact['website']}")
identifiers.append({
'identifier_scheme': 'Website',
'identifier_value': contact['website'],
'identifier_url': contact['website']
})
self.enrichment_stats['websites_added'] += 1
enrichment_log['improvements'].append(f"Website: {contact['website']}")
enriched = True
# Add opening hours to description or as note
if 'opening_hours' in contact:
# Add to description
description = institution.get('description', '')
hours_text = f"Opening hours: {contact['opening_hours']}"
if hours_text not in description:
print(f" ✅ Added opening hours: {contact['opening_hours']}")
if description:
institution['description'] = f"{description} {hours_text}"
else:
institution['description'] = hours_text
self.enrichment_stats['opening_hours_added'] += 1
enrichment_log['improvements'].append(f"Opening hours: {contact['opening_hours']}")
enriched = True
# Extract alternative names
alt_names = self.extract_alternative_names(tags)
if alt_names:
existing_alt_names = institution.get('alternative_names', [])
new_names = [name for name in alt_names if name not in existing_alt_names]
if new_names:
print(f" ✅ Found {len(new_names)} alternative names")
institution['alternative_names'] = existing_alt_names + new_names
self.enrichment_stats['alternative_names_added'] += len(new_names)
enrichment_log['improvements'].append(f"Alternative names: {', '.join(new_names[:3])}")
enriched = True
if enriched:
self.enrichment_stats['institutions_enriched'] += 1
self.enrichment_details.append(enrichment_log)
# Update provenance
if 'provenance' in institution:
existing_notes = institution['provenance'].get('notes', '')
osm_note = f"\nOpenStreetMap enrichment (2025-11-06): Fetched OSM element {osm_id}. "
osm_note += f"Improvements: {', '.join(enrichment_log['improvements'][:3])}."
institution['provenance']['notes'] = (existing_notes + osm_note).strip()
return enriched
def process_all_institutions(self):
"""Process all institutions and enrich from OpenStreetMap"""
print(f"\n{'='*70}")
print("OpenStreetMap Enrichment Process")
print(f"{'='*70}\n")
for idx, institution in enumerate(self.institutions, 1):
enriched = self.enrich_institution(institution)
if enriched:
print(f" ✅ Enrichment successful")
# Rate limiting
if idx < len(self.institutions):
time.sleep(RATE_LIMIT_DELAY)
print(f"\n{'='*70}")
print("OpenStreetMap Enrichment Complete")
print(f"{'='*70}\n")
def save_enriched_dataset(self):
"""Save enriched institutions to output file"""
print(f"Saving enriched dataset to {self.output_file}")
with open(self.output_file, 'w', encoding='utf-8') as f:
f.write("---\n")
f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
f.write("#\n")
f.write("# OpenStreetMap Enrichment Summary:\n")
for key, value in self.enrichment_stats.items():
f.write(f"# - {key}: {value}\n")
f.write("\n")
yaml.dump(self.institutions, f, allow_unicode=True, sort_keys=False)
print(f"✅ Saved {len(self.institutions)} institutions")
def generate_report(self):
"""Generate enrichment report"""
print("\n" + "="*70)
print("OPENSTREETMAP ENRICHMENT REPORT")
print("="*70 + "\n")
print(f"Total institutions processed: {self.enrichment_stats['total_institutions']}")
print(f"Institutions with OSM IDs: {self.enrichment_stats['osm_ids_found']}")
print(f"OSM records successfully fetched: {self.enrichment_stats['osm_records_fetched']}")
print(f"OSM fetch errors: {self.enrichment_stats['osm_fetch_errors']}")
print(f"\nEnrichment Results:")
print(f" Coordinates upgraded: {self.enrichment_stats['coordinates_upgraded']}")
print(f" Addresses improved: {self.enrichment_stats['addresses_improved']}")
print(f" Contact info added: {self.enrichment_stats['contact_info_added']}")
print(f" Opening hours added: {self.enrichment_stats['opening_hours_added']}")
print(f" Alternative names added: {self.enrichment_stats['alternative_names_added']}")
print(f" Websites added: {self.enrichment_stats['websites_added']}")
print(f" Institutions enriched: {self.enrichment_stats['institutions_enriched']}")
if self.enrichment_details:
print(f"\nDetailed Enrichment Log (showing first 10):")
for detail in self.enrichment_details[:10]:
print(f"\n {detail['institution_name']} (OSM {detail['osm_id']})")
for improvement in detail['improvements'][:3]:
print(f" + {improvement}")
print("\n" + "="*70 + "\n")
def main():
    """Entry point: load, enrich, save, and report.

    Returns:
        Process exit code: 0 on success, 1 when the input dataset is missing.
    """
    # Resolve dataset paths relative to the repository layout.
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data" / "instances"
    input_file = data_dir / "latin_american_institutions_documented.yaml"
    output_file = data_dir / "latin_american_institutions_osm_enriched.yaml"
    # Bail out early when the documented dataset is absent.
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        print(" Please ensure the documented dataset exists.")
        return 1
    enricher = OSMEnricher(input_file, output_file)
    enricher.load_institutions()
    enricher.process_all_institutions()
    enricher.save_enriched_dataset()
    enricher.generate_report()
    print(f"✅ OpenStreetMap enrichment complete!")
    print(f" Input: {input_file}")
    print(f" Output: {output_file}")
    return 0
if __name__ == '__main__':
    # Raise SystemExit directly: the site-injected exit() helper is not
    # guaranteed to exist in every runtime; behavior is otherwise identical.
    raise SystemExit(main())