- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
565 lines
22 KiB
Python
Executable file
565 lines
22 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
OpenStreetMap Enrichment Script for Latin American Institutions
|
|
|
|
Purpose: Fetch OpenStreetMap data for institutions with OSM identifiers and extract:
|
|
- Precise building-level coordinates (upgrade from city-level)
|
|
- Contact information (phone, email, website if tagged)
|
|
- Opening hours (if tagged)
|
|
- Street addresses (if more detailed than current data)
|
|
- Additional names/tags
|
|
|
|
Strategy:
|
|
1. Load documented Latin American institutions dataset
|
|
2. Find all institutions with OpenStreetMap identifiers (currently 186)
|
|
3. Fetch OSM data via Overpass API for each OSM ID
|
|
4. Parse JSON to extract location and contact metadata
|
|
5. Update institution records with enriched data
|
|
6. Generate enrichment report
|
|
|
|
Author: Global GLAM Dataset Project
|
|
Date: 2025-11-06
|
|
"""
|
|
|
|
import json
import re
import sys
import time
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
import yaml
|
|
|
|
# --- Overpass API configuration ---------------------------------------------
OVERPASS_URL = "https://overpass-api.de/api/interpreter"  # primary endpoint
OVERPASS_TIMEOUT = 30     # per-request timeout, seconds
RATE_LIMIT_DELAY = 2.0    # pause between requests, seconds (avoids HTTP 429)
MAX_RETRIES = 3           # attempts per OSM element before giving up
RETRY_DELAY = 5.0         # back-off before a retry, seconds

# Fallback Overpass instances, tried in order if the primary is unavailable.
OVERPASS_MIRRORS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://overpass.openstreetmap.ru/cgi/interpreter",
]
|
|
|
|
|
|
class OSMEnricher:
    """Enriches heritage institution records using OpenStreetMap data.

    Pipeline: load a YAML dataset of institutions, fetch each record's
    OSM element via the Overpass API, merge coordinates, addresses,
    contact details and alternative names back into the record, then
    save the enriched dataset and print a summary report.
    """

    def __init__(self, input_file: Path, output_file: Path):
        """
        Args:
            input_file: YAML file with the institution records to enrich.
            output_file: Destination path for the enriched YAML dataset.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.institutions: List[Dict[str, Any]] = []
        self.overpass_url = OVERPASS_MIRRORS[0]
        # Counters summarising what the enrichment run achieved.
        self.enrichment_stats = {
            'total_institutions': 0,
            'osm_ids_found': 0,
            'osm_records_fetched': 0,
            'osm_fetch_errors': 0,
            'coordinates_upgraded': 0,  # city-level -> building-level
            'addresses_improved': 0,
            'contact_info_added': 0,
            'opening_hours_added': 0,
            'alternative_names_added': 0,
            'websites_added': 0,
            'institutions_enriched': 0
        }
        # One log entry per enriched institution (name, OSM id, improvements).
        self.enrichment_details: List[Dict[str, Any]] = []

    def load_institutions(self):
        """Load institutions from the input YAML file into memory."""
        print(f"Loading institutions from {self.input_file}")
        with open(self.input_file, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; normalise to []
            # so len() and iteration below cannot crash.
            self.institutions = yaml.safe_load(f) or []

        self.enrichment_stats['total_institutions'] = len(self.institutions)
        print(f"Loaded {len(self.institutions)} institutions")

    def fetch_osm_data(self, osm_id: str) -> Optional[Dict[str, Any]]:
        """
        Fetch OSM data via Overpass API with retry logic.

        Args:
            osm_id: OpenStreetMap identifier ("way/123456", "node/123456",
                "relation/123456"; a bare number is treated as a node).

        Returns:
            The first matching OSM element as a dictionary, or None if the
            element was not found or all retries failed.
        """
        # Parse the "type/number" identifier; bare numbers default to node.
        if '/' in osm_id:
            osm_type, osm_number = osm_id.split('/')
        else:
            osm_type = 'node'
            osm_number = osm_id

        # Overpass QL: fetch the element by id, including its tags and a
        # computed center point (ways/relations carry no direct lat/lon).
        overpass_query = f"""
        [out:json][timeout:{OVERPASS_TIMEOUT}];
        {osm_type}({osm_number});
        out center tags;
        """

        # Retry loop for transient failures (rate limits, gateway errors).
        for attempt in range(MAX_RETRIES):
            try:
                if attempt > 0:
                    print(f" Retry {attempt}/{MAX_RETRIES-1}...")
                    time.sleep(RETRY_DELAY)

                response = requests.post(
                    self.overpass_url,
                    data={'data': overpass_query},
                    timeout=OVERPASS_TIMEOUT
                )

                if response.status_code == 200:
                    data = response.json()
                    elements = data.get('elements', [])
                    if elements:
                        return elements[0]  # Return first element
                    else:
                        print(f" ⚠️ OSM element not found: {osm_type}/{osm_number}")
                        return None
                elif response.status_code == 429:
                    # Rate limited: wait twice the normal back-off, retry.
                    if attempt < MAX_RETRIES - 1:
                        print(f" ⚠️ Rate limited (429), waiting {RETRY_DELAY*2}s...")
                        time.sleep(RETRY_DELAY * 2)
                        continue
                    else:
                        print(f" ⚠️ OSM fetch failed: HTTP 429 (rate limit)")
                        return None
                elif response.status_code in [504, 503, 502]:
                    # Gateway/overload errors are usually transient: retry.
                    if attempt < MAX_RETRIES - 1:
                        print(f" ⚠️ Server error ({response.status_code}), retrying...")
                        continue
                    else:
                        print(f" ⚠️ OSM fetch failed: HTTP {response.status_code}")
                        return None
                else:
                    # Any other status is treated as a permanent failure.
                    print(f" ⚠️ OSM fetch failed: HTTP {response.status_code}")
                    return None

            except requests.RequestException as e:
                # Network-level failure (DNS, timeout, connection reset).
                if attempt < MAX_RETRIES - 1:
                    print(f" ⚠️ Request error: {e}, retrying...")
                    continue
                else:
                    print(f" ❌ OSM fetch error: {e}")
                    return None
            except json.JSONDecodeError as e:
                # Malformed response body; retrying is unlikely to help.
                print(f" ❌ JSON parse error: {e}")
                return None

        return None

    def extract_coordinates(self, osm_element: Dict[str, Any]) -> Optional[Dict[str, float]]:
        """
        Extract precise coordinates from an OSM element.

        Nodes carry lat/lon directly; ways and relations expose a
        Overpass-computed 'center' object instead.

        Returns:
            {'latitude': float, 'longitude': float} or None.
        """
        # For nodes: lat/lon directly
        if 'lat' in osm_element and 'lon' in osm_element:
            return {
                'latitude': osm_element['lat'],
                'longitude': osm_element['lon']
            }

        # For ways/relations: use center coordinates
        if 'center' in osm_element:
            center = osm_element['center']
            if 'lat' in center and 'lon' in center:
                return {
                    'latitude': center['lat'],
                    'longitude': center['lon']
                }

        return None

    def extract_address(self, tags: Dict[str, str]) -> Dict[str, str]:
        """
        Extract address components from OSM ``addr:*`` tags.

        Returns:
            Dictionary with any of: street_address, city, postal_code,
            region, country. Empty dict when no address tags are present.
        """
        address: Dict[str, str] = {}

        # OSM address tags: addr:street, addr:housenumber, addr:city, addr:postcode
        street = tags.get('addr:street', '')
        housenumber = tags.get('addr:housenumber', '')

        if street and housenumber:
            address['street_address'] = f"{street} {housenumber}".strip()
        elif street:
            address['street_address'] = street

        if 'addr:city' in tags:
            address['city'] = tags['addr:city']

        if 'addr:postcode' in tags:
            address['postal_code'] = tags['addr:postcode']

        if 'addr:state' in tags:
            address['region'] = tags['addr:state']

        if 'addr:country' in tags:
            address['country'] = tags['addr:country']

        return address

    def extract_contact_info(self, tags: Dict[str, str]) -> Dict[str, str]:
        """
        Extract contact information from OSM tags.

        For each field the first matching tag wins (plain key before the
        ``contact:`` namespaced variant).

        Returns:
            Dictionary with any of: phone, email, website, opening_hours.
        """
        contact: Dict[str, str] = {}

        # Phone numbers
        for key in ['phone', 'contact:phone', 'telephone']:
            if key in tags:
                contact['phone'] = tags[key]
                break

        # Email
        for key in ['email', 'contact:email']:
            if key in tags:
                contact['email'] = tags[key]
                break

        # Website (distinct from identifier URLs)
        for key in ['website', 'contact:website', 'url']:
            if key in tags:
                contact['website'] = tags[key]
                break

        # Opening hours
        if 'opening_hours' in tags:
            contact['opening_hours'] = tags['opening_hours']

        return contact

    def extract_alternative_names(self, tags: Dict[str, str]) -> List[str]:
        """Extract deduplicated alternative name variants from OSM tags."""
        names: List[str] = []

        # Common name tags in OSM
        name_keys = [
            'alt_name', 'official_name', 'short_name', 'old_name',
            'name:en', 'name:es', 'name:pt'  # Common languages for Latin America
        ]

        for key in name_keys:
            if key in tags and tags[key]:
                name = tags[key].strip()
                if name and name not in names:
                    names.append(name)

        return names

    def enrich_institution(self, institution: Dict[str, Any]) -> bool:
        """
        Enrich a single institution record in place with OSM data.

        Looks up the record's OpenStreetMap identifier, fetches the
        element, and merges coordinates, address, contact info, opening
        hours and alternative names into the record.

        Returns:
            True if any field was added or improved, False otherwise.
        """
        # Find the record's OpenStreetMap identifier, if any.
        osm_id = None
        identifiers = institution.get('identifiers', [])

        for identifier in identifiers:
            if identifier.get('identifier_scheme') == 'OpenStreetMap':
                osm_id = identifier.get('identifier_value')
                break

        if not osm_id:
            return False

        self.enrichment_stats['osm_ids_found'] += 1

        print(f"\n🗺️ Enriching: {institution.get('name')} (OSM {osm_id})")

        # Fetch OSM data
        osm_element = self.fetch_osm_data(osm_id)
        if osm_element is None:
            self.enrichment_stats['osm_fetch_errors'] += 1
            return False

        self.enrichment_stats['osm_records_fetched'] += 1

        enriched = False
        enrichment_log = {
            'institution_name': institution.get('name'),
            'osm_id': osm_id,
            'improvements': []
        }

        tags = osm_element.get('tags', {})

        # --- Coordinates --------------------------------------------------
        coords = self.extract_coordinates(osm_element)
        if coords:
            locations = institution.get('locations', [])
            if locations:
                current_location = locations[0]
                current_lat = current_location.get('latitude')
                current_lon = current_location.get('longitude')

                # Compare with `is not None`: 0.0 is a valid coordinate
                # (the equator and prime meridian are real places — the
                # equator crosses Ecuador, Colombia and Brazil).
                if current_lat is not None and current_lon is not None:
                    # A shift of >0.001 deg (~100 m) means we are upgrading
                    # from city-level to building-level precision.
                    lat_diff = abs(coords['latitude'] - current_lat)
                    lon_diff = abs(coords['longitude'] - current_lon)

                    if lat_diff > 0.001 or lon_diff > 0.001:
                        print(f" ✅ Upgraded coordinates: precision improved")
                        current_location['latitude'] = coords['latitude']
                        current_location['longitude'] = coords['longitude']
                        self.enrichment_stats['coordinates_upgraded'] += 1
                        enrichment_log['improvements'].append('Coordinates upgraded to building-level precision')
                        enriched = True
                else:
                    # No coordinates yet, add them
                    print(f" ✅ Added coordinates: {coords['latitude']}, {coords['longitude']}")
                    current_location['latitude'] = coords['latitude']
                    current_location['longitude'] = coords['longitude']
                    self.enrichment_stats['coordinates_upgraded'] += 1
                    enrichment_log['improvements'].append('Added building coordinates')
                    enriched = True

        # --- Address ------------------------------------------------------
        address = self.extract_address(tags)
        if address:
            locations = institution.get('locations', [])
            if locations:
                current_location = locations[0]

                # Only fill in a street address when none is recorded yet.
                if 'street_address' in address and not current_location.get('street_address'):
                    print(f" ✅ Added street address: {address['street_address']}")
                    current_location['street_address'] = address['street_address']
                    self.enrichment_stats['addresses_improved'] += 1
                    enrichment_log['improvements'].append(f"Street address: {address['street_address']}")
                    enriched = True

                # Add postal code if missing
                if 'postal_code' in address and not current_location.get('postal_code'):
                    print(f" ✅ Added postal code: {address['postal_code']}")
                    current_location['postal_code'] = address['postal_code']
                    enrichment_log['improvements'].append(f"Postal code: {address['postal_code']}")
                    enriched = True

        # --- Contact information ------------------------------------------
        contact = self.extract_contact_info(tags)

        # Phone is stored as an identifier record; `ident` avoids shadowing
        # the builtin `id`.
        if 'phone' in contact:
            has_phone = any(
                ident.get('identifier_scheme') == 'Phone'
                for ident in identifiers
            )
            if not has_phone:
                print(f" ✅ Added phone: {contact['phone']}")
                identifiers.append({
                    'identifier_scheme': 'Phone',
                    'identifier_value': contact['phone'],
                    'identifier_url': None
                })
                self.enrichment_stats['contact_info_added'] += 1
                enrichment_log['improvements'].append(f"Phone: {contact['phone']}")
                enriched = True

        # Add email as identifier
        if 'email' in contact:
            has_email = any(
                ident.get('identifier_scheme') == 'Email'
                for ident in identifiers
            )
            if not has_email:
                print(f" ✅ Added email: {contact['email']}")
                identifiers.append({
                    'identifier_scheme': 'Email',
                    'identifier_value': contact['email'],
                    'identifier_url': f"mailto:{contact['email']}"
                })
                self.enrichment_stats['contact_info_added'] += 1
                enrichment_log['improvements'].append(f"Email: {contact['email']}")
                enriched = True

        # Add website only if different from every existing Website identifier.
        if 'website' in contact:
            existing_websites = [
                ident.get('identifier_value')
                for ident in identifiers
                if ident.get('identifier_scheme') == 'Website'
            ]
            if contact['website'] not in existing_websites:
                print(f" ✅ Added website: {contact['website']}")
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': contact['website'],
                    'identifier_url': contact['website']
                })
                self.enrichment_stats['websites_added'] += 1
                enrichment_log['improvements'].append(f"Website: {contact['website']}")
                enriched = True

        # --- Opening hours (appended to the free-text description) --------
        if 'opening_hours' in contact:
            description = institution.get('description', '')
            hours_text = f"Opening hours: {contact['opening_hours']}"

            # Substring check keeps the operation idempotent across reruns.
            if hours_text not in description:
                print(f" ✅ Added opening hours: {contact['opening_hours']}")
                if description:
                    institution['description'] = f"{description} {hours_text}"
                else:
                    institution['description'] = hours_text

                self.enrichment_stats['opening_hours_added'] += 1
                enrichment_log['improvements'].append(f"Opening hours: {contact['opening_hours']}")
                enriched = True

        # --- Alternative names --------------------------------------------
        alt_names = self.extract_alternative_names(tags)
        if alt_names:
            existing_alt_names = institution.get('alternative_names', [])
            new_names = [name for name in alt_names if name not in existing_alt_names]

            if new_names:
                print(f" ✅ Found {len(new_names)} alternative names")
                institution['alternative_names'] = existing_alt_names + new_names
                self.enrichment_stats['alternative_names_added'] += len(new_names)
                enrichment_log['improvements'].append(f"Alternative names: {', '.join(new_names[:3])}")
                enriched = True

        if enriched:
            self.enrichment_stats['institutions_enriched'] += 1
            self.enrichment_details.append(enrichment_log)

            # Record provenance with the actual run date (was hard-coded
            # to "2025-11-06", which would be wrong on any later run).
            if 'provenance' in institution:
                existing_notes = institution['provenance'].get('notes', '')
                run_date = datetime.now(timezone.utc).date().isoformat()
                osm_note = f"\nOpenStreetMap enrichment ({run_date}): Fetched OSM element {osm_id}. "
                osm_note += f"Improvements: {', '.join(enrichment_log['improvements'][:3])}."

                institution['provenance']['notes'] = (existing_notes + osm_note).strip()

        return enriched

    def process_all_institutions(self):
        """Enrich every loaded institution, rate-limiting between requests."""
        print(f"\n{'='*70}")
        print("OpenStreetMap Enrichment Process")
        print(f"{'='*70}\n")

        for idx, institution in enumerate(self.institutions, 1):
            enriched = self.enrich_institution(institution)

            if enriched:
                print(f" ✅ Enrichment successful")

            # Rate limiting (skip the pause after the final institution).
            if idx < len(self.institutions):
                time.sleep(RATE_LIMIT_DELAY)

        print(f"\n{'='*70}")
        print("OpenStreetMap Enrichment Complete")
        print(f"{'='*70}\n")

    def save_enriched_dataset(self):
        """Write the enriched institutions (plus a stats header) as YAML."""
        print(f"Saving enriched dataset to {self.output_file}")

        with open(self.output_file, 'w', encoding='utf-8') as f:
            # Comment header: generation timestamp and run statistics.
            f.write("---\n")
            f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
            f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write("#\n")
            f.write("# OpenStreetMap Enrichment Summary:\n")
            for key, value in self.enrichment_stats.items():
                f.write(f"# - {key}: {value}\n")
            f.write("\n")

            yaml.dump(self.institutions, f, allow_unicode=True, sort_keys=False)

        print(f"✅ Saved {len(self.institutions)} institutions")

    def generate_report(self):
        """Print a human-readable summary of the enrichment run to stdout."""
        print("\n" + "="*70)
        print("OPENSTREETMAP ENRICHMENT REPORT")
        print("="*70 + "\n")

        print(f"Total institutions processed: {self.enrichment_stats['total_institutions']}")
        print(f"Institutions with OSM IDs: {self.enrichment_stats['osm_ids_found']}")
        print(f"OSM records successfully fetched: {self.enrichment_stats['osm_records_fetched']}")
        print(f"OSM fetch errors: {self.enrichment_stats['osm_fetch_errors']}")
        print(f"\nEnrichment Results:")
        print(f" Coordinates upgraded: {self.enrichment_stats['coordinates_upgraded']}")
        print(f" Addresses improved: {self.enrichment_stats['addresses_improved']}")
        print(f" Contact info added: {self.enrichment_stats['contact_info_added']}")
        print(f" Opening hours added: {self.enrichment_stats['opening_hours_added']}")
        print(f" Alternative names added: {self.enrichment_stats['alternative_names_added']}")
        print(f" Websites added: {self.enrichment_stats['websites_added']}")
        print(f" Institutions enriched: {self.enrichment_stats['institutions_enriched']}")

        if self.enrichment_details:
            print(f"\nDetailed Enrichment Log (showing first 10):")
            for detail in self.enrichment_details[:10]:
                print(f"\n {detail['institution_name']} (OSM {detail['osm_id']})")
                for improvement in detail['improvements'][:3]:
                    print(f" + {improvement}")

        print("\n" + "="*70 + "\n")
|
|
|
|
|
|
def main():
    """Run the full OSM enrichment pipeline: load, enrich, save, report.

    Returns:
        0 on success, 1 if the input dataset is missing.
    """
    # Resolve dataset paths relative to the repository root.
    base_dir = Path(__file__).parent.parent
    instances_dir = base_dir / "data" / "instances"
    input_file = instances_dir / "latin_american_institutions_documented.yaml"
    output_file = instances_dir / "latin_american_institutions_osm_enriched.yaml"

    # Bail out early with a clear message if the source dataset is absent.
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        print(" Please ensure the documented dataset exists.")
        return 1

    # Drive the enrichment pipeline end to end.
    enricher = OSMEnricher(input_file, output_file)
    enricher.load_institutions()
    enricher.process_all_institutions()
    enricher.save_enriched_dataset()
    enricher.generate_report()

    print(f"✅ OpenStreetMap enrichment complete!")
    print(f" Input: {input_file}")
    print(f" Output: {output_file}")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # sys.exit propagates the return code to the shell; the builtin exit()
    # is a site-module convenience not guaranteed in all execution modes.
    sys.exit(main())
|