- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
418 lines
17 KiB
Python
Executable file
418 lines
17 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
OpenStreetMap Enrichment Script - BATCHED VERSION
|
|
Saves progress every 20 institutions to avoid data loss on timeout.
|
|
"""
|
|
|
|
import yaml
|
|
import requests
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
from collections import defaultdict
|
|
import time
|
|
import re
|
|
|
|
# Configuration
|
|
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
|
|
OVERPASS_TIMEOUT = 30
|
|
RATE_LIMIT_DELAY = 2.0
|
|
MAX_RETRIES = 3
|
|
RETRY_DELAY = 5.0
|
|
BATCH_SIZE = 20 # Save progress every 20 institutions
|
|
|
|
OVERPASS_MIRRORS = [
|
|
"https://overpass-api.de/api/interpreter",
|
|
"https://overpass.kumi.systems/api/interpreter",
|
|
"https://overpass.openstreetmap.ru/cgi/interpreter"
|
|
]
|
|
|
|
|
|
class OSMEnricher:
|
|
"""Enriches heritage institution records using OpenStreetMap data"""
|
|
|
|
def __init__(self, input_file: Path, output_file: Path):
|
|
self.input_file = input_file
|
|
self.output_file = output_file
|
|
self.institutions = []
|
|
self.enrichment_stats = defaultdict(int)
|
|
self.enrichment_details = []
|
|
self.current_mirror = 0
|
|
|
|
def load_institutions(self):
|
|
"""Load institutions from YAML file"""
|
|
print(f"Loading institutions from {self.input_file}")
|
|
with open(self.input_file, 'r', encoding='utf-8') as f:
|
|
self.institutions = yaml.safe_load(f)
|
|
|
|
if not isinstance(self.institutions, list):
|
|
raise ValueError("Expected list of institutions in YAML file")
|
|
|
|
self.enrichment_stats['total_institutions'] = len(self.institutions)
|
|
print(f"✅ Loaded {len(self.institutions)} institutions\n")
|
|
|
|
def get_overpass_url(self):
|
|
"""Get current Overpass API URL (with failover)"""
|
|
return OVERPASS_MIRRORS[self.current_mirror % len(OVERPASS_MIRRORS)]
|
|
|
|
def fetch_osm_element(self, osm_id: str, retry_count: int = 0) -> Optional[Dict]:
|
|
"""Fetch OSM element data via Overpass API"""
|
|
# Parse OSM ID format (e.g., "node/123", "way/456", "relation/789")
|
|
match = re.match(r'(node|way|relation)/(\d+)', osm_id)
|
|
if not match:
|
|
print(f" ⚠️ Invalid OSM ID format: {osm_id}")
|
|
return None
|
|
|
|
element_type, element_id = match.groups()
|
|
|
|
# Construct Overpass query
|
|
query = f"""
|
|
[out:json][timeout:{OVERPASS_TIMEOUT}];
|
|
{element_type}({element_id});
|
|
out body;
|
|
>;
|
|
out skel qt;
|
|
"""
|
|
|
|
try:
|
|
url = self.get_overpass_url()
|
|
response = requests.post(
|
|
url,
|
|
data={'data': query},
|
|
timeout=OVERPASS_TIMEOUT + 5
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
if data.get('elements'):
|
|
return data['elements'][0]
|
|
else:
|
|
print(f" ⚠️ No data returned for OSM {osm_id}")
|
|
return None
|
|
|
|
elif response.status_code == 429:
|
|
# Rate limit - wait and retry
|
|
if retry_count < MAX_RETRIES:
|
|
wait_time = RETRY_DELAY * 2
|
|
print(f" ⚠️ Rate limited (429). Waiting {wait_time}s before retry {retry_count+1}/{MAX_RETRIES}")
|
|
time.sleep(wait_time)
|
|
return self.fetch_osm_element(osm_id, retry_count + 1)
|
|
else:
|
|
print(f" ❌ Rate limit exceeded after {MAX_RETRIES} retries")
|
|
return None
|
|
|
|
elif response.status_code in [502, 503, 504]:
|
|
# Server error - retry with different mirror
|
|
if retry_count < MAX_RETRIES:
|
|
self.current_mirror += 1
|
|
new_url = self.get_overpass_url()
|
|
print(f" ⚠️ Server error ({response.status_code}). Switching to {new_url}")
|
|
time.sleep(RETRY_DELAY)
|
|
return self.fetch_osm_element(osm_id, retry_count + 1)
|
|
else:
|
|
print(f" ❌ Server error after {MAX_RETRIES} retries")
|
|
return None
|
|
|
|
else:
|
|
print(f" ❌ HTTP {response.status_code}: {response.text[:100]}")
|
|
return None
|
|
|
|
except requests.Timeout:
|
|
if retry_count < MAX_RETRIES:
|
|
print(f" ⚠️ Timeout. Retry {retry_count+1}/{MAX_RETRIES}")
|
|
time.sleep(RETRY_DELAY)
|
|
return self.fetch_osm_element(osm_id, retry_count + 1)
|
|
else:
|
|
print(f" ❌ Timeout after {MAX_RETRIES} retries")
|
|
return None
|
|
|
|
except Exception as e:
|
|
print(f" ❌ Error fetching OSM data: {e}")
|
|
return None
|
|
|
|
def calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
|
|
"""Calculate distance in meters between two coordinates (Haversine formula)"""
|
|
from math import radians, sin, cos, sqrt, atan2
|
|
|
|
R = 6371000 # Earth radius in meters
|
|
|
|
lat1_rad = radians(lat1)
|
|
lat2_rad = radians(lat2)
|
|
dlat = radians(lat2 - lat1)
|
|
dlon = radians(lon2 - lon1)
|
|
|
|
a = sin(dlat/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon/2)**2
|
|
c = 2 * atan2(sqrt(a), sqrt(1-a))
|
|
|
|
return R * c
|
|
|
|
def enrich_institution(self, institution: Dict) -> bool:
|
|
"""Enrich a single institution from OSM data"""
|
|
# Check for OSM identifier
|
|
identifiers = institution.get('identifiers', [])
|
|
osm_ids = [id for id in identifiers if id.get('identifier_scheme') == 'OpenStreetMap']
|
|
|
|
if not osm_ids:
|
|
return False
|
|
|
|
self.enrichment_stats['osm_ids_found'] += 1
|
|
osm_id = osm_ids[0]['identifier_value']
|
|
|
|
inst_name = institution.get('name', 'Unknown')
|
|
print(f"\n[{self.enrichment_stats['osm_ids_found']}] {inst_name}")
|
|
print(f" OSM ID: {osm_id}")
|
|
|
|
# Fetch OSM data
|
|
osm_data = self.fetch_osm_element(osm_id)
|
|
|
|
if not osm_data:
|
|
self.enrichment_stats['osm_fetch_errors'] += 1
|
|
return False
|
|
|
|
self.enrichment_stats['osm_records_fetched'] += 1
|
|
|
|
# Extract tags
|
|
tags = osm_data.get('tags', {})
|
|
enriched = False
|
|
enrichment_log = {
|
|
'institution_name': inst_name,
|
|
'osm_id': osm_id,
|
|
'improvements': []
|
|
}
|
|
|
|
# 1. Coordinates upgrade
|
|
if 'lat' in osm_data and 'lon' in osm_data:
|
|
osm_lat = float(osm_data['lat'])
|
|
osm_lon = float(osm_data['lon'])
|
|
|
|
locations = institution.get('locations', [])
|
|
if locations and locations[0].get('latitude') and locations[0].get('longitude'):
|
|
current_lat = locations[0]['latitude']
|
|
current_lon = locations[0]['longitude']
|
|
distance = self.calculate_distance(current_lat, current_lon, osm_lat, osm_lon)
|
|
|
|
if distance > 100: # More than 100m difference = upgrade
|
|
locations[0]['latitude'] = osm_lat
|
|
locations[0]['longitude'] = osm_lon
|
|
enriched = True
|
|
self.enrichment_stats['coordinates_upgraded'] += 1
|
|
enrichment_log['improvements'].append(
|
|
f"Upgraded coordinates (precision improved by {int(distance)}m)"
|
|
)
|
|
elif locations:
|
|
locations[0]['latitude'] = osm_lat
|
|
locations[0]['longitude'] = osm_lon
|
|
enriched = True
|
|
self.enrichment_stats['coordinates_upgraded'] += 1
|
|
enrichment_log['improvements'].append("Added precise coordinates from OSM")
|
|
|
|
# 2. Street address
|
|
addr_street = tags.get('addr:street')
|
|
addr_housenumber = tags.get('addr:housenumber')
|
|
addr_postcode = tags.get('addr:postcode')
|
|
|
|
if addr_street or addr_housenumber or addr_postcode:
|
|
locations = institution.get('locations', [])
|
|
if locations:
|
|
location = locations[0]
|
|
|
|
if addr_street and addr_housenumber:
|
|
full_address = f"{addr_street} {addr_housenumber}"
|
|
if not location.get('street_address') or len(full_address) > len(location.get('street_address', '')):
|
|
location['street_address'] = full_address
|
|
enriched = True
|
|
self.enrichment_stats['addresses_improved'] += 1
|
|
enrichment_log['improvements'].append(f"Added street address: {full_address}")
|
|
|
|
if addr_postcode and not location.get('postal_code'):
|
|
location['postal_code'] = addr_postcode
|
|
enriched = True
|
|
enrichment_log['improvements'].append(f"Added postal code: {addr_postcode}")
|
|
|
|
# 3. Contact information
|
|
phone = tags.get('phone') or tags.get('contact:phone')
|
|
email = tags.get('email') or tags.get('contact:email')
|
|
website = tags.get('website') or tags.get('url') or tags.get('contact:website')
|
|
|
|
contact_added = False
|
|
if phone or email:
|
|
# Store in description for now (no dedicated contact field in schema)
|
|
contact_info = []
|
|
if phone:
|
|
contact_info.append(f"Phone: {phone}")
|
|
if email:
|
|
contact_info.append(f"Email: {email}")
|
|
|
|
if contact_info:
|
|
contact_text = " | ".join(contact_info)
|
|
current_desc = institution.get('description', '')
|
|
if contact_text not in current_desc:
|
|
institution['description'] = (current_desc + f"\n\nContact: {contact_text}").strip()
|
|
enriched = True
|
|
contact_added = True
|
|
self.enrichment_stats['contact_info_added'] += 1
|
|
enrichment_log['improvements'].append(f"Added contact info: {contact_text}")
|
|
|
|
# 4. Website
|
|
if website:
|
|
identifiers = institution.get('identifiers', [])
|
|
website_ids = [id for id in identifiers if id.get('identifier_scheme') == 'Website']
|
|
|
|
if not website_ids:
|
|
identifiers.append({
|
|
'identifier_scheme': 'Website',
|
|
'identifier_value': website,
|
|
'identifier_url': website
|
|
})
|
|
institution['identifiers'] = identifiers
|
|
enriched = True
|
|
self.enrichment_stats['websites_added'] += 1
|
|
enrichment_log['improvements'].append(f"Added website: {website}")
|
|
|
|
# 5. Opening hours
|
|
opening_hours = tags.get('opening_hours')
|
|
if opening_hours:
|
|
current_desc = institution.get('description', '')
|
|
hours_text = f"Opening hours: {opening_hours}"
|
|
if hours_text not in current_desc:
|
|
institution['description'] = (current_desc + f"\n\n{hours_text}").strip()
|
|
enriched = True
|
|
self.enrichment_stats['opening_hours_added'] += 1
|
|
enrichment_log['improvements'].append(f"Added opening hours: {opening_hours}")
|
|
|
|
# 6. Alternative names
|
|
alt_names = []
|
|
for key in ['alt_name', 'official_name', 'name:en', 'name:es', 'name:pt']:
|
|
if key in tags and tags[key] != institution.get('name'):
|
|
alt_names.append(tags[key])
|
|
|
|
if alt_names:
|
|
existing_alt_names = institution.get('alternative_names', [])
|
|
new_alt_names = [n for n in alt_names if n not in existing_alt_names]
|
|
|
|
if new_alt_names:
|
|
institution['alternative_names'] = existing_alt_names + new_alt_names
|
|
enriched = True
|
|
self.enrichment_stats['alternative_names_added'] += len(new_alt_names)
|
|
enrichment_log['improvements'].append(
|
|
f"Added {len(new_alt_names)} alternative names"
|
|
)
|
|
|
|
# Update provenance
|
|
if enriched:
|
|
self.enrichment_stats['institutions_enriched'] += 1
|
|
self.enrichment_details.append(enrichment_log)
|
|
|
|
if 'provenance' in institution:
|
|
existing_notes = institution['provenance'].get('notes', '')
|
|
osm_note = f"\nOpenStreetMap enrichment (2025-11-06): Fetched OSM element {osm_id}. "
|
|
osm_note += f"Improvements: {', '.join(enrichment_log['improvements'][:3])}."
|
|
|
|
institution['provenance']['notes'] = (existing_notes + osm_note).strip()
|
|
|
|
return enriched
|
|
|
|
def save_progress(self, batch_num: int):
|
|
"""Save current progress to output file"""
|
|
print(f"\n💾 Saving progress (batch {batch_num})...")
|
|
|
|
with open(self.output_file, 'w', encoding='utf-8') as f:
|
|
f.write("---\n")
|
|
f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
|
|
f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
|
|
f.write(f"# Batch: {batch_num}\n")
|
|
f.write("#\n")
|
|
f.write("# OpenStreetMap Enrichment Summary (partial):\n")
|
|
for key, value in self.enrichment_stats.items():
|
|
f.write(f"# - {key}: {value}\n")
|
|
f.write("\n")
|
|
|
|
yaml.dump(self.institutions, f, allow_unicode=True, sort_keys=False)
|
|
|
|
print(f"✅ Saved progress: {self.enrichment_stats['institutions_enriched']} institutions enriched so far\n")
|
|
|
|
def process_all_institutions(self):
|
|
"""Process all institutions with batched progress saving"""
|
|
print(f"\n{'='*70}")
|
|
print("OpenStreetMap Enrichment Process (BATCHED)")
|
|
print(f"{'='*70}\n")
|
|
|
|
batch_num = 0
|
|
|
|
for idx, institution in enumerate(self.institutions, 1):
|
|
enriched = self.enrich_institution(institution)
|
|
|
|
if enriched:
|
|
print(f" ✅ Enrichment successful")
|
|
|
|
# Save progress every BATCH_SIZE institutions
|
|
if idx % BATCH_SIZE == 0:
|
|
batch_num += 1
|
|
self.save_progress(batch_num)
|
|
|
|
# Rate limiting
|
|
if idx < len(self.institutions):
|
|
time.sleep(RATE_LIMIT_DELAY)
|
|
|
|
# Save final state if not already saved
|
|
if len(self.institutions) % BATCH_SIZE != 0:
|
|
batch_num += 1
|
|
self.save_progress(batch_num)
|
|
|
|
print(f"\n{'='*70}")
|
|
print("OpenStreetMap Enrichment Complete")
|
|
print(f"{'='*70}\n")
|
|
|
|
def generate_report(self):
|
|
"""Generate enrichment report"""
|
|
print("\n" + "="*70)
|
|
print("OPENSTREETMAP ENRICHMENT REPORT")
|
|
print("="*70 + "\n")
|
|
|
|
print(f"Total institutions processed: {self.enrichment_stats['total_institutions']}")
|
|
print(f"Institutions with OSM IDs: {self.enrichment_stats['osm_ids_found']}")
|
|
print(f"OSM records successfully fetched: {self.enrichment_stats['osm_records_fetched']}")
|
|
print(f"OSM fetch errors: {self.enrichment_stats['osm_fetch_errors']}")
|
|
print(f"\nEnrichment Results:")
|
|
print(f" Coordinates upgraded: {self.enrichment_stats['coordinates_upgraded']}")
|
|
print(f" Addresses improved: {self.enrichment_stats['addresses_improved']}")
|
|
print(f" Contact info added: {self.enrichment_stats['contact_info_added']}")
|
|
print(f" Opening hours added: {self.enrichment_stats['opening_hours_added']}")
|
|
print(f" Alternative names added: {self.enrichment_stats['alternative_names_added']}")
|
|
print(f" Websites added: {self.enrichment_stats['websites_added']}")
|
|
print(f" Institutions enriched: {self.enrichment_stats['institutions_enriched']}")
|
|
|
|
if self.enrichment_details:
|
|
print(f"\nDetailed Enrichment Log (showing first 20):")
|
|
for detail in self.enrichment_details[:20]:
|
|
print(f"\n {detail['institution_name']} (OSM {detail['osm_id']})")
|
|
for improvement in detail['improvements'][:5]:
|
|
print(f" + {improvement}")
|
|
|
|
print("\n" + "="*70 + "\n")
|
|
|
|
|
|
def main():
|
|
"""Main execution"""
|
|
base_dir = Path(__file__).parent.parent
|
|
input_file = base_dir / "data" / "instances" / "latin_american_institutions_documented.yaml"
|
|
output_file = base_dir / "data" / "instances" / "latin_american_institutions_osm_enriched.yaml"
|
|
|
|
if not input_file.exists():
|
|
print(f"❌ Error: Input file not found: {input_file}")
|
|
return 1
|
|
|
|
enricher = OSMEnricher(input_file, output_file)
|
|
enricher.load_institutions()
|
|
enricher.process_all_institutions()
|
|
enricher.generate_report()
|
|
|
|
print(f"\n✅ Enrichment complete! Output saved to:")
|
|
print(f" {output_file}")
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
exit(main())
|