- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
313 lines
12 KiB
Python
Executable file
313 lines
12 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Resume OSM Enrichment - Process institutions 101-304
|
|
"""
|
|
|
|
import yaml
|
|
import requests
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
from collections import defaultdict
|
|
import time
|
|
import re
|
|
|
|
# Configuration
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
OVERPASS_TIMEOUT = 30  # seconds granted to the Overpass server per query
RATE_LIMIT_DELAY = 3.0  # Increased to 3 seconds
MAX_RETRIES = 3
RETRY_DELAY = 10.0  # Increased to 10 seconds
BATCH_SIZE = 20
START_INDEX = 100  # Resume from institution 101 (0-indexed = 100)

# Announce the resume parameters up front so interrupted runs are auditable.
print(
    f"Resuming OSM enrichment from institution {START_INDEX + 1}\n"
    f"Rate limit: {RATE_LIMIT_DELAY}s between requests\n"
    f"Batch size: {BATCH_SIZE} institutions\n"
)
|
|
|
|
# Load partially enriched dataset (YAML body preceded by a '# ...' comment header).
base_dir = Path(__file__).parent.parent
enriched_file = base_dir / "data" / "instances" / "latin_american_institutions_osm_enriched.yaml"

print(f"Loading {enriched_file}")
raw_text = enriched_file.read_text(encoding='utf-8')

# Skip the comment header: everything before the first blank line, when one exists.
_head, _sep, _body = raw_text.partition('\n\n')
yaml_content = _body if _sep else raw_text
institutions = yaml.safe_load(yaml_content)

print(f"Loaded {len(institutions)} institutions")
print(f"Will process institutions {START_INDEX + 1} to {len(institutions)}\n")

# Stats — defaultdict(int) so every counter starts at 0 on first increment.
stats = defaultdict(int)
stats['total_institutions'] = len(institutions)
details = []  # per-institution enrichment summaries, appended in the main loop
|
def fetch_osm_element(osm_id: str, retry_count: int = 0) -> Optional[Dict]:
    """Fetch a single OSM element's data via the Overpass API.

    Args:
        osm_id: Identifier in the form "node/123", "way/123" or "relation/123".
        retry_count: Internal retry counter; callers leave it at 0.

    Returns:
        The first element dict from the Overpass JSON response, or None when
        the ID is malformed, the HTTP request fails, retries are exhausted,
        or the server returns an empty result set (e.g. deleted element).
    """
    match = re.match(r'(node|way|relation)/(\d+)', osm_id)
    if not match:
        return None

    element_type, element_id = match.groups()
    query = f"""
    [out:json][timeout:{OVERPASS_TIMEOUT}];
    {element_type}({element_id});
    out body;
    >;
    out skel qt;
    """

    try:
        response = requests.post(
            OVERPASS_URL,
            data={'data': query},
            timeout=OVERPASS_TIMEOUT + 5  # a bit longer than the server-side limit
        )

        if response.status_code == 200:
            data = response.json()
            # BUGFIX: data.get('elements', [None])[0] raised IndexError when
            # 'elements' was present but EMPTY (deleted/unknown element); the
            # broad except below silently swallowed it. Guard explicitly.
            elements = data.get('elements')
            return elements[0] if elements else None
        elif response.status_code == 429 and retry_count < MAX_RETRIES:
            # Overpass rate limiting — back off and retry.
            print(f" ⚠️ Rate limited. Waiting {RETRY_DELAY}s...")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        elif response.status_code in [502, 503, 504] and retry_count < MAX_RETRIES:
            # Transient server-side errors are retried with the same backoff.
            print(f" ⚠️ Server error {response.status_code}. Retry {retry_count+1}/{MAX_RETRIES}")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        else:
            print(f" ❌ HTTP {response.status_code}")
            return None
    except requests.Timeout:
        if retry_count < MAX_RETRIES:
            print(f" ⚠️ Timeout. Retry {retry_count+1}/{MAX_RETRIES}")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        return None
    except Exception as e:
        # Best-effort enrichment: log and skip rather than abort the whole run.
        print(f" ❌ Error: {e}")
        return None
|
|
|
|
def calculate_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in metres between two WGS84 points.

    Args:
        lat1, lon1: First point, decimal degrees.
        lat2, lon2: Second point, decimal degrees.

    Returns:
        Distance in metres (float).
    """
    from math import radians, sin, cos, sqrt, atan2
    R = 6371000  # mean Earth radius in metres
    # BUGFIX: the original rebound lat1/lat2 to radians and then applied
    # radians() AGAIN to their difference, shrinking dlat by a factor of
    # pi/180 (~57x) and badly underestimating north-south distances — which
    # corrupted the "> 100 m" coordinate-upgrade decision downstream.
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = phi2 - phi1
    dlambda = radians(lon2 - lon1)
    a = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlambda / 2) ** 2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))
|
|
|
|
# Process institutions from START_INDEX onwards.
# Mutates the `institutions` list in place and periodically writes the whole
# dataset back to `enriched_file`, so a crash loses at most one batch of work.
batch_num = 5  # Continue from batch 5
processed_count = 0  # counts institutions that HAD an OSM ID (drives batch saves)

for idx in range(START_INDEX, len(institutions)):
    institution = institutions[idx]

    # Check for OSM ID — only institutions carrying an OpenStreetMap
    # identifier can be enriched; everything else is skipped silently.
    identifiers = institution.get('identifiers', [])
    osm_ids = [id for id in identifiers if id.get('identifier_scheme') == 'OpenStreetMap']

    if not osm_ids:
        continue

    stats['osm_ids_found'] += 1
    osm_id = osm_ids[0]['identifier_value']  # only the first OSM ID is used
    inst_name = institution.get('name', 'Unknown')

    print(f"\n[{idx + 1}/{len(institutions)}] {inst_name}")
    print(f" OSM ID: {osm_id}")

    # Fetch OSM data (retries/backoff handled inside fetch_osm_element).
    osm_data = fetch_osm_element(osm_id)

    if not osm_data:
        # Count the failure but still advance + rate-limit so batch cadence holds.
        stats['osm_fetch_errors'] += 1
        processed_count += 1
        time.sleep(RATE_LIMIT_DELAY)
        continue

    stats['osm_records_fetched'] += 1
    tags = osm_data.get('tags', {})
    enriched = False        # set whenever any field below is improved
    improvements = []       # human-readable notes for provenance/logging

    # 1. Coordinates — only nodes carry lat/lon directly; ways/relations won't.
    if 'lat' in osm_data and 'lon' in osm_data:
        osm_lat, osm_lon = float(osm_data['lat']), float(osm_data['lon'])
        locations = institution.get('locations', [])

        if locations:
            if locations[0].get('latitude') and locations[0].get('longitude'):
                distance = calculate_distance(
                    locations[0]['latitude'], locations[0]['longitude'],
                    osm_lat, osm_lon
                )
                # Replace existing coordinates only when they disagree with
                # OSM by more than 100 m (OSM treated as the better source).
                if distance > 100:
                    locations[0]['latitude'] = osm_lat
                    locations[0]['longitude'] = osm_lon
                    enriched = True
                    stats['coordinates_upgraded'] += 1
                    improvements.append(f"Coordinates upgraded ({int(distance)}m)")
            else:
                # No coordinates yet — take OSM's unconditionally.
                locations[0]['latitude'] = osm_lat
                locations[0]['longitude'] = osm_lon
                enriched = True
                stats['coordinates_upgraded'] += 1
                improvements.append("Added coordinates")

    # 2. Address — street + housenumber and/or postcode from OSM addr:* tags.
    addr_street = tags.get('addr:street')
    addr_num = tags.get('addr:housenumber')
    addr_postcode = tags.get('addr:postcode')

    if (addr_street or addr_postcode) and institution.get('locations'):
        location = institution['locations'][0]

        if addr_street and addr_num:
            full_address = f"{addr_street} {addr_num}"
            # Longer address string is assumed more complete — heuristic.
            if not location.get('street_address') or len(full_address) > len(location.get('street_address', '')):
                location['street_address'] = full_address
                enriched = True
                stats['addresses_improved'] += 1
                improvements.append(f"Address: {full_address}")

        # Postcode is only filled in, never overwritten.
        if addr_postcode and not location.get('postal_code'):
            location['postal_code'] = addr_postcode
            enriched = True
            improvements.append(f"Postcode: {addr_postcode}")

    # 3. Contact info — appended to the free-text description, deduplicated
    # by a simple substring check.
    phone = tags.get('phone') or tags.get('contact:phone')
    email = tags.get('email') or tags.get('contact:email')

    if phone or email:
        contact_info = []
        if phone:
            contact_info.append(f"Phone: {phone}")
        if email:
            contact_info.append(f"Email: {email}")

        contact_text = " | ".join(contact_info)
        current_desc = institution.get('description', '')

        if contact_text not in current_desc:
            institution['description'] = (current_desc + f"\n\nContact: {contact_text}").strip()
            enriched = True
            stats['contact_info_added'] += 1
            improvements.append(f"Contact: {contact_text}")

    # 4. Website — added as a new 'Website' identifier, only if none exists.
    website = tags.get('website') or tags.get('url') or tags.get('contact:website')
    if website:
        website_ids = [id for id in identifiers if id.get('identifier_scheme') == 'Website']
        if not website_ids:
            identifiers.append({
                'identifier_scheme': 'Website',
                'identifier_value': website,
                'identifier_url': website
            })
            institution['identifiers'] = identifiers
            enriched = True
            stats['websites_added'] += 1
            improvements.append(f"Website: {website}")

    # 5. Opening hours — appended to the description, substring-deduplicated.
    opening_hours = tags.get('opening_hours')
    if opening_hours:
        current_desc = institution.get('description', '')
        hours_text = f"Opening hours: {opening_hours}"
        if hours_text not in current_desc:
            institution['description'] = (current_desc + f"\n\n{hours_text}").strip()
            enriched = True
            stats['opening_hours_added'] += 1
            improvements.append(f"Hours: {opening_hours}")

    # 6. Alternative names — collect OSM name variants differing from the
    # primary name; merge into alternative_names without duplicates.
    alt_names = []
    for key in ['alt_name', 'official_name', 'name:en', 'name:es', 'name:pt']:
        if key in tags and tags[key] != institution.get('name'):
            alt_names.append(tags[key])

    if alt_names:
        existing = institution.get('alternative_names', [])
        new_names = [n for n in alt_names if n not in existing]
        if new_names:
            institution['alternative_names'] = existing + new_names
            enriched = True
            stats['alternative_names_added'] += len(new_names)
            improvements.append(f"{len(new_names)} alt names")

    # Update provenance — record what changed and where it came from.
    if enriched:
        stats['institutions_enriched'] += 1
        details.append({'name': inst_name, 'osm_id': osm_id, 'improvements': improvements})

        if 'provenance' in institution:
            notes = institution['provenance'].get('notes', '')
            # NOTE(review): hard-coded date string — presumably the run date;
            # confirm whether this should use datetime.now() instead.
            osm_note = f"\nOpenStreetMap enrichment (2025-11-06): {osm_id}. {', '.join(improvements[:3])}."
            institution['provenance']['notes'] = (notes + osm_note).strip()

        print(f" ✅ Enriched: {', '.join(improvements[:3])}")

    processed_count += 1

    # Save progress every BATCH_SIZE processed institutions: rewrite the whole
    # YAML file with a commented header carrying the running stats.
    if processed_count % BATCH_SIZE == 0:
        batch_num += 1
        print(f"\n💾 Saving batch {batch_num}...")

        with open(enriched_file, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
            f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Batch: {batch_num}\n#\n")
            f.write("# OpenStreetMap Enrichment Summary:\n")
            for key, value in stats.items():
                f.write(f"# - {key}: {value}\n")
            f.write("\n")
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

        print(f"✅ Saved. Enriched: {stats['institutions_enriched']}\n")

    # Rate limiting — be polite to the public Overpass endpoint.
    time.sleep(RATE_LIMIT_DELAY)
|
|
|
|
# Save final — flush any partial batch that the in-loop save did not cover.
if processed_count % BATCH_SIZE != 0:
    batch_num += 1
    print(f"\n💾 Saving final batch {batch_num}...")

    # Assemble the commented YAML header first, then write it in one go.
    header_lines = [
        "---\n",
        "# Latin American GLAM Institutions - OpenStreetMap Enriched\n",
        f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n",
        f"# Batch: {batch_num}\n#\n",
        "# OpenStreetMap Enrichment Summary:\n",
    ]
    header_lines.extend(f"# - {key}: {value}\n" for key, value in stats.items())
    header_lines.append("\n")

    with open(enriched_file, 'w', encoding='utf-8') as f:
        f.writelines(header_lines)
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    print(f"✅ Saved!\n")
|
|
|
|
# Report — console summary of this resumed run's counters.
separator = "=" * 70
report_lines = [
    "\n" + separator,
    "OPENSTREETMAP ENRICHMENT REPORT (RESUMED)",
    separator,
    f"\nProcessed: institutions {START_INDEX + 1} to {len(institutions)}",
    f"OSM IDs found: {stats['osm_ids_found']}",
    f"Records fetched: {stats['osm_records_fetched']}",
    f"Fetch errors: {stats['osm_fetch_errors']}",
    "\nEnrichments:",
    f" Coordinates: {stats['coordinates_upgraded']}",
    f" Addresses: {stats['addresses_improved']}",
    f" Contact: {stats['contact_info_added']}",
    f" Hours: {stats['opening_hours_added']}",
    f" Websites: {stats['websites_added']}",
    f" Alt names: {stats['alternative_names_added']}",
    f" Total enriched: {stats['institutions_enriched']}",
    "\n" + separator + "\n",
]
for report_line in report_lines:
    print(report_line)
|