glam/scripts/resume_osm_enrichment.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

313 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Resume OSM Enrichment - Process institutions 101-304
"""
import yaml
import requests
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from collections import defaultdict
import time
import re
# Configuration
# Overpass endpoint plus pacing/checkpoint knobs for this resumed run.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
OVERPASS_TIMEOUT = 30
RATE_LIMIT_DELAY = 3.0  # Increased to 3 seconds
MAX_RETRIES = 3
RETRY_DELAY = 10.0  # Increased to 10 seconds
BATCH_SIZE = 20
START_INDEX = 100  # Resume from institution 101 (0-indexed = 100)
print(f"Resuming OSM enrichment from institution {START_INDEX + 1}")
print(f"Rate limit: {RATE_LIMIT_DELAY}s between requests")
print(f"Batch size: {BATCH_SIZE} institutions\n")
# Load partially enriched dataset
base_dir = Path(__file__).parent.parent
enriched_file = base_dir / "data" / "instances" / "latin_american_institutions_osm_enriched.yaml"
print(f"Loading {enriched_file}")
content = enriched_file.read_text(encoding='utf-8')
# Skip header: the file starts with a "---"/comment banner separated from the
# YAML body by a blank line; fall back to the whole file if none is present.
_banner, _sep, _body = content.partition('\n\n')
yaml_content = _body if _sep else content
institutions = yaml.safe_load(yaml_content)
print(f"Loaded {len(institutions)} institutions")
print(f"Will process institutions {START_INDEX + 1} to {len(institutions)}\n")
# Stats
stats = defaultdict(int)  # every counter starts at 0 on first touch
stats['total_institutions'] = len(institutions)
details = []  # per-institution improvement records collected during the run
def fetch_osm_element(osm_id: str, retry_count: int = 0) -> Optional[Dict]:
    """Fetch a single OSM element via the Overpass API.

    Args:
        osm_id: OSM identifier in "type/id" form, e.g. "node/123" or
            "way/4567". Anything that does not match returns None immediately
            (no network call).
        retry_count: Internal retry counter; callers should leave it at 0.

    Returns:
        The first element dict of the Overpass response (the requested
        element itself, since ``out body`` precedes the recursed skeleton
        output), or None on bad input, missing element, exhausted retries,
        or any request error.
    """
    match = re.match(r'(node|way|relation)/(\d+)', osm_id)
    if not match:
        return None
    element_type, element_id = match.groups()
    # "out body" emits the element itself; ">" recurses down to member
    # nodes, emitted as a skeleton so way/relation geometry is resolvable.
    query = f"""
[out:json][timeout:{OVERPASS_TIMEOUT}];
{element_type}({element_id});
out body;
>;
out skel qt;
"""
    try:
        response = requests.post(
            OVERPASS_URL,
            data={'data': query},
            timeout=OVERPASS_TIMEOUT + 5
        )
        if response.status_code == 200:
            data = response.json()
            # FIX: the previous `data.get('elements', [None])[0]` raised
            # IndexError when Overpass returned an empty element list (e.g.
            # a deleted element); it was only rescued by the broad except
            # below, printing a misleading error. Handle it explicitly.
            elements = data.get('elements')
            return elements[0] if elements else None
        elif response.status_code == 429 and retry_count < MAX_RETRIES:
            # Overpass rate limiting: back off, then retry recursively.
            print(f" ⚠️ Rate limited. Waiting {RETRY_DELAY}s...")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        elif response.status_code in [502, 503, 504] and retry_count < MAX_RETRIES:
            # Transient gateway/overload errors: retry after a delay.
            print(f" ⚠️ Server error {response.status_code}. Retry {retry_count+1}/{MAX_RETRIES}")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        else:
            print(f" ❌ HTTP {response.status_code}")
            return None
    except requests.Timeout:
        if retry_count < MAX_RETRIES:
            print(f" ⚠️ Timeout. Retry {retry_count+1}/{MAX_RETRIES}")
            time.sleep(RETRY_DELAY)
            return fetch_osm_element(osm_id, retry_count + 1)
        return None
    except Exception as e:
        # Best-effort fetch: log and return None rather than aborting the run.
        print(f" ❌ Error: {e}")
        return None
def calculate_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in metres between two points.

    Args:
        lat1, lon1: First point in decimal degrees.
        lat2, lon2: Second point in decimal degrees.

    Returns:
        Distance in metres as a float.
    """
    from math import radians, sin, cos, sqrt, atan2
    R = 6371000  # mean Earth radius in metres
    phi1, phi2 = radians(lat1), radians(lat2)
    # BUG FIX: the old code rebound lat1/lat2 to radians first and then
    # applied radians() to their difference a second time, shrinking the
    # latitude delta by a factor of pi/180 (~57x too small). Compute both
    # deltas from the original degree inputs instead.
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlon / 2) ** 2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))
# Process institutions from START_INDEX onwards.
# Walks the tail of the dataset, enriching each institution dict in place from
# its OpenStreetMap record, and checkpoints the whole YAML file every
# BATCH_SIZE processed entries.
batch_num = 5 # Continue from batch 5 (batches 1-4 were written by the earlier run)
processed_count = 0  # counts institutions that had an OSM ID; drives batch saves
for idx in range(START_INDEX, len(institutions)):
    institution = institutions[idx]
    # Check for OSM ID; institutions without one are skipped entirely and do
    # NOT advance processed_count (so they never trigger a batch save).
    identifiers = institution.get('identifiers', [])
    osm_ids = [id for id in identifiers if id.get('identifier_scheme') == 'OpenStreetMap']
    if not osm_ids:
        continue
    stats['osm_ids_found'] += 1
    osm_id = osm_ids[0]['identifier_value']  # e.g. "node/123" or "way/456"
    inst_name = institution.get('name', 'Unknown')
    print(f"\n[{idx + 1}/{len(institutions)}] {inst_name}")
    print(f" OSM ID: {osm_id}")
    # Fetch OSM data; on failure still count the item and keep rate limiting.
    osm_data = fetch_osm_element(osm_id)
    if not osm_data:
        stats['osm_fetch_errors'] += 1
        processed_count += 1
        time.sleep(RATE_LIMIT_DELAY)
        continue
    stats['osm_records_fetched'] += 1
    tags = osm_data.get('tags', {})
    enriched = False   # flipped True by any successful enrichment below
    improvements = []  # human-readable notes for provenance and logging
    # 1. Coordinates
    # Only node elements carry top-level lat/lon; ways/relations skip this.
    if 'lat' in osm_data and 'lon' in osm_data:
        osm_lat, osm_lon = float(osm_data['lat']), float(osm_data['lon'])
        locations = institution.get('locations', [])
        if locations:
            if locations[0].get('latitude') and locations[0].get('longitude'):
                # Replace existing coordinates only when they disagree with
                # OSM by more than 100 m.
                distance = calculate_distance(
                    locations[0]['latitude'], locations[0]['longitude'],
                    osm_lat, osm_lon
                )
                if distance > 100:
                    locations[0]['latitude'] = osm_lat
                    locations[0]['longitude'] = osm_lon
                    enriched = True
                    stats['coordinates_upgraded'] += 1
                    improvements.append(f"Coordinates upgraded ({int(distance)}m)")
            else:
                # No coordinates recorded yet: adopt OSM's.
                locations[0]['latitude'] = osm_lat
                locations[0]['longitude'] = osm_lon
                enriched = True
                stats['coordinates_upgraded'] += 1
                improvements.append("Added coordinates")
    # 2. Address
    addr_street = tags.get('addr:street')
    addr_num = tags.get('addr:housenumber')
    addr_postcode = tags.get('addr:postcode')
    if (addr_street or addr_postcode) and institution.get('locations'):
        location = institution['locations'][0]
        if addr_street and addr_num:
            full_address = f"{addr_street} {addr_num}"
            # Prefer the longer (presumably more complete) street address.
            if not location.get('street_address') or len(full_address) > len(location.get('street_address', '')):
                location['street_address'] = full_address
                enriched = True
                stats['addresses_improved'] += 1
                improvements.append(f"Address: {full_address}")
        if addr_postcode and not location.get('postal_code'):
            location['postal_code'] = addr_postcode
            enriched = True
            improvements.append(f"Postcode: {addr_postcode}")
    # 3. Contact info — appended to the free-text description, once only
    # (the substring check keeps re-runs idempotent).
    phone = tags.get('phone') or tags.get('contact:phone')
    email = tags.get('email') or tags.get('contact:email')
    if phone or email:
        contact_info = []
        if phone:
            contact_info.append(f"Phone: {phone}")
        if email:
            contact_info.append(f"Email: {email}")
        contact_text = " | ".join(contact_info)
        current_desc = institution.get('description', '')
        if contact_text not in current_desc:
            institution['description'] = (current_desc + f"\n\nContact: {contact_text}").strip()
            enriched = True
            stats['contact_info_added'] += 1
            improvements.append(f"Contact: {contact_text}")
    # 4. Website — added as a new identifier only if none exists yet.
    website = tags.get('website') or tags.get('url') or tags.get('contact:website')
    if website:
        website_ids = [id for id in identifiers if id.get('identifier_scheme') == 'Website']
        if not website_ids:
            identifiers.append({
                'identifier_scheme': 'Website',
                'identifier_value': website,
                'identifier_url': website
            })
            institution['identifiers'] = identifiers
            enriched = True
            stats['websites_added'] += 1
            improvements.append(f"Website: {website}")
    # 5. Opening hours — also appended to the description, idempotently.
    opening_hours = tags.get('opening_hours')
    if opening_hours:
        current_desc = institution.get('description', '')
        hours_text = f"Opening hours: {opening_hours}"
        if hours_text not in current_desc:
            institution['description'] = (current_desc + f"\n\n{hours_text}").strip()
            enriched = True
            stats['opening_hours_added'] += 1
            improvements.append(f"Hours: {opening_hours}")
    # 6. Alternative names — collect name variants that differ from the
    # primary name and are not already recorded.
    alt_names = []
    for key in ['alt_name', 'official_name', 'name:en', 'name:es', 'name:pt']:
        if key in tags and tags[key] != institution.get('name'):
            alt_names.append(tags[key])
    if alt_names:
        existing = institution.get('alternative_names', [])
        new_names = [n for n in alt_names if n not in existing]
        if new_names:
            institution['alternative_names'] = existing + new_names
            enriched = True
            stats['alternative_names_added'] += len(new_names)
            improvements.append(f"{len(new_names)} alt names")
    # Update provenance with a short note (first three improvements only).
    if enriched:
        stats['institutions_enriched'] += 1
        details.append({'name': inst_name, 'osm_id': osm_id, 'improvements': improvements})
        if 'provenance' in institution:
            notes = institution['provenance'].get('notes', '')
            osm_note = f"\nOpenStreetMap enrichment (2025-11-06): {osm_id}. {', '.join(improvements[:3])}."
            institution['provenance']['notes'] = (notes + osm_note).strip()
        print(f" ✅ Enriched: {', '.join(improvements[:3])}")
    processed_count += 1
    # Save progress every BATCH_SIZE processed institutions: rewrite the
    # whole YAML file with a fresh comment header and running stats.
    if processed_count % BATCH_SIZE == 0:
        batch_num += 1
        print(f"\n💾 Saving batch {batch_num}...")
        with open(enriched_file, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write("# Latin American GLAM Institutions - OpenStreetMap Enriched\n")
            f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Batch: {batch_num}\n#\n")
            f.write("# OpenStreetMap Enrichment Summary:\n")
            for key, value in stats.items():
                f.write(f"# - {key}: {value}\n")
            f.write("\n")
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
        print(f"✅ Saved. Enriched: {stats['institutions_enriched']}\n")
    # Rate limiting between Overpass requests.
    time.sleep(RATE_LIMIT_DELAY)
# Save final
# Persist the remainder when the run did not end exactly on a batch boundary
# (an exact multiple was already written by the in-loop checkpoint).
if processed_count % BATCH_SIZE != 0:
    batch_num += 1
    print(f"\n💾 Saving final batch {batch_num}...")
    header_lines = [
        "---\n",
        "# Latin American GLAM Institutions - OpenStreetMap Enriched\n",
        f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n",
        f"# Batch: {batch_num}\n#\n",
        "# OpenStreetMap Enrichment Summary:\n",
    ]
    header_lines.extend(f"# - {key}: {value}\n" for key, value in stats.items())
    header_lines.append("\n")
    with open(enriched_file, 'w', encoding='utf-8') as out:
        out.writelines(header_lines)
        yaml.dump(institutions, out, allow_unicode=True, sort_keys=False)
    print(f"✅ Saved!\n")
# Report
# Final console summary for this resumed run, driven by the stats counters.
rule = "=" * 70
print("\n" + rule)
print("OPENSTREETMAP ENRICHMENT REPORT (RESUMED)")
print(rule)
print(f"\nProcessed: institutions {START_INDEX + 1} to {len(institutions)}")
for label, stat_key in (
    ("OSM IDs found", 'osm_ids_found'),
    ("Records fetched", 'osm_records_fetched'),
    ("Fetch errors", 'osm_fetch_errors'),
):
    print(f"{label}: {stats[stat_key]}")
print("\nEnrichments:")
for label, stat_key in (
    ("Coordinates", 'coordinates_upgraded'),
    ("Addresses", 'addresses_improved'),
    ("Contact", 'contact_info_added'),
    ("Hours", 'opening_hours_added'),
    ("Websites", 'websites_added'),
    ("Alt names", 'alternative_names_added'),
    ("Total enriched", 'institutions_enriched'),
):
    print(f" {label}: {stats[stat_key]}")
print("\n" + rule + "\n")