glam/scripts/geocode_global_institutions.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

537 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Geocode Global Heritage Institutions
This script geocodes heritage institutions from the global dataset using the Nominatim API.
Features:
- Persistent SQLite cache (preserves geocoding across runs)
- Rate limiting (1 request/second for Nominatim)
- Progress tracking and resume capability
- Country-specific query optimization
- Error handling and retry logic
- Detailed logging and statistics
Usage:
python scripts/geocode_global_institutions.py [--dry-run] [--limit N] [--country CODE]
Options:
--dry-run Show what would be geocoded without making API calls
--limit N Only geocode first N institutions (for testing)
--country CODE Only geocode institutions from specific country (e.g., JP, NL, MX)
--force Re-geocode institutions that already have coordinates
--verbose Show detailed progress for each institution
"""
import argparse
import sqlite3
import time
import yaml
import requests
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
import sys
class GeocodingCache:
    """Persistent SQLite cache for geocoding results.

    Stores both successful lookups and failures, so repeated runs can skip
    queries that are already known to have no result.
    """

    def __init__(self, cache_file: Path):
        """Open (creating if necessary) the cache database at *cache_file*."""
        self.cache_file = cache_file
        self.conn = sqlite3.connect(cache_file)
        self._initialize_cache()

    def _initialize_cache(self):
        """Create the cache table if it doesn't exist."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS geocoding_cache (
                query TEXT PRIMARY KEY,
                latitude REAL,
                longitude REAL,
                geonames_id INTEGER,
                display_name TEXT,
                timestamp TEXT,
                success INTEGER
            )
        """)
        self.conn.commit()

    def lookup(self, query: str) -> Tuple[bool, Optional[Dict]]:
        """Retrieve a cached entry, distinguishing failures from misses.

        Unlike :meth:`get`, this lets callers tell "never tried" apart from
        "tried and failed", which is required to honor cached failures.

        Returns:
            (found, result): ``found`` is True when *query* is present in the
            cache at all; ``result`` is the geocoding dict for successful
            entries, or None for cached failures (and for misses).
        """
        cursor = self.conn.execute(
            "SELECT latitude, longitude, geonames_id, display_name, success "
            "FROM geocoding_cache WHERE query = ?",
            (query,)
        )
        row = cursor.fetchone()
        if row is None:
            return False, None  # Not in cache
        if not row[4]:  # success flag is 0 -> cached failure
            return True, None
        return True, {
            'latitude': row[0],
            'longitude': row[1],
            'geonames_id': row[2],
            'display_name': row[3]
        }

    def get(self, query: str) -> Optional[Dict]:
        """Retrieve a cached geocoding result.

        NOTE: returns None both for cache misses and cached failures; use
        :meth:`lookup` when the caller needs to tell them apart.
        """
        _found, result = self.lookup(query)
        return result

    def put(self, query: str, result: Optional[Dict]):
        """Store a geocoding result; pass None to record a failure."""
        if result:
            self.conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                (query, latitude, longitude, geonames_id, display_name, timestamp, success)
                VALUES (?, ?, ?, ?, ?, ?, 1)
            """, (
                query,
                result.get('latitude'),
                result.get('longitude'),
                result.get('geonames_id'),
                result.get('display_name'),
                datetime.now(timezone.utc).isoformat()
            ))
        else:
            # Record the failure so future runs can avoid retrying it
            self.conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                (query, latitude, longitude, geonames_id, display_name, timestamp, success)
                VALUES (?, NULL, NULL, NULL, NULL, ?, 0)
            """, (query, datetime.now(timezone.utc).isoformat()))
        self.conn.commit()

    def stats(self) -> Dict[str, int]:
        """Return cache totals: total_queries, successful, failed."""
        cursor = self.conn.execute("SELECT COUNT(*), SUM(success) FROM geocoding_cache")
        total, successful = cursor.fetchone()
        # SUM() yields NULL on an empty table, hence the "or 0" guards
        return {
            'total_queries': total or 0,
            'successful': successful or 0,
            'failed': (total or 0) - (successful or 0)
        }

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()
class GlobalGeocoder:
    """Geocode heritage institutions with caching and rate limiting."""

    NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
    USER_AGENT = "GLAM-Heritage-Data-Extraction/1.0 (https://github.com/cultural-heritage/glam-extractor)"

    def __init__(self, cache_file: Path, rate_limit: float = 1.0):
        """
        Initialize geocoder.

        Args:
            cache_file: Path to SQLite cache database
            rate_limit: Minimum seconds between API calls (Nominatim policy: 1/s)
        """
        self.cache = GeocodingCache(cache_file)
        self.rate_limit = rate_limit
        self.last_request_time = 0.0
        self.stats = {
            'total': 0,
            'already_geocoded': 0,
            'cache_hits': 0,
            'api_calls': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0
        }

    def _wait_for_rate_limit(self):
        """Sleep as needed so consecutive API calls are >= rate_limit apart."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.rate_limit:
            time.sleep(self.rate_limit - elapsed)
        self.last_request_time = time.time()

    def _cache_lookup(self, query: str) -> Tuple[bool, Optional[Dict]]:
        """Check the cache, distinguishing a cached failure from a miss.

        GeocodingCache.get() returns None both for "never tried" and for
        "tried and failed", which would cause known-bad queries to be re-sent
        to the API on every run. Read the success flag directly from the
        cache table so recorded failures are honored.

        Returns:
            (found, result): found is True when the query is cached at all;
            result is the geocoding dict, or None for cached failures.
        """
        cursor = self.cache.conn.execute(
            "SELECT success FROM geocoding_cache WHERE query = ?", (query,)
        )
        row = cursor.fetchone()
        if row is None:
            return False, None
        if not row[0]:
            return True, None  # cached failure: do not retry
        return True, self.cache.get(query)

    def _build_query(self, location: Dict, country: str) -> str:
        """
        Build a geocoding query string optimized for each country.

        Args:
            location: Location dict with city, region, country, etc.
            country: ISO 3166-1 alpha-2 country code (fallback when the
                location dict has no 'country' of its own)

        Returns:
            Query string for the geocoding API ('' when nothing usable).
        """
        city = location.get('city', '').strip()
        region = location.get('region', '').strip()
        country_code = location.get('country', country).strip()

        if country_code == 'JP':
            # Japanese cities often embed administrative suffixes
            # (e.g., "SAPPORO SHI KITA KU"); strip them for better matching.
            if city:
                city_clean = city
                for suffix in (' SHI', ' KU', ' CHO', ' MURA'):
                    city_clean = city_clean.replace(suffix, '')
                if region:
                    # Region is the prefecture (e.g., "HOKKAIDO")
                    return f"{city_clean}, {region}, Japan"
                return f"{city_clean}, Japan"
        elif country_code == 'NL':
            # Dutch addresses: prioritize postal code + city
            postal_code = location.get('postal_code', '').strip()
            if postal_code and city:
                return f"{postal_code}, {city}, Netherlands"
            elif city:
                return f"{city}, Netherlands"
        elif country_code in ('MX', 'BR', 'CL'):
            # Latin American locations share one shape: city + region + country
            country_name = {'MX': 'Mexico', 'BR': 'Brazil', 'CL': 'Chile'}[country_code]
            if city and region:
                return f"{city}, {region}, {country_name}"
            elif city:
                return f"{city}, {country_name}"

        # Generic fallback: join whatever parts are present
        parts = [part for part in (city, region, country_code) if part]
        return ', '.join(parts)

    def geocode_location(self, location: Dict, country: str, dry_run: bool = False) -> Optional[Dict]:
        """
        Geocode a single location via Nominatim, consulting the cache first.

        Args:
            location: Location dict from institution record
            country: ISO country code
            dry_run: If True, don't make API calls

        Returns:
            Dict with latitude, longitude, display_name (plus geonames_id when
            Nominatim supplies one), or None on skip/failure.
        """
        query = self._build_query(location, country)
        if not query:
            self.stats['skipped'] += 1
            return None

        # Cache check: a recorded failure also counts as a hit, so known-bad
        # queries are never re-sent to the API.
        found, cached_result = self._cache_lookup(query)
        if found:
            self.stats['cache_hits'] += 1
            return cached_result

        if dry_run:
            print(f" [DRY RUN] Would geocode: {query}")
            return None

        self._wait_for_rate_limit()
        self.stats['api_calls'] += 1
        try:
            response = requests.get(
                self.NOMINATIM_URL,
                params={
                    'q': query,
                    'format': 'json',
                    'limit': 1,
                    'addressdetails': 1,
                    'extratags': 1
                },
                headers={'User-Agent': self.USER_AGENT},
                timeout=10
            )
            response.raise_for_status()
            results = response.json()

            if results:
                top_match = results[0]
                result = {
                    'latitude': float(top_match['lat']),
                    'longitude': float(top_match['lon']),
                    'display_name': top_match.get('display_name', '')
                }
                # Nominatim sometimes carries a GeoNames cross-reference
                extratags = top_match.get('extratags')
                if extratags and isinstance(extratags, dict) and 'geonames:id' in extratags:
                    result['geonames_id'] = int(extratags['geonames:id'])
                self.cache.put(query, result)
                self.stats['successful'] += 1
                return result

            # No match found: cache the failure so it is not retried next run
            self.cache.put(query, None)
            self.stats['failed'] += 1
            return None
        except (requests.RequestException, ValueError, KeyError) as e:
            print(f" ⚠️ Geocoding error for '{query}': {e}")
            # Transient errors are deliberately NOT cached - allow retry later
            self.stats['failed'] += 1
            return None

    def geocode_institution(self, institution: Dict, dry_run: bool = False, force: bool = False, verbose: bool = False) -> bool:
        """
        Geocode all locations for an institution (mutates location dicts).

        Args:
            institution: Institution record
            dry_run: If True, don't make API calls or modify data
            force: If True, re-geocode locations that already have coordinates
            verbose: If True, print detailed progress

        Returns:
            True if any location was updated.
        """
        self.stats['total'] += 1
        locations = institution.get('locations', [])
        if not locations:
            self.stats['skipped'] += 1
            return False

        country = locations[0].get('country', 'Unknown')
        updated = False
        for i, location in enumerate(locations):
            # Skip if already geocoded (unless --force). Compare against None,
            # not truthiness, so legitimate 0.0 coordinates are not redone.
            if (not force
                    and location.get('latitude') is not None
                    and location.get('longitude') is not None):
                self.stats['already_geocoded'] += 1
                if verbose:
                    print(f" ✓ Location {i+1} already geocoded")
                continue

            if verbose:
                print(f" 🌍 Geocoding location {i+1}: {self._build_query(location, country)}")

            result = self.geocode_location(location, country, dry_run=dry_run)
            if result and not dry_run:
                location['latitude'] = result['latitude']
                location['longitude'] = result['longitude']
                if 'geonames_id' in result:
                    location['geonames_id'] = result['geonames_id']
                updated = True
                if verbose:
                    print(f" ✓ ({result['latitude']:.4f}, {result['longitude']:.4f})")
            elif verbose and not dry_run:
                print(f" ✗ Geocoding failed")
        return updated

    def print_stats(self):
        """Print geocoding and cache statistics."""
        cache_stats = self.cache.stats()
        print("\n" + "=" * 80)
        print("GEOCODING STATISTICS")
        print("=" * 80)
        print(f"Institutions processed: {self.stats['total']:,}")
        print(f"Already geocoded (skipped): {self.stats['already_geocoded']:,}")
        print(f"Cache hits: {self.stats['cache_hits']:,}")
        print(f"API calls: {self.stats['api_calls']:,}")
        print(f"Successful geocoding: {self.stats['successful']:,}")
        print(f"Failed geocoding: {self.stats['failed']:,}")
        print(f"Skipped (no location): {self.stats['skipped']:,}")
        print(f"\nCache Statistics:")
        print(f"Total cached queries: {cache_stats['total_queries']:,}")
        print(f"Successful: {cache_stats['successful']:,}")
        print(f"Failed: {cache_stats['failed']:,}")
        print("=" * 80)

    def close(self):
        """Close resources (the underlying cache database)."""
        self.cache.close()
def _parse_args() -> argparse.Namespace:
    """Parse and return the command-line options."""
    parser = argparse.ArgumentParser(
        description="Geocode global heritage institutions using Nominatim API"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Show what would be geocoded without making API calls"
    )
    parser.add_argument(
        '--limit',
        type=int,
        help="Only geocode first N institutions (for testing)"
    )
    parser.add_argument(
        '--country',
        type=str,
        help="Only geocode institutions from specific country (e.g., JP, NL, MX)"
    )
    parser.add_argument(
        '--force',
        action='store_true',
        help="Re-geocode institutions that already have coordinates"
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help="Show detailed progress for each institution"
    )
    return parser.parse_args()


def _save_dataset(path: Path, institutions: List[Dict]):
    """Write the FULL institutions list back to *path* as YAML."""
    with open(path, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120
        )


def main():
    """Entry point: load the dataset, geocode institutions, save results."""
    args = _parse_args()

    # Paths relative to the repository root (scripts/ is one level below it)
    base_dir = Path(__file__).parent.parent
    global_file = base_dir / 'data' / 'instances' / 'global' / 'global_heritage_institutions.yaml'
    cache_file = base_dir / 'data' / 'cache' / 'geocoding_cache.db'
    cache_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"Loading global dataset from {global_file}")
    with open(global_file, 'r', encoding='utf-8') as f:
        # Guard: an empty YAML file parses to None, not []
        institutions = yaml.safe_load(f) or []
    print(f"Loaded {len(institutions):,} institutions")

    # Filter by country if specified (first location decides the country)
    if args.country:
        institutions = [
            inst for inst in institutions
            if inst.get('locations') and inst['locations'][0].get('country') == args.country
        ]
        print(f"Filtered to {len(institutions):,} institutions in {args.country}")

    # Limit the processed slice if specified; the full list is still saved
    institutions_to_process = institutions
    if args.limit:
        institutions_to_process = institutions[:args.limit]
        print(f"⚠️ Processing only first {len(institutions_to_process):,} institutions (--limit flag)")
        print(f" Full dataset of {len(institutions):,} institutions will be preserved")

    if args.dry_run:
        print("\n⚠️ DRY RUN MODE - No API calls or file modifications will be made")

    print(f"Initializing geocoder with cache at {cache_file}")
    geocoder = GlobalGeocoder(cache_file)
    cache_stats = geocoder.cache.stats()
    print(f"Cache contains {cache_stats['total_queries']:,} queries ({cache_stats['successful']:,} successful)")

    print(f"\nGeocoding institutions...")
    print("=" * 80)
    updated_count = 0
    start_time = time.time()
    total = len(institutions_to_process)

    try:
        for i, institution in enumerate(institutions_to_process, 1):
            name = institution.get('name', 'Unknown')
            ghcid = institution.get('ghcid', 'Unknown')
            locations = institution.get('locations')
            country = locations[0].get('country', '??') if locations else '??'

            if args.verbose or i % 100 == 0 or i <= 10:
                print(f"\n[{i}/{total}] {country} - {name}")
                print(f" GHCID: {ghcid}")

            if geocoder.geocode_institution(
                institution,
                dry_run=args.dry_run,
                force=args.force,
                verbose=args.verbose
            ):
                updated_count += 1

            # Periodic checkpoint so multi-hour runs survive crashes
            if not args.dry_run and i % 100 == 0 and updated_count > 0:
                print(f"\n💾 Saving progress at {i}/{total} institutions...")
                _save_dataset(global_file, institutions)  # full dataset, not the subset
                print(f"✅ Progress saved ({updated_count:,} institutions updated so far)")

            # Inline progress indicator (overwrites itself with \r)
            if not args.verbose and i % 10 == 0:
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                remaining = (total - i) / rate if rate > 0 else 0
                print(f"\rProgress: {i}/{total} ({i/total*100:.1f}%) | "
                      f"Rate: {rate:.1f}/sec | ETA: {remaining/60:.1f} min", end='', flush=True)
    except KeyboardInterrupt:
        # Fall through to the final save/stats so partial work isn't lost
        print("\n⚠️ Interrupted by user - saving progress and exiting")

    print()  # Newline after progress indicator
    geocoder.print_stats()

    # Final save (the periodic checkpoint may be up to 99 institutions behind)
    if not args.dry_run and updated_count > 0:
        print(f"\n💾 Saving final results to {global_file}")
        _save_dataset(global_file, institutions)
        print(f"✅ Saved {len(institutions):,} institutions ({updated_count:,} updated with coordinates)")
    elif args.dry_run:
        print(f"\n⚠️ DRY RUN - No files were modified")
        print(f"Would have updated {updated_count:,} institutions")
    else:
        print(f"\n✓ No institutions needed updating")

    geocoder.close()

    elapsed = time.time() - start_time
    print(f"\nTotal execution time: {elapsed/60:.1f} minutes")
    if geocoder.stats['api_calls'] > 0:
        print(f"Average API call rate: {geocoder.stats['api_calls']/elapsed:.2f} requests/second")


if __name__ == '__main__':
    main()