- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
537 lines · 19 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Geocode Global Heritage Institutions
|
|
|
|
This script geocodes heritage institutions from the global dataset using the Nominatim API.
|
|
|
|
Features:
|
|
- Persistent SQLite cache (preserves geocoding across runs)
|
|
- Rate limiting (1 request/second for Nominatim)
|
|
- Progress tracking and resume capability
|
|
- Country-specific query optimization
|
|
- Error handling and retry logic
|
|
- Detailed logging and statistics
|
|
|
|
Usage:
|
|
python scripts/geocode_global_institutions.py [--dry-run] [--limit N] [--country CODE]
|
|
|
|
Options:
|
|
--dry-run Show what would be geocoded without making API calls
|
|
--limit N Only geocode first N institutions (for testing)
|
|
--country CODE Only geocode institutions from specific country (e.g., JP, NL, MX)
|
|
--force Re-geocode institutions that already have coordinates
|
|
--verbose Show detailed progress for each institution
|
|
"""
|
|
|
|
import argparse
|
|
import sqlite3
|
|
import time
|
|
import yaml
|
|
import requests
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Tuple
|
|
import sys
|
|
|
|
|
|
class GeocodingCache:
    """Persistent SQLite cache for geocoding results.

    Stores both successful lookups and failures so repeated runs do not
    re-query the geocoding API for locations already attempted. Can be used
    as a context manager to guarantee the connection is closed.
    """

    def __init__(self, cache_file: Path):
        """Open (or create) the cache database at *cache_file*."""
        self.cache_file = cache_file
        self.conn = sqlite3.connect(cache_file)
        self._initialize_cache()

    def _initialize_cache(self):
        """Create cache table if it doesn't exist."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS geocoding_cache (
                query TEXT PRIMARY KEY,
                latitude REAL,
                longitude REAL,
                geonames_id INTEGER,
                display_name TEXT,
                timestamp TEXT,
                success INTEGER
            )
        """)
        self.conn.commit()

    def status(self, query: str) -> str:
        """Classify *query*: 'hit' (cached success), 'failed' (cached failure), or 'miss'.

        Unlike get(), this distinguishes a cached failure from a cache miss,
        letting callers skip re-querying the API for queries that already
        failed (get() returns None in both cases).
        """
        cursor = self.conn.execute(
            "SELECT success FROM geocoding_cache WHERE query = ?",
            (query,)
        )
        row = cursor.fetchone()
        if row is None:
            return 'miss'
        return 'hit' if row[0] else 'failed'

    def get(self, query: str) -> Optional[Dict]:
        """Retrieve cached geocoding result.

        Returns the result dict for a cached success; returns None for both
        a cached failure and a cache miss (use status() to tell them apart).
        """
        cursor = self.conn.execute(
            "SELECT latitude, longitude, geonames_id, display_name, success FROM geocoding_cache WHERE query = ?",
            (query,)
        )
        row = cursor.fetchone()
        if row and row[4]:  # present AND marked successful
            return {
                'latitude': row[0],
                'longitude': row[1],
                'geonames_id': row[2],
                'display_name': row[3]
            }
        return None  # cached failure or not in cache

    def put(self, query: str, result: Optional[Dict]):
        """Store geocoding result in cache; pass None to record a failure."""
        if result:
            self.conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                (query, latitude, longitude, geonames_id, display_name, timestamp, success)
                VALUES (?, ?, ?, ?, ?, ?, 1)
            """, (
                query,
                result.get('latitude'),
                result.get('longitude'),
                result.get('geonames_id'),
                result.get('display_name'),
                datetime.now(timezone.utc).isoformat()
            ))
        else:
            # Cache failure to avoid retrying
            self.conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                (query, latitude, longitude, geonames_id, display_name, timestamp, success)
                VALUES (?, NULL, NULL, NULL, NULL, ?, 0)
            """, (query, datetime.now(timezone.utc).isoformat()))
        self.conn.commit()

    def stats(self) -> Dict[str, int]:
        """Get cache statistics (total, successful, and failed query counts)."""
        cursor = self.conn.execute("SELECT COUNT(*), SUM(success) FROM geocoding_cache")
        total, successful = cursor.fetchone()
        # SUM() is NULL on an empty table, hence the `or 0` guards.
        return {
            'total_queries': total or 0,
            'successful': successful or 0,
            'failed': (total or 0) - (successful or 0)
        }

    def close(self):
        """Close database connection."""
        self.conn.close()

    def __enter__(self) -> 'GeocodingCache':
        """Context-manager entry: returns the cache itself."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context-manager exit: always close the connection."""
        self.close()
|
|
|
|
|
|
class GlobalGeocoder:
    """Geocode heritage institutions with caching and rate limiting.

    Wraps the Nominatim search API with a persistent GeocodingCache so
    repeated runs skip locations already resolved (or already known to
    fail), and throttles requests to the configured rate limit.
    """

    NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
    # Nominatim's usage policy requires an identifying User-Agent.
    USER_AGENT = "GLAM-Heritage-Data-Extraction/1.0 (https://github.com/cultural-heritage/glam-extractor)"

    def __init__(self, cache_file: Path, rate_limit: float = 1.0):
        """
        Initialize geocoder.

        Args:
            cache_file: Path to SQLite cache database
            rate_limit: Minimum seconds between API calls
        """
        self.cache = GeocodingCache(cache_file)
        self.rate_limit = rate_limit
        self.last_request_time = 0
        # Per-run counters (distinct from the persistent cache stats).
        self.stats = {
            'total': 0,
            'already_geocoded': 0,
            'cache_hits': 0,
            'api_calls': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0
        }

    def _wait_for_rate_limit(self):
        """Sleep as needed so consecutive API calls are >= rate_limit apart."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.rate_limit:
            time.sleep(self.rate_limit - elapsed)
        self.last_request_time = time.time()

    def _build_query(self, location: Dict, country: str) -> str:
        """
        Build geocoding query optimized for each country.

        Args:
            location: Location dict with city, region, country, etc.
            country: ISO 3166-1 alpha-2 country code, used as fallback when
                the location dict carries no 'country' key

        Returns:
            Query string for geocoding API; '' when no usable fields exist
        """
        city = location.get('city', '').strip()
        region = location.get('region', '').strip()
        country_code = location.get('country', country).strip()

        # Country-specific query optimization. A branch that does not
        # return falls through to the generic fallback below.
        if country_code == 'JP':
            # Japanese cities often include administrative level (e.g.,
            # "SAPPORO SHI KITA KU") - strip suffixes for better matching.
            if city:
                city_clean = city.replace(' SHI', '').replace(' KU', '').replace(' CHO', '').replace(' MURA', '')
                if region:
                    # Region is prefecture (e.g., "HOKKAIDO")
                    return f"{city_clean}, {region}, Japan"
                else:
                    return f"{city_clean}, Japan"

        elif country_code == 'NL':
            # Dutch addresses: prioritize postal code + city
            postal_code = location.get('postal_code', '').strip()
            if postal_code and city:
                return f"{postal_code}, {city}, Netherlands"
            elif city:
                return f"{city}, Netherlands"

        elif country_code == 'MX':
            # Mexican locations: city + state + country
            if city and region:
                return f"{city}, {region}, Mexico"
            elif city:
                return f"{city}, Mexico"

        elif country_code == 'BR':
            # Brazilian locations: city + state abbreviation + country
            if city and region:
                return f"{city}, {region}, Brazil"
            elif city:
                return f"{city}, Brazil"

        elif country_code == 'CL':
            # Chilean locations: city + region + country
            if city and region:
                return f"{city}, {region}, Chile"
            elif city:
                return f"{city}, Chile"

        # Generic fallback: join whatever parts are available
        parts = []
        if city:
            parts.append(city)
        if region:
            parts.append(region)
        if country_code:
            parts.append(country_code)

        return ', '.join(parts) if parts else ''

    def geocode_location(self, location: Dict, country: str, dry_run: bool = False) -> Optional[Dict]:
        """
        Geocode a single location.

        Args:
            location: Location dict from institution record
            country: ISO country code
            dry_run: If True, don't make API calls

        Returns:
            Dict with latitude, longitude, geonames_id (if available),
            or None on failure / cache-recorded failure / dry run
        """
        query = self._build_query(location, country)
        if not query:
            self.stats['skipped'] += 1
            return None

        # Check cache first. BUGFIX: GeocodingCache.get() returns None for
        # BOTH a cache miss and a cached failure, which previously caused
        # every cached failure to be retried against the API on every run.
        # Probe the cache table for presence so cached failures short-circuit.
        row = self.cache.conn.execute(
            "SELECT success FROM geocoding_cache WHERE query = ?",
            (query,)
        ).fetchone()
        if row is not None:
            self.stats['cache_hits'] += 1
            # For a cached failure get() yields None, which we propagate
            # without touching the API.
            return self.cache.get(query)

        if dry_run:
            print(f" [DRY RUN] Would geocode: {query}")
            return None

        # Make API call
        self._wait_for_rate_limit()
        self.stats['api_calls'] += 1

        try:
            params = {
                'q': query,
                'format': 'json',
                'limit': 1,
                'addressdetails': 1,
                'extratags': 1
            }
            headers = {
                'User-Agent': self.USER_AGENT
            }

            response = requests.get(
                self.NOMINATIM_URL,
                params=params,
                headers=headers,
                timeout=10
            )
            response.raise_for_status()

            results = response.json()

            if results and len(results) > 0:
                location_result = results[0]
                result = {
                    'latitude': float(location_result['lat']),
                    'longitude': float(location_result['lon']),
                    'display_name': location_result.get('display_name', '')
                }

                # Extract geonames_id if available
                extratags = location_result.get('extratags')
                if extratags and isinstance(extratags, dict) and 'geonames:id' in extratags:
                    result['geonames_id'] = int(extratags['geonames:id'])

                self.cache.put(query, result)
                self.stats['successful'] += 1
                return result
            else:
                # No result found - cache failure
                self.cache.put(query, None)
                self.stats['failed'] += 1
                return None

        except (requests.RequestException, ValueError, KeyError) as e:
            print(f" ⚠️ Geocoding error for '{query}': {e}")
            # Don't cache errors - allow retry later
            self.stats['failed'] += 1
            return None

    def geocode_institution(self, institution: Dict, dry_run: bool = False, force: bool = False, verbose: bool = False) -> bool:
        """
        Geocode all locations for an institution.

        Args:
            institution: Institution record
            dry_run: If True, don't make API calls or modify data
            force: If True, re-geocode institutions that already have coordinates
            verbose: If True, print detailed progress

        Returns:
            True if any location was updated (always False in dry-run mode)
        """
        self.stats['total'] += 1

        locations = institution.get('locations', [])
        if not locations:
            self.stats['skipped'] += 1
            return False

        # Country of the first location drives query optimization for all.
        country = locations[0].get('country', 'Unknown')
        name = institution.get('name', 'Unknown')

        updated = False
        for i, location in enumerate(locations):
            # Skip if already geocoded (unless --force)
            if not force and location.get('latitude') and location.get('longitude'):
                self.stats['already_geocoded'] += 1
                if verbose:
                    print(f" ✓ Location {i+1} already geocoded")
                continue

            if verbose:
                query = self._build_query(location, country)
                print(f" 🌍 Geocoding location {i+1}: {query}")

            result = self.geocode_location(location, country, dry_run=dry_run)

            if result and not dry_run:
                # Mutate the location dict in place; caller persists the data.
                location['latitude'] = result['latitude']
                location['longitude'] = result['longitude']
                if 'geonames_id' in result:
                    location['geonames_id'] = result['geonames_id']
                updated = True

                if verbose:
                    print(f" ✓ ({result['latitude']:.4f}, {result['longitude']:.4f})")
            elif verbose and not dry_run:
                print(f" ✗ Geocoding failed")

        return updated

    def print_stats(self):
        """Print geocoding statistics for this run plus persistent cache totals."""
        cache_stats = self.cache.stats()

        print("\n" + "=" * 80)
        print("GEOCODING STATISTICS")
        print("=" * 80)
        print(f"Institutions processed: {self.stats['total']:,}")
        print(f"Already geocoded (skipped): {self.stats['already_geocoded']:,}")
        print(f"Cache hits: {self.stats['cache_hits']:,}")
        print(f"API calls: {self.stats['api_calls']:,}")
        print(f"Successful geocoding: {self.stats['successful']:,}")
        print(f"Failed geocoding: {self.stats['failed']:,}")
        print(f"Skipped (no location): {self.stats['skipped']:,}")
        print(f"\nCache Statistics:")
        print(f"Total cached queries: {cache_stats['total_queries']:,}")
        print(f"Successful: {cache_stats['successful']:,}")
        print(f"Failed: {cache_stats['failed']:,}")
        print("=" * 80)

    def close(self):
        """Close resources."""
        self.cache.close()
|
|
|
|
|
|
def _save_dataset(path: Path, institutions: List[Dict]):
    """Write the full institutions list to *path* as YAML (one shared writer
    for both the periodic progress save and the final save)."""
    with open(path, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120
        )


def main():
    """CLI entry point: load the dataset, geocode, and persist the results."""
    parser = argparse.ArgumentParser(
        description="Geocode global heritage institutions using Nominatim API"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Show what would be geocoded without making API calls"
    )
    parser.add_argument(
        '--limit',
        type=int,
        help="Only geocode first N institutions (for testing)"
    )
    parser.add_argument(
        '--country',
        type=str,
        help="Only geocode institutions from specific country (e.g., JP, NL, MX)"
    )
    parser.add_argument(
        '--force',
        action='store_true',
        help="Re-geocode institutions that already have coordinates"
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help="Show detailed progress for each institution"
    )

    args = parser.parse_args()

    # Paths (relative to the repository root, one level above scripts/)
    base_dir = Path(__file__).parent.parent
    global_file = base_dir / 'data' / 'instances' / 'global' / 'global_heritage_institutions.yaml'
    cache_file = base_dir / 'data' / 'cache' / 'geocoding_cache.db'

    # Create cache directory if needed
    cache_file.parent.mkdir(parents=True, exist_ok=True)

    # Load global dataset
    print(f"Loading global dataset from {global_file}")
    with open(global_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"Loaded {len(institutions):,} institutions")

    # Filter by country if specified
    if args.country:
        institutions = [
            inst for inst in institutions
            if inst.get('locations') and inst['locations'][0].get('country') == args.country
        ]
        print(f"Filtered to {len(institutions):,} institutions in {args.country}")

    # Limit if specified (the full list is still what gets saved)
    institutions_to_process = institutions
    if args.limit:
        institutions_to_process = institutions[:args.limit]
        print(f"⚠️ Processing only first {len(institutions_to_process):,} institutions (--limit flag)")
        print(f" Full dataset of {len(institutions):,} institutions will be preserved")

    if args.dry_run:
        print("\n⚠️ DRY RUN MODE - No API calls or file modifications will be made")

    # Initialize geocoder
    print(f"Initializing geocoder with cache at {cache_file}")
    geocoder = GlobalGeocoder(cache_file)

    # Show cache stats
    cache_stats = geocoder.cache.stats()
    print(f"Cache contains {cache_stats['total_queries']:,} queries ({cache_stats['successful']:,} successful)")

    # Geocode institutions
    print(f"\nGeocoding institutions...")
    print("=" * 80)

    updated_count = 0
    start_time = time.time()

    try:
        for i, institution in enumerate(institutions_to_process, 1):
            name = institution.get('name', 'Unknown')
            ghcid = institution.get('ghcid', 'Unknown')
            country = institution.get('locations', [{}])[0].get('country', '??') if institution.get('locations') else '??'

            if args.verbose or i % 100 == 0 or i <= 10:
                print(f"\n[{i}/{len(institutions_to_process)}] {country} - {name}")
                print(f" GHCID: {ghcid}")

            updated = geocoder.geocode_institution(
                institution,
                dry_run=args.dry_run,
                force=args.force,
                verbose=args.verbose
            )

            if updated:
                updated_count += 1

            # Save progress every 100 institutions
            if not args.dry_run and i % 100 == 0 and updated_count > 0:
                print(f"\n💾 Saving progress at {i}/{len(institutions_to_process)} institutions...")
                # Save FULL dataset, not just processed subset
                _save_dataset(global_file, institutions)
                print(f"✅ Progress saved ({updated_count:,} institutions updated so far)")

            # Progress indicator (without newline)
            if not args.verbose and i % 10 == 0:
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                remaining = (len(institutions_to_process) - i) / rate if rate > 0 else 0
                print(f"\rProgress: {i}/{len(institutions_to_process)} ({i/len(institutions_to_process)*100:.1f}%) | "
                      f"Rate: {rate:.1f}/sec | ETA: {remaining/60:.1f} min", end='', flush=True)

        print()  # Newline after progress

        # Print statistics
        geocoder.print_stats()

        # Save updated dataset (final save)
        if not args.dry_run and updated_count > 0:
            output_file = global_file
            print(f"\n💾 Saving final results to {output_file}")
            _save_dataset(output_file, institutions)
            print(f"✅ Saved {len(institutions):,} institutions ({updated_count:,} updated with coordinates)")
        elif args.dry_run:
            print(f"\n⚠️ DRY RUN - No files were modified")
            print(f"Would have updated {updated_count:,} institutions")
        else:
            print(f"\n✓ No institutions needed updating")
    finally:
        # Always release the SQLite connection, even if geocoding aborts
        geocoder.close()

    # Final summary
    elapsed = time.time() - start_time
    print(f"\nTotal execution time: {elapsed/60:.1f} minutes")

    if geocoder.stats['api_calls'] > 0 and elapsed > 0:
        print(f"Average API call rate: {geocoder.stats['api_calls']/elapsed:.2f} requests/second")


if __name__ == '__main__':
    main()
|