glam/scripts/geocode_mexican_institutions.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

519 lines
19 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Geocode Mexican Heritage Institutions using Nominatim API

Takes mexican_institutions_curated.yaml as input and enriches location data with:
- City names
- Latitude/longitude coordinates
- OpenStreetMap identifiers

Respects Nominatim usage policy: 1 request/second, caching, descriptive User-Agent.
"""
import yaml
import requests
import time
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
import sys

# Configuration — paths are relative to the working directory the script runs from.
INPUT_FILE = Path("data/instances/mexican_institutions_curated.yaml")    # curated source data
OUTPUT_FILE = Path("data/instances/mexican_institutions_geocoded.yaml")  # enriched output
REPORT_FILE = Path("data/instances/mexican_geocoding_report.md")         # markdown run report
CACHE_FILE = Path("data/instances/.geocoding_cache_mexico.yaml")         # persistent query cache
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
# A descriptive User-Agent is mandatory under the Nominatim usage policy.
USER_AGENT = "GLAM-Heritage-Data-Project/1.0 (https://github.com/cultural-heritage/glam-extractor)"
REQUEST_DELAY = 1.1  # Seconds between requests (Nominatim requires 1/sec max)
@dataclass
class GeocodingResult:
    """Result from Nominatim geocoding.

    Every field except ``confidence`` may be None when Nominatim omits it.
    """
    city: Optional[str]           # settlement name resolved from the address block
    latitude: Optional[float]     # parsed from Nominatim 'lat' (decimal)
    longitude: Optional[float]    # parsed from Nominatim 'lon' (decimal)
    osm_type: Optional[str]       # OSM element type as returned by Nominatim (e.g. 'node', 'way')
    osm_id: Optional[str]         # NOTE(review): Nominatim returns osm_id as an int; stored as-is — confirm
    display_name: Optional[str]   # full human-readable address string from Nominatim
    confidence: float             # fixed at 0.8 when parsed from an API hit
class GeocodingCache:
    """YAML-backed cache of geocoding lookups, keyed by query string.

    The whole cache is persisted after every insertion so a long run that
    crashes mid-way loses at most the in-flight lookup.
    """

    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.load()

    def load(self):
        """Populate the in-memory cache from disk, if the cache file exists."""
        if not self.cache_file.exists():
            return
        with open(self.cache_file, 'r', encoding='utf-8') as fh:
            loaded = yaml.safe_load(fh)
        self.cache = loaded or {}
        print(f"✓ Loaded {len(self.cache)} cached geocoding results")

    def save(self):
        """Write the entire in-memory cache back to disk."""
        with open(self.cache_file, 'w', encoding='utf-8') as fh:
            yaml.dump(self.cache, fh, allow_unicode=True, default_flow_style=False)

    def get(self, query: str) -> Optional[Dict[str, Any]]:
        """Return the cached entry for *query*, or None when absent."""
        return self.cache.get(query)

    def put(self, query: str, result: Dict[str, Any]):
        """Record *result* under *query* and persist immediately."""
        self.cache[query] = result
        self.save()
class MexicanGeocoder:
    """Geocode Mexican institutions using Nominatim API.

    Wraps cached, rate-limited lookups against the public Nominatim search
    endpoint and keeps run counters (``stats``) that generate_report() reads.
    """

    def __init__(self, cache: GeocodingCache):
        self.cache = cache
        # Run counters, updated here and by enrich_institution().
        self.stats = {
            'total': 0,             # institutions processed
            'cached': 0,            # lookups answered from the cache
            'api_calls': 0,         # HTTP requests attempted against Nominatim
            'geocoded': 0,          # newly geocoded this run
            'failed': 0,            # no usable result after all strategies
            'already_geocoded': 0   # input records that already had coordinates
        }

    def geocode_institution(self, name: str, region: str) -> Optional[GeocodingResult]:
        """
        Geocode an institution by name and region.

        Uses fallback strategies with progressively simplified queries if the
        initial search fails. Successes and exhausted failures are cached under
        the *first* query string so re-runs skip the API entirely.

        Args:
            name: Institution name
            region: Mexican state/region name

        Returns:
            GeocodingResult if successful, None otherwise
        """
        queries = self._build_fallback_queries(name, region)
        for i, query in enumerate(queries):
            # Cache first: a hit (success or recorded failure) ends the search.
            cached = self.cache.get(query)
            if cached:
                self.stats['cached'] += 1
                if i == 0:
                    print(f" [CACHE] {name[:60]}")
                else:
                    print(f" [CACHE-FALLBACK-{i}] {query[:60]}")
                return self._dict_to_result(cached)
            # Cache miss: make an API request.
            if i == 0:
                print(f" [API] {name[:60]}")
            else:
                print(f" [API-FALLBACK-{i}] {query[:60]}")
            params = {
                'q': query,
                'format': 'json',
                'limit': 1,
                'addressdetails': 1  # needed so the city can be extracted
            }
            headers = {
                'User-Agent': USER_AGENT  # required by Nominatim usage policy
            }
            try:
                try:
                    response = requests.get(NOMINATIM_URL, params=params, headers=headers, timeout=10)
                    # BUGFIX: count the call as soon as the request completes, so
                    # HTTP-error responses are also reflected in the stats.
                    self.stats['api_calls'] += 1
                    response.raise_for_status()
                finally:
                    # BUGFIX: sleep on *every* attempt, including failures —
                    # previously the error path skipped the delay, so fallback
                    # retries could exceed Nominatim's 1 req/sec policy.
                    time.sleep(REQUEST_DELAY)
                results = response.json()
                if not results:
                    if i < len(queries) - 1:
                        print(f" ⚠ No results, trying fallback...")
                        continue  # Try next fallback
                    else:
                        print(f" ⚠ No results found (all strategies exhausted)")
                        self.cache.put(queries[0], {'found': False})
                        return None
                # Success! Cache under original query for future lookups
                result = self._parse_nominatim_result(results[0])
                if result:
                    result_dict = self._result_to_dict(result)
                    self.cache.put(queries[0], result_dict)  # Cache under original query
                    if i > 0:
                        print(f" ✓ Found via fallback strategy {i}")
                    return result
            except Exception as e:
                # Broad catch is deliberate: one bad strategy (network error,
                # malformed JSON, cache I/O) must not abort the whole batch run.
                print(f" ✗ Error: {e}")
                if i == len(queries) - 1:
                    self.cache.put(queries[0], {'found': False, 'error': str(e)})
                continue  # Try next fallback
        return None

    def _build_fallback_queries(self, name: str, region: str) -> List[str]:
        """
        Build a list of fallback queries with progressively simplified names.

        Strategy (adapted for Mexican institutions):
        1. Full name + region + Mexico
        2. Remove parenthetical content + region + Mexico
        3. Extract museo/archivo/biblioteca name + region + Mexico
        4. Generic institution type + region + Mexico (last resort)
        """
        queries = [f"{name}, {region}, Mexico"]  # Strategy 1: full name
        # Strategy 2: remove parenthetical content (acronyms, etc.)
        clean_name = re.sub(r'\s*\([^)]*\)', '', name).strip()
        if clean_name != name:
            queries.append(f"{clean_name}, {region}, Mexico")
        # Strategy 3: extract the most distinctive museum/archive words, e.g.
        #   "Museo Nacional de..."    -> "Museo Nacional"
        #   "Archivo Histórico de..." -> "Archivo Histórico"
        #   INAH institutions         -> strip "INAH" prefix, keep distinctive part
        if any(keyword in name for keyword in ['Museo', 'Archivo', 'Biblioteca', 'Centro', 'Instituto']):
            words = name.split()
            # Handle INAH institutions (Instituto Nacional de Antropología e Historia)
            if 'INAH' in name or 'Instituto Nacional' in name:
                distinctive = re.sub(r'INAH\s*-?\s*', '', name)
                distinctive = re.sub(r'Instituto Nacional de\s+', '', distinctive)
                distinctive = re.sub(r'\s*\([^)]*\)', '', distinctive).strip()
                if distinctive != name:
                    queries.append(f"{distinctive}, {region}, Mexico")
            # Handle "Museo [Type] de [Location]" patterns
            for inst_type in ['Museo', 'Archivo', 'Biblioteca']:
                if inst_type in words:
                    idx = words.index(inst_type)
                    # Institution type + last few significant words
                    distinctive = ' '.join(words[idx:idx+1] + words[-3:])
                    distinctive = re.sub(r'\s*\([^)]*\)', '', distinctive).strip()
                    if len(distinctive) > len(inst_type) + 5:  # Has meaningful content
                        queries.append(f"{distinctive}, {region}, Mexico")
                    break
        # Strategy 4: generic institution type + region (gives region centre at best)
        for inst_type in ['Museo', 'Archivo', 'Biblioteca', 'Centro Cultural', 'Universidad', 'Secretaría']:
            if inst_type in name:
                queries.append(f"{inst_type}, {region}, Mexico")
                break
        # De-duplicate while preserving order (dicts preserve insertion order).
        return list(dict.fromkeys(queries))

    def _parse_nominatim_result(self, result: Dict[str, Any]) -> Optional[GeocodingResult]:
        """Parse a raw Nominatim JSON hit into a GeocodingResult (None on parse error)."""
        try:
            address = result.get('address', {})
            # City fallback chain: Nominatim uses different keys per place type.
            city = (
                address.get('city') or
                address.get('town') or
                address.get('municipality') or
                address.get('village') or
                address.get('county') or
                address.get('state')  # For Mexican states, sometimes city isn't available
            )
            raw_osm_id = result.get('osm_id')
            geocoding_result = GeocodingResult(
                city=city,
                latitude=float(result['lat']),
                longitude=float(result['lon']),
                osm_type=result.get('osm_type'),
                # BUGFIX: Nominatim returns osm_id as an integer; cast to str to
                # honour the GeocodingResult.osm_id: Optional[str] annotation.
                osm_id=str(raw_osm_id) if raw_osm_id is not None else None,
                display_name=result.get('display_name'),
                confidence=0.8  # Medium confidence for geocoded data
            )
            if city:
                print(f"{city} ({geocoding_result.latitude:.4f}, {geocoding_result.longitude:.4f})")
            else:
                print(f" ⚠ No city found ({geocoding_result.latitude:.4f}, {geocoding_result.longitude:.4f})")
            return geocoding_result
        except Exception as e:
            print(f" ✗ Parse error: {e}")
            return None

    def _result_to_dict(self, result: GeocodingResult) -> Dict[str, Any]:
        """Convert GeocodingResult to a plain dict for YAML caching."""
        return {
            'found': True,
            'city': result.city,
            'latitude': result.latitude,
            'longitude': result.longitude,
            'osm_type': result.osm_type,
            'osm_id': result.osm_id,
            'display_name': result.display_name,
            'confidence': result.confidence
        }

    def _dict_to_result(self, data: Dict[str, Any]) -> Optional[GeocodingResult]:
        """Convert a cached dict back to GeocodingResult (None for cached failures)."""
        if not data.get('found', False):
            return None
        return GeocodingResult(
            city=data.get('city'),
            latitude=data.get('latitude'),
            longitude=data.get('longitude'),
            osm_type=data.get('osm_type'),
            osm_id=data.get('osm_id'),
            display_name=data.get('display_name'),
            confidence=data.get('confidence', 0.8)
        )
def load_institutions(filepath: Path) -> List[Dict[str, Any]]:
    """Load the institution list from a YAML file.

    Args:
        filepath: path to the curated institutions YAML.

    Returns:
        The parsed list of institution dicts. BUGFIX: returns [] for an
        empty file — yaml.safe_load yields None there, which previously
        crashed callers that call len() on / iterate the result.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    return institutions or []
def save_institutions(institutions: List[Dict[str, Any]], filepath: Path):
    """Write *institutions* to *filepath* as YAML, preceded by a commented header."""
    # Hand-written document start + comment header (yaml.dump cannot emit comments).
    header = [
        "---\n",
        "# Mexican GLAM Institutions - Geocoded Edition\n",
        f"# Geocoding date: {datetime.now(timezone.utc).isoformat()}\n",
        f"# Total institutions: {len(institutions)}\n",
        "\n",
    ]
    body = yaml.dump(institutions, allow_unicode=True, default_flow_style=False, sort_keys=False)
    # Drop a leading document marker from yaml.dump, if present, since the
    # header already opens the document with '---'.
    if body.startswith('---\n'):
        body = body[4:]
    with open(filepath, 'w', encoding='utf-8') as out:
        out.writelines(header)
        out.write(body)
def enrich_institution(institution: Dict[str, Any], geocoder: MexicanGeocoder) -> Dict[str, Any]:
    """
    Enrich a single institution with geocoding data (mutates in place).

    Skips records without locations/region and records that already carry
    coordinates; otherwise geocodes via *geocoder* and updates the first
    location entry, the provenance block, and the identifier list. Every
    path increments the matching counter in ``geocoder.stats``.

    Args:
        institution: one institution record from the curated YAML.
        geocoder: shared MexicanGeocoder (cache + stats).

    Returns:
        The same institution dict, possibly updated.
    """
    geocoder.stats['total'] += 1
    name = institution.get('name', 'Unknown')
    locations = institution.get('locations', [])
    if not locations:
        print(f"⚠ No location data for: {name}")
        geocoder.stats['failed'] += 1
        return institution
    location = locations[0]  # Only the first location is ever geocoded
    # Already geocoded? BUGFIX: compare coordinates against None instead of
    # relying on truthiness, so a legitimate 0.0 would not read as "missing".
    if (location.get('city')
            and location.get('latitude') is not None
            and location.get('longitude') is not None):
        print(f"✓ Already geocoded: {name} ({location.get('city')})")
        geocoder.stats['already_geocoded'] += 1
        return institution
    region = location.get('region')
    if not region:
        print(f"⚠ No region data for: {name}")
        geocoder.stats['failed'] += 1
        return institution
    # Geocode
    print(f"\n[{geocoder.stats['total']}] Geocoding: {name}")
    result = geocoder.geocode_institution(name, region)
    if not result:
        geocoder.stats['failed'] += 1
        return institution
    # Apply the geocoding result to the first location entry.
    if result.city:
        location['city'] = result.city
    location['latitude'] = result.latitude
    location['longitude'] = result.longitude
    # Record the geocoding step in provenance and bump confidence slightly,
    # capped at 0.95 — geocoding adds evidence, not certainty.
    if 'provenance' in institution:
        old_method = institution['provenance'].get('extraction_method', '')
        institution['provenance']['extraction_method'] = f"{old_method} + Nominatim geocoding"
        institution['provenance']['confidence_score'] = min(
            institution['provenance'].get('confidence_score', 0.85) + 0.05,
            0.95
        )
    # Attach an OpenStreetMap identifier unless one is already present.
    if result.osm_type and result.osm_id:
        osm_identifier = {
            'identifier_scheme': 'OpenStreetMap',
            'identifier_value': f"{result.osm_type}/{result.osm_id}",
            'identifier_url': f"https://www.openstreetmap.org/{result.osm_type}/{result.osm_id}"
        }
        identifiers = institution.setdefault('identifiers', [])
        # IDIOM FIX: loop variable renamed from `id` (shadowed the builtin).
        has_osm = any(
            ident.get('identifier_scheme') == 'OpenStreetMap'
            for ident in identifiers
        )
        if not has_osm:
            identifiers.append(osm_identifier)
    geocoder.stats['geocoded'] += 1
    return institution
def generate_report(stats: Dict[str, int], output_file: Path, report_file: Path):
    """Generate geocoding report.

    Renders the run counters into markdown, writes it to *report_file*,
    and returns the full report text (main() prints it as well).

    Args:
        stats: MexicanGeocoder.stats counters for this run.
        output_file: path of the geocoded YAML file (referenced in the text only).
        report_file: destination path for the markdown report.

    Returns:
        The report text as a single string.
    """
    total = stats['total']
    geocoded = stats['geocoded']
    already_geocoded = stats['already_geocoded']
    failed = stats['failed']
    api_calls = stats['api_calls']
    cached = stats['cached']
    # Coverage counts newly geocoded records plus those that arrived with coords.
    total_with_coords = geocoded + already_geocoded
    coverage_pct = (total_with_coords / total * 100) if total > 0 else 0
    report = f"""# Mexican Institutions Geocoding Report
**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
## Summary
- **Total institutions**: {total}
- **Successfully geocoded**: {geocoded}
- **Already geocoded**: {already_geocoded}
- **Failed to geocode**: {failed}
- **Total with coordinates**: {total_with_coords} ({coverage_pct:.1f}%)
## API Usage
- **Nominatim API calls**: {api_calls}
- **Cache hits**: {cached}
- **Cache efficiency**: {(cached / (api_calls + cached) * 100) if (api_calls + cached) > 0 else 0:.1f}%
## Target Achievement
- **Target coverage**: 60% (70+ institutions)
- **Actual coverage**: {coverage_pct:.1f}% ({total_with_coords} institutions)
- **Status**: {'✓ TARGET MET' if coverage_pct >= 60 else '✗ Below target'}
## Output Files
- **Geocoded YAML**: `{output_file}`
- **Cache file**: `{CACHE_FILE}`
- **This report**: `{report_file}`
## Next Steps
{'- ✓ Mexican geocoding complete!' if coverage_pct >= 60 else '- Review failed geocoding attempts and retry with refined queries'}
- Combine geocoded data for Brazil (97 institutions), Chile (90 institutions), and Mexico ({total} institutions)
- Final deliverable: {97 + 90 + total} institutions with comprehensive geocoding
---
*Geocoding performed using Nominatim API with 1 req/sec rate limit*
"""
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"\n✓ Report saved to: {report_file}")
    return report
def main():
    """Main geocoding workflow.

    Pipeline: load the curated YAML -> geocode each institution (cached,
    rate-limited) -> save the enriched YAML -> write and print the report.

    Returns:
        0 on completion (exits with 1 earlier if the input file is missing).
    """
    print("=" * 80)
    print("Mexican Heritage Institutions Geocoding")
    print("=" * 80)
    # Check input file exists
    if not INPUT_FILE.exists():
        print(f"✗ Input file not found: {INPUT_FILE}")
        sys.exit(1)
    # Load institutions
    print(f"\n1. Loading institutions from: {INPUT_FILE}")
    institutions = load_institutions(INPUT_FILE)
    print(f" ✓ Loaded {len(institutions)} institutions")
    # Initialize geocoder with the persistent cache (survives across runs)
    print(f"\n2. Initializing geocoder with cache: {CACHE_FILE}")
    cache = GeocodingCache(CACHE_FILE)
    geocoder = MexicanGeocoder(cache)
    # Enrich institutions one by one; each call updates geocoder.stats
    print(f"\n3. Geocoding institutions...")
    print(" (This may take several minutes due to 1 req/sec rate limit)\n")
    enriched = []
    for institution in institutions:
        enriched_inst = enrich_institution(institution, geocoder)
        enriched.append(enriched_inst)
    # Save results
    print(f"\n4. Saving geocoded institutions to: {OUTPUT_FILE}")
    save_institutions(enriched, OUTPUT_FILE)
    print(f" ✓ Saved {len(enriched)} institutions")
    # Generate report (written to REPORT_FILE and echoed to stdout)
    print(f"\n5. Generating report...")
    report = generate_report(geocoder.stats, OUTPUT_FILE, REPORT_FILE)
    print("\n" + "=" * 80)
    print(report)
    print("=" * 80)
    # Summary: coverage recomputed the same way generate_report() does
    total_with_coords = geocoder.stats['geocoded'] + geocoder.stats['already_geocoded']
    coverage_pct = (total_with_coords / geocoder.stats['total'] * 100) if geocoder.stats['total'] > 0 else 0
    if coverage_pct >= 60:
        print("\n✓ SUCCESS: Mexican geocoding complete!")
        print(f" Achieved {coverage_pct:.1f}% coverage (target: 60%)")
    else:
        print(f"\n⚠ WARNING: Coverage below target ({coverage_pct:.1f}% < 60%)")
        print(" Review failed attempts and consider manual geocoding")
    # NOTE(review): returns 0 even when coverage is below target — the exit
    # code signals "ran to completion", not "target met". Confirm intended.
    return 0

if __name__ == '__main__':
    sys.exit(main())