#!/usr/bin/env python3
"""
Geocode Chilean Heritage Institutions using Nominatim API

Takes chilean_institutions_curated.yaml as input and enriches location data with:
- City names
- Latitude/longitude coordinates
- OpenStreetMap identifiers

Respects Nominatim usage policy: 1 request/second, caching, descriptive User-Agent.
"""

import re
import sys
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any

import requests
import yaml

# Configuration
INPUT_FILE = Path("data/instances/chilean_institutions_curated.yaml")
OUTPUT_FILE = Path("data/instances/chilean_institutions_geocoded_v2.yaml")
REPORT_FILE = Path("data/instances/chilean_geocoding_report_v2.md")
CACHE_FILE = Path("data/instances/.geocoding_cache_chile.yaml")

NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
USER_AGENT = "GLAM-Heritage-Data-Project/1.0 (https://github.com/cultural-heritage/glam-extractor)"
REQUEST_DELAY = 1.1  # Seconds between requests (Nominatim requires 1/sec max)


@dataclass
class GeocodingResult:
    """Result from Nominatim geocoding"""
    city: Optional[str]
    latitude: Optional[float]
    longitude: Optional[float]
    osm_type: Optional[str]
    # NOTE(review): Nominatim's JSON output appears to carry osm_id as an
    # integer; it is only ever string-formatted here, so either type works.
    osm_id: Optional[int]
    display_name: Optional[str]
    confidence: float


class GeocodingCache:
    """Cache for geocoding results to avoid duplicate API calls.

    The cache is a mapping of query string -> result dict, persisted as YAML
    in ``cache_file`` and rewritten on every ``put``.
    """

    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.load()

    def load(self):
        """Load cache from file (no-op if the file does not exist)"""
        if not self.cache_file.exists():
            return

        with open(self.cache_file, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f) or {}

        # A truncated or hand-edited file may parse to a list/scalar; treating
        # that as a cache would crash later on .get(), so start empty instead.
        if not isinstance(loaded, dict):
            print(f"⚠ Ignoring malformed cache file: {self.cache_file}")
            loaded = {}

        self.cache = loaded
        print(f"✓ Loaded {len(self.cache)} cached geocoding results")

    def save(self):
        """Save cache to file.

        Writes to a sibling temporary file and renames it over the real cache
        so that an interrupted run cannot leave a half-written cache behind.
        """
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)
        tmp_file = self.cache_file.with_name(self.cache_file.name + '.tmp')
        with open(tmp_file, 'w', encoding='utf-8') as f:
            yaml.dump(self.cache, f, allow_unicode=True, default_flow_style=False)
        tmp_file.replace(self.cache_file)

    def get(self, query: str) -> Optional[Dict[str, Any]]:
        """Get cached result for query (None when the query is not cached)"""
        return self.cache.get(query)

    def put(self, query: str, result: Dict[str, Any]):
        """Store result in cache and persist the cache immediately"""
        self.cache[query] = result
        self.save()


class ChileanGeocoder:
    """Geocode Chilean institutions using Nominatim API"""

    def __init__(self, cache: GeocodingCache):
        self.cache = cache
        self.stats = {
            'total': 0,
            'cached': 0,
            'api_calls': 0,
            'geocoded': 0,
            'failed': 0,
            'already_geocoded': 0
        }

    def geocode_institution(self, name: str, region: str) -> Optional[GeocodingResult]:
        """
        Geocode an institution by name and region.

        Uses fallback strategies with simplified queries if initial search fails.
        Results (positive or negative) are cached under the first, most
        specific query only. Request/parse errors are treated as transient and
        are never cached, so a later run retries them.

        Args:
            name: Institution name
            region: Chilean region name

        Returns:
            GeocodingResult if successful, None otherwise
        """
        # Build search queries with fallback strategies
        queries = self._build_fallback_queries(name, region)
        primary_query = queries[0]
        last_index = len(queries) - 1
        transient_failure = False

        for i, query in enumerate(queries):
            # Check cache first. Entries carrying an 'error' key were written
            # after a request failure and say nothing about whether the place
            # exists, so they are ignored and the query is retried.
            cached = self.cache.get(query)
            if cached and 'error' not in cached:
                self.stats['cached'] += 1
                if i == 0:
                    print(f" [CACHE] {name[:60]}")
                else:
                    print(f" [CACHE-FALLBACK-{i}] {query[:60]}")
                hit = self._dict_to_result(cached)
                if hit is not None or i == 0:
                    return hit
                # Negative entry found for a fallback query: this exact query
                # is known to return nothing, but later fallbacks may still
                # succeed for this institution.
                continue

            # Make API request
            if i == 0:
                print(f" [API] {name[:60]}")
            else:
                print(f" [API-FALLBACK-{i}] {query[:60]}")

            try:
                results = self._query_nominatim(query)
            except (requests.RequestException, ValueError) as e:
                print(f" ✗ Error: {e}")
                transient_failure = True
                continue  # Try next fallback

            if not results:
                if i < last_index:
                    print(" ⚠ No results, trying fallback...")
                else:
                    print(" ⚠ No results found (all strategies exhausted)")
                continue

            result = self._parse_nominatim_result(results[0])
            if result:
                # Success! Cache under original query for future lookups
                self.cache.put(primary_query, self._result_to_dict(result))
                if i > 0:
                    print(f" ✓ Found via fallback strategy {i}")
                return result

            # Unparseable result: not a definitive "not found"
            transient_failure = True

        # Only record a negative result when every strategy gave a definitive
        # answer; otherwise leave the cache untouched so the next run retries.
        if not transient_failure:
            self.cache.put(primary_query, {'found': False})
        return None

    def _query_nominatim(self, query: str) -> List[Dict[str, Any]]:
        """
        Send one search request to Nominatim and return the decoded result list.

        Every attempt - successful or not - is counted in ``stats['api_calls']``
        and followed by ``REQUEST_DELAY`` seconds of sleep, so that a failed
        request cannot be followed immediately by another one.

        Raises:
            requests.RequestException: network failure or HTTP error status
            ValueError: response body is not JSON or not a JSON list
        """
        params = {
            'q': query,
            'format': 'json',
            'limit': 1,
            'addressdetails': 1
        }
        headers = {
            'User-Agent': USER_AGENT
        }

        try:
            response = requests.get(NOMINATIM_URL, params=params, headers=headers, timeout=10)
            response.raise_for_status()
            results = response.json()
        finally:
            self.stats['api_calls'] += 1
            time.sleep(REQUEST_DELAY)  # Respect rate limit

        if not isinstance(results, list):
            raise ValueError(f"Unexpected Nominatim response type: {type(results).__name__}")
        return results

    def _build_fallback_queries(self, name: str, region: str) -> List[str]:
        """
        Build a list of fallback queries with progressively simplified names.

        Strategy:
        1. Full name + region + Chile
        2. Remove parenthetical content + region + Chile
        3. "Museo" + up to the last 3 words following it + region + Chile
        4. Generic institution type + region + Chile (last resort)

        Returns:
            Queries in priority order with duplicates removed. The first
            element is always the full-name query.
        """
        queries = []

        # Strategy 1: Full name
        queries.append(f"{name}, {region}, Chile")

        # Strategy 2: Remove parenthetical content (MASMA, MUHNCAL, etc.)
        clean_name = re.sub(r'\s*\([^)]*\)', '', name).strip()
        if clean_name and clean_name != name:
            queries.append(f"{clean_name}, {region}, Chile")

        # Strategy 3: Extract the most distinctive part of a museum name.
        # Words come from the cleaned name so acronyms in parentheses do not
        # use up tail slots, and the tail is taken only from the words AFTER
        # "Museo" so short names are not duplicated ("Museo Museo Nacional").
        words = clean_name.split()
        if 'Museo' in words:
            idx = words.index('Museo')
            tail = words[idx + 1:][-3:]
            distinctive = ' '.join(['Museo'] + tail)
            queries.append(f"{distinctive}, {region}, Chile")

        # Strategy 4: Generic institution type + region (last resort)
        for inst_type in ['Museo', 'Archivo', 'Biblioteca', 'Universidad']:
            if inst_type in name:
                queries.append(f"{inst_type}, {region}, Chile")
                break

        # Remove duplicates while preserving order
        return list(dict.fromkeys(queries))

    def _parse_nominatim_result(self, result: Dict[str, Any]) -> Optional[GeocodingResult]:
        """Parse a Nominatim result into GeocodingResult (None if malformed)"""
        try:
            address = result.get('address') or {}

            # Extract city name (try multiple fields)
            city = (
                address.get('city') or
                address.get('town') or
                address.get('municipality') or
                address.get('village') or
                address.get('county')
            )

            geocoding_result = GeocodingResult(
                city=city,
                latitude=float(result['lat']),
                longitude=float(result['lon']),
                osm_type=result.get('osm_type'),
                osm_id=result.get('osm_id'),
                display_name=result.get('display_name'),
                confidence=0.8  # Medium confidence for geocoded data
            )
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            print(f" ✗ Parse error: {e}")
            return None

        if city:
            print(f" ✓ {city} ({geocoding_result.latitude:.4f}, {geocoding_result.longitude:.4f})")
        else:
            print(f" ⚠ No city found ({geocoding_result.latitude:.4f}, {geocoding_result.longitude:.4f})")

        return geocoding_result

    def _result_to_dict(self, result: GeocodingResult) -> Dict[str, Any]:
        """Convert GeocodingResult to dict for caching"""
        return {
            'found': True,
            'city': result.city,
            'latitude': result.latitude,
            'longitude': result.longitude,
            'osm_type': result.osm_type,
            'osm_id': result.osm_id,
            'display_name': result.display_name,
            'confidence': result.confidence
        }

    def _dict_to_result(self, data: Dict[str, Any]) -> Optional[GeocodingResult]:
        """Convert cached dict to GeocodingResult (None for negative entries)"""
        if not data.get('found', False):
            return None

        return GeocodingResult(
            city=data.get('city'),
            latitude=data.get('latitude'),
            longitude=data.get('longitude'),
            osm_type=data.get('osm_type'),
            osm_id=data.get('osm_id'),
            display_name=data.get('display_name'),
            confidence=data.get('confidence', 0.8)
        )


def load_institutions(filepath: Path) -> List[Dict[str, Any]]:
    """Load institutions from YAML file (an empty file yields an empty list)"""
    with open(filepath, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # safe_load returns None for an empty document; callers expect a list.
    return institutions or []


def save_institutions(institutions: List[Dict[str, Any]], filepath: Path):
    """Save institutions to YAML file, preceded by a short header comment"""
    filepath.parent.mkdir(parents=True, exist_ok=True)

    yaml_content = yaml.dump(
        institutions,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False
    )
    # The header below already writes the document start marker, so drop one
    # if the dumper emitted it as well.
    if yaml_content.startswith('---\n'):
        yaml_content = yaml_content[4:]

    with open(filepath, 'w', encoding='utf-8') as f:
        # Write header comment
        f.write("---\n")
        f.write("# Chilean GLAM Institutions - Geocoded Edition v2\n")
        f.write(f"# Geocoding date: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Total institutions: {len(institutions)}\n")
        f.write("\n")
        f.write(yaml_content)


def enrich_institution(institution: Dict[str, Any], geocoder: ChileanGeocoder) -> Dict[str, Any]:
    """
    Enrich a single institution with geocoding data.

    Only the first entry of ``institution['locations']`` is considered. The
    institution dict is modified in place and also returned. Updates
    ``geocoder.stats`` ('total' plus exactly one of 'already_geocoded',
    'geocoded' or 'failed').

    Returns updated institution dict.
    """
    geocoder.stats['total'] += 1

    name = institution.get('name', 'Unknown')
    locations = institution.get('locations', [])

    if not locations:
        print(f"⚠ No location data for: {name}")
        geocoder.stats['failed'] += 1
        return institution

    location = locations[0]  # Take first location

    # Check if already geocoded. Coordinates are compared against None rather
    # than by truthiness so that a legitimate 0.0 is not mistaken for missing.
    if (
        location.get('city')
        and location.get('latitude') is not None
        and location.get('longitude') is not None
    ):
        print(f"✓ Already geocoded: {name} ({location.get('city')})")
        geocoder.stats['already_geocoded'] += 1
        return institution

    region = location.get('region')
    if not region:
        print(f"⚠ No region data for: {name}")
        geocoder.stats['failed'] += 1
        return institution

    # Geocode
    print(f"\n[{geocoder.stats['total']}] Geocoding: {name}")
    result = geocoder.geocode_institution(name, region)

    if not result:
        geocoder.stats['failed'] += 1
        return institution

    # Update location
    if result.city:
        location['city'] = result.city
    location['latitude'] = result.latitude
    location['longitude'] = result.longitude

    # Update provenance (skipped when absent or not a mapping, e.g. YAML null)
    provenance = institution.get('provenance')
    if isinstance(provenance, dict):
        old_method = provenance.get('extraction_method') or ''
        # Avoid a dangling " + " prefix when there was no previous method.
        provenance['extraction_method'] = (
            f"{old_method} + Nominatim geocoding" if old_method else "Nominatim geocoding"
        )

        score = provenance.get('confidence_score')
        if score is None:
            score = 0.85
        # NOTE(review): the 0.95 cap also lowers a pre-existing score above
        # 0.95 - confirm this is intended.
        provenance['confidence_score'] = min(score + 0.05, 0.95)

    # Add OSM identifier
    if result.osm_type and result.osm_id:
        osm_identifier = {
            'identifier_scheme': 'OpenStreetMap',
            'identifier_value': f"{result.osm_type}/{result.osm_id}",
            'identifier_url': f"https://www.openstreetmap.org/{result.osm_type}/{result.osm_id}"
        }

        # A missing key and an explicit YAML null are both treated as "none yet"
        if not institution.get('identifiers'):
            institution['identifiers'] = []

        # Check if OSM identifier already exists
        has_osm = any(
            identifier.get('identifier_scheme') == 'OpenStreetMap'
            for identifier in institution['identifiers']
        )

        if not has_osm:
            institution['identifiers'].append(osm_identifier)

    geocoder.stats['geocoded'] += 1
    return institution


def generate_report(stats: Dict[str, int], output_file: Path, report_file: Path):
    """
    Generate geocoding report.

    Builds a Markdown summary from ``stats``, writes it to ``report_file`` and
    returns the report text.
    """
    total = stats['total']
    geocoded = stats['geocoded']
    already_geocoded = stats['already_geocoded']
    failed = stats['failed']
    api_calls = stats['api_calls']
    cached = stats['cached']

    total_with_coords = geocoded + already_geocoded
    coverage_pct = (total_with_coords / total * 100) if total > 0 else 0

    lookups = api_calls + cached
    cache_pct = (cached / lookups * 100) if lookups > 0 else 0

    target_met = coverage_pct >= 60
    generated = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
    status = '✓ TARGET MET' if target_met else '✗ Below target'
    if target_met:
        next_step = '- ✓ Chilean geocoding complete. Ready for Mexican institutions.'
    else:
        next_step = '- Review failed geocoding attempts and retry with refined queries'

    report = f"""# Chilean Institutions Geocoding Report v2

**Generated**: {generated}

## Summary

- **Total institutions**: {total}
- **Successfully geocoded**: {geocoded}
- **Already geocoded**: {already_geocoded}
- **Failed to geocode**: {failed}
- **Total with coordinates**: {total_with_coords} ({coverage_pct:.1f}%)

## API Usage

- **Nominatim API calls**: {api_calls}
- **Cache hits**: {cached}
- **Cache efficiency**: {cache_pct:.1f}%

## Target Achievement

- **Target coverage**: 60% (54+ institutions)
- **Actual coverage**: {coverage_pct:.1f}% ({total_with_coords} institutions)
- **Status**: {status}

## Output Files

- **Geocoded YAML**: `{output_file}`
- **Cache file**: `{CACHE_FILE}`
- **This report**: `{report_file}`

## Next Steps

{next_step}
- Repeat geocoding process for Mexican institutions (117 records, currently 5.9% geocoded)
- Final deliverable: 304 institutions with comprehensive geocoding across Brazil, Chile, Mexico

---
*Geocoding performed using Nominatim API with 1 req/sec rate limit*
"""

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"\n✓ Report saved to: {report_file}")

    return report


def main():
    """Main geocoding workflow: load, geocode, save, report. Returns exit code."""
    print("=" * 80)
    print("Chilean Heritage Institutions Geocoding - v2")
    print("=" * 80)

    # Check input file exists
    if not INPUT_FILE.exists():
        print(f"✗ Input file not found: {INPUT_FILE}")
        sys.exit(1)

    # Load institutions
    print(f"\n1. Loading institutions from: {INPUT_FILE}")
    institutions = load_institutions(INPUT_FILE)
    print(f" ✓ Loaded {len(institutions)} institutions")

    # Initialize geocoder with cache
    print(f"\n2. Initializing geocoder with cache: {CACHE_FILE}")
    cache = GeocodingCache(CACHE_FILE)
    geocoder = ChileanGeocoder(cache)

    # Enrich institutions
    print("\n3. Geocoding institutions...")
    print(" (This may take several minutes due to 1 req/sec rate limit)\n")

    enriched = [enrich_institution(institution, geocoder) for institution in institutions]

    # Save results
    print(f"\n4. Saving geocoded institutions to: {OUTPUT_FILE}")
    save_institutions(enriched, OUTPUT_FILE)
    print(f" ✓ Saved {len(enriched)} institutions")

    # Generate report
    print("\n5. Generating report...")
    report = generate_report(geocoder.stats, OUTPUT_FILE, REPORT_FILE)

    print("\n" + "=" * 80)
    print(report)
    print("=" * 80)

    # Summary
    total = geocoder.stats['total']
    total_with_coords = geocoder.stats['geocoded'] + geocoder.stats['already_geocoded']
    coverage_pct = (total_with_coords / total * 100) if total > 0 else 0

    if coverage_pct >= 60:
        print("\n✓ SUCCESS: Chilean geocoding complete!")
        print(f" Achieved {coverage_pct:.1f}% coverage (target: 60%)")
    else:
        print(f"\n⚠ WARNING: Coverage below target ({coverage_pct:.1f}% < 60%)")
        print(" Review failed attempts and consider manual geocoding")

    return 0


if __name__ == '__main__':
    sys.exit(main())