- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
500 lines
18 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Geocode Chilean Heritage Institutions using Nominatim API
|
|
|
|
Takes chilean_institutions_curated.yaml as input and enriches location data with:
|
|
- City names
|
|
- Latitude/longitude coordinates
|
|
- OpenStreetMap identifiers
|
|
|
|
Respects Nominatim usage policy: 1 request/second, caching, descriptive User-Agent.
|
|
"""
|
|
|
|
import yaml
|
|
import requests
|
|
import time
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
from dataclasses import dataclass
|
|
import sys
|
|
|
|
# Configuration
# Input: curated institution records; outputs: geocoded YAML and a Markdown report.
INPUT_FILE = Path("data/instances/chilean_institutions_curated.yaml")
OUTPUT_FILE = Path("data/instances/chilean_institutions_geocoded_v2.yaml")
REPORT_FILE = Path("data/instances/chilean_geocoding_report_v2.md")
# Hidden dotfile so repeated runs can reuse earlier Nominatim responses.
CACHE_FILE = Path("data/instances/.geocoding_cache_chile.yaml")

NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
# Nominatim's usage policy asks for a descriptive, identifying User-Agent.
USER_AGENT = "GLAM-Heritage-Data-Project/1.0 (https://github.com/cultural-heritage/glam-extractor)"
REQUEST_DELAY = 1.1  # Seconds between requests (Nominatim requires 1/sec max)
|
|
|
|
|
|
@dataclass
class GeocodingResult:
    """Result from Nominatim geocoding"""
    # Settlement name taken from the Nominatim address block
    # (city/town/municipality/village/county fallback chain); may be None.
    city: Optional[str]
    # Coordinates parsed from the Nominatim 'lat'/'lon' fields.
    latitude: Optional[float]
    longitude: Optional[float]
    # OpenStreetMap element type and id, used downstream to build
    # "osm_type/osm_id" identifier values and openstreetmap.org URLs.
    osm_type: Optional[str]
    osm_id: Optional[str]
    # Full human-readable address string returned by Nominatim.
    display_name: Optional[str]
    # Heuristic confidence; geocoded matches are assigned 0.8.
    confidence: float
|
|
|
|
|
|
class GeocodingCache:
    """Persistent query -> result cache so duplicate Nominatim lookups are avoided."""

    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.load()

    def load(self):
        """Populate the in-memory cache from disk, if a cache file exists."""
        if not self.cache_file.exists():
            return
        with open(self.cache_file, 'r', encoding='utf-8') as handle:
            loaded = yaml.safe_load(handle)
        self.cache = loaded if loaded else {}
        print(f"✓ Loaded {len(self.cache)} cached geocoding results")

    def save(self):
        """Write the entire cache back to disk."""
        with open(self.cache_file, 'w', encoding='utf-8') as handle:
            yaml.dump(self.cache, handle, allow_unicode=True, default_flow_style=False)

    def get(self, query: str) -> Optional[Dict[str, Any]]:
        """Return the cached result for *query*, or None on a miss."""
        return self.cache.get(query)

    def put(self, query: str, result: Dict[str, Any]):
        """Record *result* under *query* and persist immediately."""
        self.cache[query] = result
        self.save()
|
|
|
|
|
|
class ChileanGeocoder:
    """Geocode Chilean institutions using Nominatim API"""

    def __init__(self, cache: GeocodingCache):
        self.cache = cache
        # Run counters, consumed later by generate_report().
        self.stats = {
            'total': 0,
            'cached': 0,
            'api_calls': 0,
            'geocoded': 0,
            'failed': 0,
            'already_geocoded': 0
        }

    def geocode_institution(self, name: str, region: str) -> Optional[GeocodingResult]:
        """
        Geocode an institution by name and region.
        Uses fallback strategies with simplified queries if initial search fails.

        Args:
            name: Institution name
            region: Chilean region name

        Returns:
            GeocodingResult if successful, None otherwise
        """
        # Build search queries with fallback strategies
        queries = self._build_fallback_queries(name, region)

        for i, query in enumerate(queries):
            # Check cache first. Negative results are cached as {'found': False},
            # which _dict_to_result maps back to None.
            cached = self.cache.get(query)
            if cached:
                self.stats['cached'] += 1
                if i == 0:
                    print(f"  [CACHE] {name[:60]}")
                else:
                    print(f"  [CACHE-FALLBACK-{i}] {query[:60]}")
                return self._dict_to_result(cached)

            # Make API request
            if i == 0:
                print(f"  [API] {name[:60]}")
            else:
                print(f"  [API-FALLBACK-{i}] {query[:60]}")

            params = {
                'q': query,
                'format': 'json',
                'limit': 1,
                'addressdetails': 1
            }

            headers = {
                'User-Agent': USER_AGENT
            }

            try:
                response = requests.get(NOMINATIM_URL, params=params, headers=headers, timeout=10)

                # FIX: count the call and honor the rate limit as soon as the
                # request completes. Previously this ran only after
                # raise_for_status() succeeded, so HTTP-error responses (which
                # immediately trigger a fallback retry) were neither counted
                # nor delayed -- violating Nominatim's 1 req/sec policy.
                self.stats['api_calls'] += 1
                time.sleep(REQUEST_DELAY)  # Respect rate limit

                response.raise_for_status()
                results = response.json()

                if not results:
                    if i < len(queries) - 1:
                        print(f"  ⚠ No results, trying fallback...")
                        continue  # Try next fallback
                    else:
                        print(f"  ⚠ No results found (all strategies exhausted)")
                        self.cache.put(queries[0], {'found': False})
                        return None

                # Success! Cache under original query for future lookups
                result = self._parse_nominatim_result(results[0])
                if result:
                    result_dict = self._result_to_dict(result)
                    self.cache.put(queries[0], result_dict)  # Cache under original query
                    if i > 0:
                        print(f"  ✓ Found via fallback strategy {i}")
                    return result

            except Exception as e:
                print(f"  ✗ Error: {e}")
                # Only cache a negative entry once every strategy has failed.
                if i == len(queries) - 1:
                    self.cache.put(queries[0], {'found': False, 'error': str(e)})
                continue  # Try next fallback

        return None

    def _build_fallback_queries(self, name: str, region: str) -> List[str]:
        """
        Build a list of fallback queries with progressively simplified names.

        Strategy:
        1. Full name + region + Chile
        2. Remove parenthetical content + region + Chile
        3. Extract museum/archive/library name + region + Chile
        4. Just institution type + region + Chile (last resort)
        """
        queries = []

        # Strategy 1: Full name
        queries.append(f"{name}, {region}, Chile")

        # Strategy 2: Remove parenthetical content (MASMA, MUHNCAL, etc.)
        clean_name = re.sub(r'\s*\([^)]*\)', '', name).strip()
        if clean_name != name:
            queries.append(f"{clean_name}, {region}, Chile")

        # Strategy 3: Extract key museum/archive words
        # For "Museo Universidad de Tarapacá San Miguel de Azapa" -> "Museo San Miguel de Azapa"
        if 'Museo' in name or 'Archivo' in name or 'Biblioteca' in name:
            # Try to extract the most distinctive part
            words = name.split()
            if 'Museo' in words:
                idx = words.index('Museo')
                # Take "Museo" + last 2-3 significant words
                distinctive = ' '.join(words[idx:idx+1] + words[-3:])
                distinctive = re.sub(r'\s*\([^)]*\)', '', distinctive).strip()
                queries.append(f"{distinctive}, {region}, Chile")

        # Strategy 4: Generic institution type + region (last resort)
        for inst_type in ['Museo', 'Archivo', 'Biblioteca', 'Universidad']:
            if inst_type in name:
                queries.append(f"{inst_type}, {region}, Chile")
                break

        # Remove duplicates while preserving order
        seen = set()
        unique_queries = []
        for q in queries:
            if q not in seen:
                seen.add(q)
                unique_queries.append(q)

        return unique_queries

    def _parse_nominatim_result(self, result: Dict[str, Any]) -> Optional[GeocodingResult]:
        """Parse a Nominatim result into GeocodingResult."""
        try:
            address = result.get('address', {})

            # Extract city name (try multiple fields)
            city = (
                address.get('city') or
                address.get('town') or
                address.get('municipality') or
                address.get('village') or
                address.get('county')
            )

            # FIX: Nominatim returns osm_id as an integer; coerce it to str so
            # it matches the GeocodingResult annotation and the cached value.
            raw_osm_id = result.get('osm_id')

            geocoding_result = GeocodingResult(
                city=city,
                latitude=float(result['lat']),
                longitude=float(result['lon']),
                osm_type=result.get('osm_type'),
                osm_id=str(raw_osm_id) if raw_osm_id is not None else None,
                display_name=result.get('display_name'),
                confidence=0.8  # Medium confidence for geocoded data
            )

            if city:
                print(f"  ✓ {city} ({geocoding_result.latitude:.4f}, {geocoding_result.longitude:.4f})")
            else:
                print(f"  ⚠ No city found ({geocoding_result.latitude:.4f}, {geocoding_result.longitude:.4f})")

            return geocoding_result

        except Exception as e:
            print(f"  ✗ Parse error: {e}")
            return None

    def _result_to_dict(self, result: GeocodingResult) -> Dict[str, Any]:
        """Convert GeocodingResult to dict for caching."""
        return {
            'found': True,
            'city': result.city,
            'latitude': result.latitude,
            'longitude': result.longitude,
            'osm_type': result.osm_type,
            'osm_id': result.osm_id,
            'display_name': result.display_name,
            'confidence': result.confidence
        }

    def _dict_to_result(self, data: Dict[str, Any]) -> Optional[GeocodingResult]:
        """Convert cached dict to GeocodingResult; None for cached negative results."""
        if not data.get('found', False):
            return None

        return GeocodingResult(
            city=data.get('city'),
            latitude=data.get('latitude'),
            longitude=data.get('longitude'),
            osm_type=data.get('osm_type'),
            osm_id=data.get('osm_id'),
            display_name=data.get('display_name'),
            confidence=data.get('confidence', 0.8)
        )
|
|
|
|
|
|
def load_institutions(filepath: Path) -> List[Dict[str, Any]]:
    """Load institution records from a YAML file.

    Args:
        filepath: Path to the YAML file to read.

    Returns:
        List of institution dicts. An empty file yields an empty list --
        yaml.safe_load returns None for an empty document, which previously
        leaked out of this function despite the declared return type.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    return institutions or []
|
|
|
|
|
|
def save_institutions(institutions: List[Dict[str, Any]], filepath: Path):
    """Write institution records to a YAML file with a commented header block."""
    # Manually emitted document start plus provenance comments.
    header_lines = [
        "---\n",
        "# Chilean GLAM Institutions - Geocoded Edition v2\n",
        f"# Geocoding date: {datetime.now(timezone.utc).isoformat()}\n",
        f"# Total institutions: {len(institutions)}\n",
        "\n",
    ]

    body = yaml.dump(institutions, allow_unicode=True, default_flow_style=False, sort_keys=False)
    # Drop any leading document marker from the dump -- the header already has one.
    if body.startswith('---\n'):
        body = body[4:]

    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(header_lines)
        f.write(body)
|
|
|
|
|
|
def enrich_institution(institution: Dict[str, Any], geocoder: ChileanGeocoder) -> Dict[str, Any]:
    """
    Enrich a single institution with geocoding data.

    Mutates and returns the same institution dict: fills in city/latitude/
    longitude on its first location, appends an OpenStreetMap identifier,
    and bumps provenance metadata when geocoding succeeds. Also updates
    geocoder.stats counters as a side effect.

    Args:
        institution: Institution record ('name', 'locations', optionally
            'provenance' and 'identifiers' keys).
        geocoder: Geocoder used for the lookup and stats accounting.

    Returns:
        Updated institution dict.
    """
    geocoder.stats['total'] += 1

    name = institution.get('name', 'Unknown')
    locations = institution.get('locations', [])

    if not locations:
        print(f"⚠ No location data for: {name}")
        geocoder.stats['failed'] += 1
        return institution

    location = locations[0]  # Take first location

    # Check if already geocoded. FIX: compare coordinates against None
    # explicitly so a legitimate 0.0 value would not force a re-geocode.
    if (location.get('city')
            and location.get('latitude') is not None
            and location.get('longitude') is not None):
        print(f"✓ Already geocoded: {name} ({location.get('city')})")
        geocoder.stats['already_geocoded'] += 1
        return institution

    region = location.get('region')
    if not region:
        print(f"⚠ No region data for: {name}")
        geocoder.stats['failed'] += 1
        return institution

    # Geocode
    print(f"\n[{geocoder.stats['total']}] Geocoding: {name}")
    result = geocoder.geocode_institution(name, region)

    if result:
        # Update location; keep any pre-existing city if Nominatim found none.
        if result.city:
            location['city'] = result.city
        location['latitude'] = result.latitude
        location['longitude'] = result.longitude

        # Update provenance: record the extra geocoding step and give a small
        # confidence boost, capped at 0.95.
        if 'provenance' in institution:
            old_method = institution['provenance'].get('extraction_method', '')
            institution['provenance']['extraction_method'] = f"{old_method} + Nominatim geocoding"
            institution['provenance']['confidence_score'] = min(
                institution['provenance'].get('confidence_score', 0.85) + 0.05,
                0.95
            )

        # Add OSM identifier
        if result.osm_type and result.osm_id:
            osm_identifier = {
                'identifier_scheme': 'OpenStreetMap',
                'identifier_value': f"{result.osm_type}/{result.osm_id}",
                'identifier_url': f"https://www.openstreetmap.org/{result.osm_type}/{result.osm_id}"
            }

            if 'identifiers' not in institution:
                institution['identifiers'] = []

            # Check if an OSM identifier already exists (avoid duplicates on
            # re-runs). FIX: loop variable renamed from 'id', which shadowed
            # the builtin.
            has_osm = any(
                ident.get('identifier_scheme') == 'OpenStreetMap'
                for ident in institution['identifiers']
            )

            if not has_osm:
                institution['identifiers'].append(osm_identifier)

        geocoder.stats['geocoded'] += 1
    else:
        geocoder.stats['failed'] += 1

    return institution
|
|
|
|
|
|
def generate_report(stats: Dict[str, int], output_file: Path, report_file: Path):
    """Generate a Markdown geocoding report, write it to report_file, and return it.

    Args:
        stats: Counter dict produced by ChileanGeocoder (keys: total, geocoded,
            already_geocoded, failed, api_calls, cached).
        output_file: Path of the geocoded YAML, referenced in the report text.
        report_file: Destination path for the Markdown report.

    Returns:
        The full report text (also printed by the caller).
    """

    # Unpack counters once for readability inside the template below.
    total = stats['total']
    geocoded = stats['geocoded']
    already_geocoded = stats['already_geocoded']
    failed = stats['failed']
    api_calls = stats['api_calls']
    cached = stats['cached']

    # Coverage counts both newly geocoded and previously geocoded records;
    # guard against division by zero when no institutions were processed.
    total_with_coords = geocoded + already_geocoded
    coverage_pct = (total_with_coords / total * 100) if total > 0 else 0

    report = f"""# Chilean Institutions Geocoding Report v2

**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}

## Summary

- **Total institutions**: {total}
- **Successfully geocoded**: {geocoded}
- **Already geocoded**: {already_geocoded}
- **Failed to geocode**: {failed}
- **Total with coordinates**: {total_with_coords} ({coverage_pct:.1f}%)

## API Usage

- **Nominatim API calls**: {api_calls}
- **Cache hits**: {cached}
- **Cache efficiency**: {(cached / (api_calls + cached) * 100) if (api_calls + cached) > 0 else 0:.1f}%

## Target Achievement

- **Target coverage**: 60% (54+ institutions)
- **Actual coverage**: {coverage_pct:.1f}% ({total_with_coords} institutions)
- **Status**: {'✓ TARGET MET' if coverage_pct >= 60 else '✗ Below target'}

## Output Files

- **Geocoded YAML**: `{output_file}`
- **Cache file**: `{CACHE_FILE}`
- **This report**: `{report_file}`

## Next Steps

{'- ✓ Chilean geocoding complete. Ready for Mexican institutions.' if coverage_pct >= 60 else '- Review failed geocoding attempts and retry with refined queries'}
- Repeat geocoding process for Mexican institutions (117 records, currently 5.9% geocoded)
- Final deliverable: 304 institutions with comprehensive geocoding across Brazil, Chile, Mexico

---
*Geocoding performed using Nominatim API with 1 req/sec rate limit*
"""

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"\n✓ Report saved to: {report_file}")
    return report
|
|
|
|
|
|
def main():
    """Main geocoding workflow"""

    banner = "=" * 80
    print(banner)
    print("Chilean Heritage Institutions Geocoding - v2")
    print(banner)

    # Abort early when the curated input is missing.
    if not INPUT_FILE.exists():
        print(f"✗ Input file not found: {INPUT_FILE}")
        sys.exit(1)

    # Step 1: load curated records.
    print(f"\n1. Loading institutions from: {INPUT_FILE}")
    institutions = load_institutions(INPUT_FILE)
    print(f"   ✓ Loaded {len(institutions)} institutions")

    # Step 2: geocoder backed by the on-disk cache.
    print(f"\n2. Initializing geocoder with cache: {CACHE_FILE}")
    geocoder = ChileanGeocoder(GeocodingCache(CACHE_FILE))

    # Step 3: enrich every record (rate-limited, so this can be slow).
    print(f"\n3. Geocoding institutions...")
    print("   (This may take several minutes due to 1 req/sec rate limit)\n")
    enriched = [enrich_institution(record, geocoder) for record in institutions]

    # Step 4: persist results.
    print(f"\n4. Saving geocoded institutions to: {OUTPUT_FILE}")
    save_institutions(enriched, OUTPUT_FILE)
    print(f"   ✓ Saved {len(enriched)} institutions")

    # Step 5: write and echo the Markdown report.
    print(f"\n5. Generating report...")
    report = generate_report(geocoder.stats, OUTPUT_FILE, REPORT_FILE)
    print("\n" + banner)
    print(report)
    print(banner)

    # Final coverage summary against the 60% target.
    run_stats = geocoder.stats
    total_with_coords = run_stats['geocoded'] + run_stats['already_geocoded']
    coverage_pct = (total_with_coords / run_stats['total'] * 100) if run_stats['total'] > 0 else 0

    if coverage_pct >= 60:
        print("\n✓ SUCCESS: Chilean geocoding complete!")
        print(f"   Achieved {coverage_pct:.1f}% coverage (target: 60%)")
    else:
        print(f"\n⚠ WARNING: Coverage below target ({coverage_pct:.1f}% < 60%)")
        print("   Review failed attempts and consider manual geocoding")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the workflow and propagate its return value as the exit status.
    raise SystemExit(main())
|