#!/usr/bin/env python3 """ Generate comprehensive geocoding enrichment report. """ import yaml from pathlib import Path from collections import Counter from datetime import datetime INPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml") OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml") CACHE_FILE = Path("/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml") REPORT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_geocoding_report_v3.md") def main(): # Load data with open(INPUT_FILE, 'r', encoding='utf-8') as f: input_records = yaml.safe_load(f) with open(OUTPUT_FILE, 'r', encoding='utf-8') as f: output_records = yaml.safe_load(f) with open(CACHE_FILE, 'r', encoding='utf-8') as f: cache = yaml.safe_load(f) or {} # Analyze changes input_with_cities = sum( 1 for r in input_records if r.get('locations') and any(loc.get('city') for loc in r['locations']) ) output_with_cities = sum( 1 for r in output_records if r.get('locations') and any(loc.get('city') for loc in r['locations']) ) output_with_coords = sum( 1 for r in output_records if r.get('locations') and any(loc.get('latitude') for loc in r['locations']) ) newly_geocoded = output_with_cities - input_with_cities # Count OSM identifiers added osm_count = sum( 1 for r in output_records if r.get('identifiers') and any( id.get('identifier_scheme') == 'OpenStreetMap' for id in r['identifiers'] ) ) # Collect cities found cities = [] for r in output_records: if r.get('locations'): for loc in r['locations']: if loc.get('city'): cities.append(loc['city']) city_counts = Counter(cities) # States with geocoded institutions state_cities = {} for r in output_records: if r.get('locations') and r['locations']: state = r['locations'][0].get('region') city = r['locations'][0].get('city') if state and city: if state not in state_cities: state_cities[state] = [] state_cities[state].append(city) # Failed geocoding attempts failed_institutions = [] for r in output_records: has_city = r.get('locations') and any(loc.get('city') for loc in r['locations']) has_state = r.get('locations') and r['locations'] and r['locations'][0].get('region') if not has_city and has_state: failed_institutions.append({ 'name': r.get('name', 'Unknown'), 'state': r['locations'][0].get('region'), 'type': r.get('institution_type', 'Unknown') }) # Generate report report = f"""# Brazilian GLAM Geocoding Enrichment Report - v3.0 **Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ## Summary This report documents the geocoding enrichment process for Brazilian heritage institutions using the Nominatim API (OpenStreetMap). 

## Input/Output Files

- **Input**: `{INPUT_FILE.name}`
- **Output**: `{OUTPUT_FILE.name}`
- **Cache**: `{CACHE_FILE.name}`

## Overall Statistics

| Metric | Before (v2) | After (v3) | Change |
|--------|-------------|------------|--------|
| **Total records** | {len(input_records)} | {len(output_records)} | - |
| **Records with cities** | {input_with_cities} ({input_with_cities/len(input_records)*100:.1f}%) | {output_with_cities} ({output_with_cities/len(output_records)*100:.1f}%) | +{newly_geocoded} |
| **Records with coordinates** | 0 (0.0%) | {output_with_coords} ({output_with_coords/len(output_records)*100:.1f}%) | +{output_with_coords} |
| **OpenStreetMap identifiers** | 0 | {osm_count} | +{osm_count} |

## Geocoding Performance

| Category | Count | Percentage |
|----------|-------|------------|
| **Already had cities** | {input_with_cities} | {input_with_cities/len(input_records)*100:.1f}% |
| **Successfully geocoded** | {newly_geocoded} | {newly_geocoded/len(input_records)*100:.1f}% |
| **Failed geocoding** | {len(failed_institutions)} | {len(failed_institutions)/len(input_records)*100:.1f}% |
| **Total with cities (v3)** | {output_with_cities} | {output_with_cities/len(output_records)*100:.1f}% |

### Target Achievement

- **Target**: 60% city coverage (58 records minimum)
- **Achieved**: {output_with_cities} records ({output_with_cities/len(output_records)*100:.1f}%)
- **Status**: {'✓ TARGET MET' if output_with_cities >= 58 else '✗ TARGET NOT MET'}

## Geographic Distribution

### Cities Found ({len(city_counts)} unique cities)

Top 15 cities by institution count:

"""

    for city, count in city_counts.most_common(15):
        report += f"- **{city}**: {count} institution{'s' if count > 1 else ''}\n"

    report += f"\n### States with Geocoded Institutions ({len(state_cities)} states)\n\n"
    for state in sorted(state_cities.keys()):
        unique_cities = len(set(state_cities[state]))
        report += f"- **{state}**: {len(state_cities[state])} institutions in {unique_cities} {'cities' if unique_cities > 1 else 'city'}\n"

    if failed_institutions:
        report += f"\n## Failed Geocoding Attempts ({len(failed_institutions)} institutions)\n\n"
        report += "These institutions have state information but could not be geocoded:\n\n"
        for inst in sorted(failed_institutions, key=lambda x: x['state']):
            report += f"- **{inst['name']}** ({inst['type']}) - {inst['state']}\n"

    report += f"""
## API Cache Statistics

| Metric | Value |
|--------|-------|
| **Total cache entries** | {len(cache)} |
| **Successful lookups** | {sum(1 for v in cache.values() if v is not None)} ({sum(1 for v in cache.values() if v is not None)/len(cache)*100:.1f}%) |
| **Failed lookups** | {sum(1 for v in cache.values() if v is None)} ({sum(1 for v in cache.values() if v is None)/len(cache)*100:.1f}%) |

## Data Quality Enhancements

The geocoding process added:

1. **City names** - Extracted from OpenStreetMap address data
2. **Geographic coordinates** - Latitude/longitude for mapping
3. **OpenStreetMap identifiers** - OSM type/ID for cross-referencing
4. **Provenance updates** - Extraction timestamps and confidence adjustments

### Confidence Score Adjustments

- Successfully geocoded records received a +0.05 confidence boost (capped at 0.85)
- Extraction method updated to include "+ Nominatim geocoding"

## Next Steps

### Recommended Improvements

1. **Manual verification** of failed geocoding attempts ({len(failed_institutions)} institutions)
2. **Website enrichment** - Extract URLs to improve coverage from current 9.3%
3. **Wikidata integration** - Cross-reference institutions with Wikidata Q-IDs
4. **Address enrichment** - Add street addresses where available
5. **Collection metadata** - Extract collection information from institutional websites

### Priority Actions

1. Review failed geocoding cases to identify patterns
2. Attempt alternative geocoding strategies (city+state only, abbreviations, etc.)
3. Cross-reference with IBRAM registry for official museum locations
4. Implement web scraping for institutional websites

## Technical Notes

- **API**: OpenStreetMap Nominatim
- **Rate limiting**: 1.1 seconds per request
- **Total processing time**: ~{len(cache) * 1.1 / 60:.1f} minutes
- **Cache format**: YAML (persistent across runs)
- **User-Agent**: GLAM-Data-Extraction/0.2.0

---

**Report Version**: 3.0
**Data Version**: v3 (geocoded)
**Schema Compliance**: LinkML v0.2.0
**Generated by**: `generate_geocoding_report.py`
"""

    # Save report
    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"✓ Geocoding report saved to: {REPORT_FILE}")
    print()
    print("=" * 70)
    print("Key Statistics")
    print("=" * 70)
    print(f"City coverage: {output_with_cities}/{len(output_records)} ({output_with_cities/len(output_records)*100:.1f}%)")
    print(f"Coordinate coverage: {output_with_coords}/{len(output_records)} ({output_with_coords/len(output_records)*100:.1f}%)")
    print(f"Newly geocoded: {newly_geocoded}")
    print(f"Failed attempts: {len(failed_institutions)}")
    print(f"Unique cities found: {len(city_counts)}")
    print(f"States covered: {len(state_cities)}/27")
    print()


if __name__ == "__main__":
    main()