glam/generate_geocoding_report.py
2025-11-19 23:25:22 +01:00

225 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Generate comprehensive geocoding enrichment report.
"""
import yaml
from pathlib import Path
from collections import Counter
from datetime import datetime

# Curated v2 records: the "before" baseline for the report's comparisons.
INPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")
# Geocoded v3 records: the "after" data whose coverage is measured.
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml")
# Nominatim lookup cache (YAML mapping; None values mark failed lookups).
CACHE_FILE = Path("/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml")
# Destination for the generated markdown report.
REPORT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_geocoding_report_v3.md")
def main():
# Load data
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
input_records = yaml.safe_load(f)
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
output_records = yaml.safe_load(f)
with open(CACHE_FILE, 'r', encoding='utf-8') as f:
cache = yaml.safe_load(f) or {}
# Analyze changes
input_with_cities = sum(
1 for r in input_records
if r.get('locations') and any(loc.get('city') for loc in r['locations'])
)
output_with_cities = sum(
1 for r in output_records
if r.get('locations') and any(loc.get('city') for loc in r['locations'])
)
output_with_coords = sum(
1 for r in output_records
if r.get('locations') and any(loc.get('latitude') for loc in r['locations'])
)
newly_geocoded = output_with_cities - input_with_cities
# Count OSM identifiers added
osm_count = sum(
1 for r in output_records
if r.get('identifiers') and any(
id.get('identifier_scheme') == 'OpenStreetMap'
for id in r['identifiers']
)
)
# Collect cities found
cities = []
for r in output_records:
if r.get('locations'):
for loc in r['locations']:
if loc.get('city'):
cities.append(loc['city'])
city_counts = Counter(cities)
# States with geocoded institutions
state_cities = {}
for r in output_records:
if r.get('locations') and r['locations']:
state = r['locations'][0].get('region')
city = r['locations'][0].get('city')
if state and city:
if state not in state_cities:
state_cities[state] = []
state_cities[state].append(city)
# Failed geocoding attempts
failed_institutions = []
for r in output_records:
has_city = r.get('locations') and any(loc.get('city') for loc in r['locations'])
has_state = r.get('locations') and r['locations'] and r['locations'][0].get('region')
if not has_city and has_state:
failed_institutions.append({
'name': r.get('name', 'Unknown'),
'state': r['locations'][0].get('region'),
'type': r.get('institution_type', 'Unknown')
})
# Generate report
report = f"""# Brazilian GLAM Geocoding Enrichment Report - v3.0
**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## Summary
This report documents the geocoding enrichment process for Brazilian heritage institutions using the Nominatim API (OpenStreetMap).
## Input/Output Files
- **Input**: `{INPUT_FILE.name}`
- **Output**: `{OUTPUT_FILE.name}`
- **Cache**: `{CACHE_FILE.name}`
## Overall Statistics
| Metric | Before (v2) | After (v3) | Change |
|--------|-------------|------------|--------|
| **Total records** | {len(input_records)} | {len(output_records)} | - |
| **Records with cities** | {input_with_cities} ({input_with_cities/len(input_records)*100:.1f}%) | {output_with_cities} ({output_with_cities/len(output_records)*100:.1f}%) | +{newly_geocoded} |
| **Records with coordinates** | 0 (0.0%) | {output_with_coords} ({output_with_coords/len(output_records)*100:.1f}%) | +{output_with_coords} |
| **OpenStreetMap identifiers** | 0 | {osm_count} | +{osm_count} |
## Geocoding Performance
| Category | Count | Percentage |
|----------|-------|------------|
| **Already had cities** | {input_with_cities} | {input_with_cities/len(input_records)*100:.1f}% |
| **Successfully geocoded** | {newly_geocoded} | {newly_geocoded/len(input_records)*100:.1f}% |
| **Failed geocoding** | {len(failed_institutions)} | {len(failed_institutions)/len(input_records)*100:.1f}% |
| **Total with cities (v3)** | {output_with_cities} | {output_with_cities/len(output_records)*100:.1f}% |
### Target Achievement
- **Target**: 60% city coverage (58 records minimum)
- **Achieved**: {output_with_cities} records ({output_with_cities/len(output_records)*100:.1f}%)
- **Status**: {'✓ TARGET MET' if output_with_cities >= 58 else '✗ TARGET NOT MET'}
## Geographic Distribution
### Cities Found ({len(city_counts)} unique cities)
Top 15 cities by institution count:
"""
for city, count in city_counts.most_common(15):
report += f"- **{city}**: {count} institution{'s' if count > 1 else ''}\n"
report += f"\n### States with Geocoded Institutions ({len(state_cities)} states)\n\n"
for state in sorted(state_cities.keys()):
unique_cities = len(set(state_cities[state]))
report += f"- **{state}**: {len(state_cities[state])} institutions in {unique_cities} {'cities' if unique_cities > 1 else 'city'}\n"
if failed_institutions:
report += f"\n## Failed Geocoding Attempts ({len(failed_institutions)} institutions)\n\n"
report += "These institutions have state information but could not be geocoded:\n\n"
for inst in sorted(failed_institutions, key=lambda x: x['state']):
report += f"- **{inst['name']}** ({inst['type']}) - {inst['state']}\n"
report += f"""
## API Cache Statistics
| Metric | Value |
|--------|-------|
| **Total cache entries** | {len(cache)} |
| **Successful lookups** | {sum(1 for v in cache.values() if v is not None)} ({sum(1 for v in cache.values() if v is not None)/len(cache)*100:.1f}%) |
| **Failed lookups** | {sum(1 for v in cache.values() if v is None)} ({sum(1 for v in cache.values() if v is None)/len(cache)*100:.1f}%) |
## Data Quality Enhancements
The geocoding process added:
1. **City names** - Extracted from OpenStreetMap address data
2. **Geographic coordinates** - Latitude/longitude for mapping
3. **OpenStreetMap identifiers** - OSM type/ID for cross-referencing
4. **Provenance updates** - Extraction timestamps and confidence adjustments
### Confidence Score Adjustments
- Successfully geocoded records received a +0.05 confidence boost (capped at 0.85)
- Extraction method updated to include "+ Nominatim geocoding"
## Next Steps
### Recommended Improvements
1. **Manual verification** of failed geocoding attempts ({len(failed_institutions)} institutions)
2. **Website enrichment** - Extract URLs to improve coverage from current 9.3%
3. **Wikidata integration** - Cross-reference institutions with Wikidata Q-IDs
4. **Address enrichment** - Add street addresses where available
5. **Collection metadata** - Extract collection information from institutional websites
### Priority Actions
1. Review failed geocoding cases to identify patterns
2. Attempt alternative geocoding strategies (city+state only, abbreviations, etc.)
3. Cross-reference with IBRAM registry for official museum locations
4. Implement web scraping for institutional websites
## Technical Notes
- **API**: OpenStreetMap Nominatim
- **Rate limiting**: 1.1 seconds per request
- **Total processing time**: ~{len(cache) * 1.1 / 60:.1f} minutes
- **Cache format**: YAML (persistent across runs)
- **User-Agent**: GLAM-Data-Extraction/0.2.0
---
**Report Version**: 3.0
**Data Version**: v3 (geocoded)
**Schema Compliance**: LinkML v0.2.0
**Generated by**: `generate_geocoding_report.py`
"""
# Save report
with open(REPORT_FILE, 'w', encoding='utf-8') as f:
f.write(report)
print(f"✓ Geocoding report saved to: {REPORT_FILE}")
print()
print("=" * 70)
print("Key Statistics")
print("=" * 70)
print(f"City coverage: {output_with_cities}/{len(output_records)} ({output_with_cities/len(output_records)*100:.1f}%)")
print(f"Coordinate coverage: {output_with_coords}/{len(output_records)} ({output_with_coords/len(output_records)*100:.1f}%)")
print(f"Newly geocoded: {newly_geocoded}")
print(f"Failed attempts: {len(failed_institutions)}")
print(f"Unique cities found: {len(city_counts)}")
print(f"States covered: {len(state_cities)}/27")
print()
# Run report generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()