#!/usr/bin/env python3
|
|
"""
|
|
Generate comprehensive geocoding enrichment report.
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
from datetime import datetime
|
|
|
|
# Curated v2 instance data: the input to the geocoding run (the "before" state).
INPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")
# Geocoded v3 instance data: the enriched output analyzed by this report (the "after" state).
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml")
# Persistent Nominatim lookup cache; entries whose value is None are counted as failed lookups.
CACHE_FILE = Path("/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml")
# Destination path for the generated Markdown report.
REPORT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_geocoding_report_v3.md")
|
|
|
|
def _load_yaml(path, default):
    """Load a YAML document from *path*, substituting *default* when empty.

    yaml.safe_load returns None for an empty file; callers rely on getting a
    list/dict back, so an empty file must not propagate None into the loops
    and len() calls below.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or default


def _count_with_location_field(records, field):
    """Count records having at least one location with a truthy *field* value."""
    return sum(
        1 for r in records
        if r.get('locations') and any(loc.get(field) for loc in r['locations'])
    )


def _pct(part, whole):
    """Return *part* as a percentage of *whole*; 0.0 when *whole* is zero.

    Guards the report formatting against ZeroDivisionError (e.g. an empty
    geocoding cache, which the original code allowed via `safe_load(f) or {}`
    and then divided by).
    """
    return part / whole * 100 if whole else 0.0


def main():
    """Generate the geocoding enrichment Markdown report and print key stats.

    Reads the v2 (input) and v3 (output) instance files plus the Nominatim
    cache, computes before/after coverage statistics, writes REPORT_FILE,
    and echoes a short summary to stdout.
    """
    # Load data; defaults guard against empty YAML files (which parse to None).
    input_records = _load_yaml(INPUT_FILE, [])
    output_records = _load_yaml(OUTPUT_FILE, [])
    cache = _load_yaml(CACHE_FILE, {})

    # Analyze changes: city/coordinate coverage before and after geocoding.
    input_with_cities = _count_with_location_field(input_records, 'city')
    output_with_cities = _count_with_location_field(output_records, 'city')
    output_with_coords = _count_with_location_field(output_records, 'latitude')
    newly_geocoded = output_with_cities - input_with_cities

    # Count OSM identifiers added ('ident' avoids shadowing the builtin 'id').
    osm_count = sum(
        1 for r in output_records
        if r.get('identifiers') and any(
            ident.get('identifier_scheme') == 'OpenStreetMap'
            for ident in r['identifiers']
        )
    )

    # Collect every city name across all locations of all output records.
    cities = [
        loc['city']
        for r in output_records
        for loc in (r.get('locations') or [])
        if loc.get('city')
    ]
    city_counts = Counter(cities)

    # States with geocoded institutions (first location only, as before).
    state_cities = {}
    for r in output_records:
        if r.get('locations'):
            state = r['locations'][0].get('region')
            city = r['locations'][0].get('city')
            if state and city:
                state_cities.setdefault(state, []).append(city)

    # Failed geocoding attempts: a state is known but no city was resolved.
    failed_institutions = []
    for r in output_records:
        locations = r.get('locations') or []
        has_city = any(loc.get('city') for loc in locations)
        has_state = bool(locations) and locations[0].get('region')
        if not has_city and has_state:
            failed_institutions.append({
                'name': r.get('name', 'Unknown'),
                'state': locations[0].get('region'),
                'type': r.get('institution_type', 'Unknown')
            })

    # Cache hit/miss totals, hoisted so each sum is computed exactly once.
    cache_hits = sum(1 for v in cache.values() if v is not None)
    cache_misses = len(cache) - cache_hits

    # Generate report
    report = f"""# Brazilian GLAM Geocoding Enrichment Report - v3.0

**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Summary

This report documents the geocoding enrichment process for Brazilian heritage institutions using the Nominatim API (OpenStreetMap).

## Input/Output Files

- **Input**: `{INPUT_FILE.name}`
- **Output**: `{OUTPUT_FILE.name}`
- **Cache**: `{CACHE_FILE.name}`

## Overall Statistics

| Metric | Before (v2) | After (v3) | Change |
|--------|-------------|------------|--------|
| **Total records** | {len(input_records)} | {len(output_records)} | - |
| **Records with cities** | {input_with_cities} ({_pct(input_with_cities, len(input_records)):.1f}%) | {output_with_cities} ({_pct(output_with_cities, len(output_records)):.1f}%) | +{newly_geocoded} |
| **Records with coordinates** | 0 (0.0%) | {output_with_coords} ({_pct(output_with_coords, len(output_records)):.1f}%) | +{output_with_coords} |
| **OpenStreetMap identifiers** | 0 | {osm_count} | +{osm_count} |

## Geocoding Performance

| Category | Count | Percentage |
|----------|-------|------------|
| **Already had cities** | {input_with_cities} | {_pct(input_with_cities, len(input_records)):.1f}% |
| **Successfully geocoded** | {newly_geocoded} | {_pct(newly_geocoded, len(input_records)):.1f}% |
| **Failed geocoding** | {len(failed_institutions)} | {_pct(len(failed_institutions), len(input_records)):.1f}% |
| **Total with cities (v3)** | {output_with_cities} | {_pct(output_with_cities, len(output_records)):.1f}% |

### Target Achievement

- **Target**: 60% city coverage (58 records minimum)
- **Achieved**: {output_with_cities} records ({_pct(output_with_cities, len(output_records)):.1f}%)
- **Status**: {'✓ TARGET MET' if output_with_cities >= 58 else '✗ TARGET NOT MET'}

## Geographic Distribution

### Cities Found ({len(city_counts)} unique cities)

Top 15 cities by institution count:

"""

    for city, count in city_counts.most_common(15):
        report += f"- **{city}**: {count} institution{'s' if count > 1 else ''}\n"

    report += f"\n### States with Geocoded Institutions ({len(state_cities)} states)\n\n"

    for state in sorted(state_cities.keys()):
        unique_cities = len(set(state_cities[state]))
        report += f"- **{state}**: {len(state_cities[state])} institutions in {unique_cities} {'cities' if unique_cities > 1 else 'city'}\n"

    if failed_institutions:
        report += f"\n## Failed Geocoding Attempts ({len(failed_institutions)} institutions)\n\n"
        report += "These institutions have state information but could not be geocoded:\n\n"

        for inst in sorted(failed_institutions, key=lambda x: x['state']):
            report += f"- **{inst['name']}** ({inst['type']}) - {inst['state']}\n"

    report += f"""

## API Cache Statistics

| Metric | Value |
|--------|-------|
| **Total cache entries** | {len(cache)} |
| **Successful lookups** | {cache_hits} ({_pct(cache_hits, len(cache)):.1f}%) |
| **Failed lookups** | {cache_misses} ({_pct(cache_misses, len(cache)):.1f}%) |

## Data Quality Enhancements

The geocoding process added:

1. **City names** - Extracted from OpenStreetMap address data
2. **Geographic coordinates** - Latitude/longitude for mapping
3. **OpenStreetMap identifiers** - OSM type/ID for cross-referencing
4. **Provenance updates** - Extraction timestamps and confidence adjustments

### Confidence Score Adjustments

- Successfully geocoded records received a +0.05 confidence boost (capped at 0.85)
- Extraction method updated to include "+ Nominatim geocoding"

## Next Steps

### Recommended Improvements

1. **Manual verification** of failed geocoding attempts ({len(failed_institutions)} institutions)
2. **Website enrichment** - Extract URLs to improve coverage from current 9.3%
3. **Wikidata integration** - Cross-reference institutions with Wikidata Q-IDs
4. **Address enrichment** - Add street addresses where available
5. **Collection metadata** - Extract collection information from institutional websites

### Priority Actions

1. Review failed geocoding cases to identify patterns
2. Attempt alternative geocoding strategies (city+state only, abbreviations, etc.)
3. Cross-reference with IBRAM registry for official museum locations
4. Implement web scraping for institutional websites

## Technical Notes

- **API**: OpenStreetMap Nominatim
- **Rate limiting**: 1.1 seconds per request
- **Total processing time**: ~{len(cache) * 1.1 / 60:.1f} minutes
- **Cache format**: YAML (persistent across runs)
- **User-Agent**: GLAM-Data-Extraction/0.2.0

---

**Report Version**: 3.0
**Data Version**: v3 (geocoded)
**Schema Compliance**: LinkML v0.2.0
**Generated by**: `generate_geocoding_report.py`
"""

    # Save report
    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"✓ Geocoding report saved to: {REPORT_FILE}")
    print()
    print("=" * 70)
    print("Key Statistics")
    print("=" * 70)
    print(f"City coverage: {output_with_cities}/{len(output_records)} ({_pct(output_with_cities, len(output_records)):.1f}%)")
    print(f"Coordinate coverage: {output_with_coords}/{len(output_records)} ({_pct(output_with_coords, len(output_records)):.1f}%)")
    print(f"Newly geocoded: {newly_geocoded}")
    print(f"Failed attempts: {len(failed_institutions)}")
    print(f"Unique cities found: {len(city_counts)}")
    print(f"States covered: {len(state_cities)}/27")
    print()
|
|
|
|
# Run the report generator only when executed as a script, so the module
# can be imported without side effects.
if __name__ == "__main__":
    main()
|