glam/check_geocoding_progress.py
2025-11-19 23:25:22 +01:00

73 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""Monitor geocoding progress."""
import yaml
from pathlib import Path
from datetime import datetime
# Geocoding cache; main() counts its entries and treats None values as failures.
CACHE_FILE = Path(
    "/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml"
)
# Curated input records; main() loads these to count how many still lack a city.
INPUT_FILE = Path(
    "/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml"
)
# Geocoded output; main() reports city coverage from it once the file exists.
OUTPUT_FILE = Path(
    "/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml"
)
def _has_city(record):
    """Return True if any entry in the record's 'locations' has a truthy 'city'."""
    locations = record.get('locations')
    return bool(locations) and any(loc.get('city') for loc in locations)


def _format_ratio(part, whole):
    """Format ``part/whole (pct%)``, printing ``n/a`` rather than dividing by zero."""
    if not whole:
        return f"{part}/{whole} (n/a)"
    return f"{part}/{whole} ({part / whole * 100:.1f}%)"


def main():
    """Print a snapshot of geocoding progress to stdout.

    Reads INPUT_FILE to count records and how many of them lack a city,
    reads CACHE_FILE (if present) to report how many lookups have been
    attempted and how many produced a non-None result, and reads
    OUTPUT_FILE (if present) to report how many output records have a city.

    Raises:
        OSError: if INPUT_FILE cannot be opened (it is read unconditionally).
        yaml.YAMLError: if any of the files that are read is not valid YAML.
    """
    # Seconds assumed per remaining cache entry when estimating time left.
    seconds_per_lookup = 1.1

    print("=" * 70)
    print(f"Geocoding Progress Check - {datetime.now().strftime('%H:%M:%S')}")
    print("=" * 70)
    print()

    # Load input to get total count.
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat that as no records.
        records = yaml.safe_load(f) or []

    total_records = len(records)
    needs_geocoding = sum(1 for r in records if not _has_city(r))

    print(f"Total records: {total_records}")
    print(f"Need geocoding: {needs_geocoding}")
    print()

    # Check cache.
    if CACHE_FILE.exists():
        with open(CACHE_FILE, 'r', encoding='utf-8') as f:
            cache = yaml.safe_load(f) or {}

        cache_entries = len(cache)
        # A None value marks a lookup that was attempted but found nothing.
        successful = sum(1 for v in cache.values() if v is not None)
        failed = cache_entries - successful

        print(f"Cache entries: {cache_entries}")
        print(f" Successful: {successful}")
        print(f" Failed: {failed}")
        print(f"Progress: {_format_ratio(cache_entries, needs_geocoding)}")

        # Estimate time remaining; clamp at zero because the cache may hold
        # more entries than there are records still needing geocoding.
        remaining = max(needs_geocoding - cache_entries, 0)
        est_minutes = remaining * seconds_per_lookup / 60
        print(f"Estimated time left: ~{est_minutes:.1f} minutes")
    else:
        print("Cache file not found - geocoding not started")
    print()

    # Check if output exists.
    if OUTPUT_FILE.exists():
        print(f"✓ Output file exists: {OUTPUT_FILE}")
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            # An empty or partially written output file loads as None.
            output_records = yaml.safe_load(f) or []

        total_with_cities = sum(1 for r in output_records if _has_city(r))
        print(f" Records with cities: {_format_ratio(total_with_cities, len(output_records))}")
    else:
        print("✗ Output file not yet created")
    print()
# Run the progress report only when executed as a script, not on import.
if __name__ == "__main__":
    main()