#!/usr/bin/env python3
"""Monitor geocoding progress.

Reads the curated input file, the geocoding cache and (if present) the
geocoded output file, and prints a short progress report to stdout.
"""

import yaml
from pathlib import Path
from datetime import datetime

CACHE_FILE = Path("/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml")
INPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml")

# Seconds budgeted per outstanding lookup when estimating the time left.
# NOTE(review): presumably the geocoder's rate limit plus overhead - confirm.
SECONDS_PER_LOOKUP = 1.1


def _load_yaml(path):
    """Return the parsed YAML document at *path* (None for an empty file)."""
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _has_city(record) -> bool:
    """Return True if any entry in the record's 'locations' has a truthy 'city'."""
    locations = record.get('locations')
    return bool(locations) and any(loc.get('city') for loc in locations)


def _percent(part, whole) -> str:
    """Format part/whole as a one-decimal percentage, or 'n/a' when whole is 0."""
    if not whole:
        # Nothing to measure against; avoid ZeroDivisionError.
        return "n/a"
    return f"{part/whole*100:.1f}%"


def main() -> None:
    """Print a geocoding progress report.

    Reports the number of input records and how many lack a city, the
    size and success/failure split of the cache (if the cache file exists)
    with a rough time estimate, and the share of output records that have
    a city (if the output file exists).

    Raises:
        FileNotFoundError: if INPUT_FILE does not exist.
    """
    print("=" * 70)
    print(f"Geocoding Progress Check - {datetime.now().strftime('%H:%M:%S')}")
    print("=" * 70)
    print()

    # Load input to get total count. safe_load yields None for an empty
    # file, so fall back to an empty list.
    records = _load_yaml(INPUT_FILE) or []

    total_records = len(records)
    needs_geocoding = sum(1 for r in records if not _has_city(r))

    print(f"Total records: {total_records}")
    print(f"Need geocoding: {needs_geocoding}")
    print()

    # Check cache
    if CACHE_FILE.exists():
        cache = _load_yaml(CACHE_FILE) or {}

        cache_entries = len(cache)
        # A cache value of None marks a lookup that produced no result.
        successful = sum(1 for v in cache.values() if v is not None)
        failed = cache_entries - successful

        print(f"Cache entries: {cache_entries}")
        print(f" Successful: {successful}")
        print(f" Failed: {failed}")
        print(f"Progress: {cache_entries}/{needs_geocoding} ({_percent(cache_entries, needs_geocoding)})")

        # Estimate time remaining. The cache may hold more entries than
        # are currently needed, so never report a negative amount.
        remaining = max(needs_geocoding - cache_entries, 0)
        est_minutes = remaining * SECONDS_PER_LOOKUP / 60
        print(f"Estimated time left: ~{est_minutes:.1f} minutes")
    else:
        print("Cache file not found - geocoding not started")

    print()

    # Check if output exists
    if OUTPUT_FILE.exists():
        print(f"✓ Output file exists: {OUTPUT_FILE}")
        # The output may be empty while it is still being written.
        output_records = _load_yaml(OUTPUT_FILE) or []

        total_with_cities = sum(1 for r in output_records if _has_city(r))
        print(f" Records with cities: {total_with_cities}/{len(output_records)} ({_percent(total_with_cities, len(output_records))})")
    else:
        print("✗ Output file not yet created")

    print()


if __name__ == "__main__":
    main()