73 lines
2.5 KiB
Python
73 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Monitor geocoding progress."""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# YAML cache loaded by main() when it exists; entries whose value is None
# are counted there as failed lookups. NOTE(review): absolute, user-specific path.
CACHE_FILE = Path("/Users/kempersc/apps/glam/data/cache/geocoding_cache.yaml")
|
|
# YAML file of input records; main() counts the records that have no
# location with a 'city' value as still needing geocoding.
INPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")
|
|
# YAML output file; when it exists, main() reports how many of its records
# have at least one location with a 'city' value.
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_geocoded_v3.yaml")
|
|
|
|
def _has_city(record: dict) -> bool:
    """Return True when *record* has at least one location with a truthy 'city'."""
    locations = record.get('locations')
    return bool(locations) and any(loc.get('city') for loc in locations)


def _percent(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def main() -> None:
    """Print a snapshot of geocoding progress to stdout.

    Reads INPUT_FILE to count the records that have no location with a city,
    CACHE_FILE (when present) to count attempted, successful and failed
    lookups and to estimate the time left, and OUTPUT_FILE (when present) to
    report how many output records have a city.

    Empty YAML documents are treated as empty collections, and a percentage
    with a zero denominator is shown as 0.0% instead of raising
    ZeroDivisionError.

    Raises:
        OSError: if INPUT_FILE cannot be opened (it is not checked for
            existence, unlike the cache and output files).
    """
    print("=" * 70)
    print(f"Geocoding Progress Check - {datetime.now().strftime('%H:%M:%S')}")
    print("=" * 70)
    print()

    # Load input to get total count. safe_load() returns None for an empty
    # document, so fall back to an empty list.
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f) or []

    total_records = len(records)
    needs_geocoding = sum(1 for r in records if not _has_city(r))

    print(f"Total records: {total_records}")
    print(f"Need geocoding: {needs_geocoding}")
    print()

    # Check cache
    if CACHE_FILE.exists():
        with open(CACHE_FILE, 'r', encoding='utf-8') as f:
            cache = yaml.safe_load(f) or {}

        cache_entries = len(cache)
        # A cached value of None marks a lookup that was attempted but failed.
        successful = sum(1 for v in cache.values() if v is not None)
        failed = cache_entries - successful

        print(f"Cache entries: {cache_entries}")
        print(f" Successful: {successful}")
        print(f" Failed: {failed}")
        progress_pct = _percent(cache_entries, needs_geocoding)
        print(f"Progress: {cache_entries}/{needs_geocoding} ({progress_pct:.1f}%)")

        # Estimate time remaining. The cache may hold more entries than are
        # currently needed, so clamp at zero to avoid a negative estimate.
        remaining = max(needs_geocoding - cache_entries, 0)
        # NOTE(review): 1.1 is presumably seconds per lookup - confirm against
        # the geocoding script's rate limit.
        seconds_per_lookup = 1.1
        est_minutes = remaining * seconds_per_lookup / 60
        print(f"Estimated time left: ~{est_minutes:.1f} minutes")
    else:
        print("Cache file not found - geocoding not started")

    print()

    # Check if output exists
    if OUTPUT_FILE.exists():
        print(f"✓ Output file exists: {OUTPUT_FILE}")
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            # The file may exist but still be empty while it is being written.
            output_records = yaml.safe_load(f) or []

        output_total = len(output_records)
        total_with_cities = sum(1 for r in output_records if _has_city(r))
        cities_pct = _percent(total_with_cities, output_total)

        print(f" Records with cities: {total_with_cities}/{output_total} ({cities_pct:.1f}%)")
    else:
        print("✗ Output file not yet created")

    print()
|
|
|
|
# Script entry point: run the progress report only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|