glam/scripts/monitor_enrichment.py
2025-11-19 23:25:22 +01:00

71 lines
2.6 KiB
Python

#!/usr/bin/env python3
"""
Monitor progress of the Wikidata enrichment script.
Shows number of entities cached and estimated completion.
"""
import json
import os
import subprocess
from datetime import datetime, timezone
from pathlib import Path
# JSON register that monitor() loads to count the entities cached so far.
REGISTER_FILE = Path("data/wikidata/GLAMORCUBEPSXHFN/.fetch_register.json")

# Expected number of entities, summed per category:
# hypernym (1662) + entity (214) + entity_list (35) + collection (59).
TOTAL_ENTITIES = 1662 + 214 + 35 + 59
def _is_enrichment_running(pattern="enrich_hyponyms_with_wikidata"):
    """Return True if ``pgrep -f`` finds a process matching *pattern*."""
    try:
        # pgrep is invoked directly (argv list, no shell): an intermediate
        # ``sh -c`` process would carry the pattern in its own command line
        # and could itself be matched by ``pgrep -f``.
        result = subprocess.run(
            ["pgrep", "-f", pattern],
            stdout=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # pgrep could not be started (e.g. not installed); report the
        # process as not running instead of crashing the monitor.
        return False
    return result.returncode == 0


def _render_bar(progress_pct, width=50):
    """Return a text bar of exactly *width* cells for *progress_pct* percent."""
    filled = int(width * progress_pct / 100)
    # Clamp so percentages outside 0..100 cannot change the bar's length.
    filled = max(0, min(width, filled))
    return "█" * filled + "░" * (width - filled)


def _english_label(entry):
    """Return ``entry['data']['labels']['en']['value']`` or a fallback text."""
    fallback = "No English label"
    node = entry
    for key in ("data", "labels", "en"):
        # Any level may be missing or null; fall back rather than raise.
        if not isinstance(node, dict):
            return fallback
        node = node.get(key)
    if not isinstance(node, dict):
        return fallback
    return node.get("value", fallback)


def monitor(register_file=None, total_entities=None):
    """Print a progress report for the Wikidata enrichment run.

    Loads the JSON fetch register, then prints whether the enrichment
    process appears to be running, the number of cached entities, the
    total fetch count, a time estimate, a progress bar and up to five
    sample cached entities.

    Args:
        register_file: Path of the register JSON file. Defaults to the
            module-level ``REGISTER_FILE``.
        total_entities: Expected total number of entities. Defaults to the
            module-level ``TOTAL_ENTITIES``.

    Returns:
        None. All output goes to stdout; a missing or unreadable register
        is reported with a message rather than an exception.
    """
    if register_file is None:
        register_file = REGISTER_FILE
    else:
        register_file = Path(register_file)
    if total_entities is None:
        total_entities = TOTAL_ENTITIES

    rule = "=" * 70
    print(rule)
    print("📊 WIKIDATA ENRICHMENT PROGRESS MONITOR")
    print(rule)

    # Load register. The file may be rewritten while we read it, so a
    # missing or half-written file is reported instead of raising.
    try:
        with open(register_file, "r", encoding="utf-8") as f:
            register = json.load(f)
    except FileNotFoundError:
        print("❌ Register file not found")
        return
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"❌ Could not read register file: {err}")
        return
    if not isinstance(register, dict):
        print("❌ Could not read register file: unexpected JSON structure")
        return

    entities = register.get("entities") or {}
    entities_cached = len(entities)
    total_fetches = register.get("total_fetches", 0)
    last_updated = register.get("last_updated", "Unknown")

    # Calculate progress
    if total_entities > 0:
        progress_pct = entities_cached / total_entities * 100
    else:
        progress_pct = 0
    # Never report a negative amount of remaining work.
    remaining = max(0, total_entities - entities_cached)

    status = "Running" if _is_enrichment_running() else "Completed/Stopped"

    def row(label, value):
        # Every report line shares the same 25-column label layout.
        return f"{label:<25s}: {value}"

    print("\n" + row("Status", status))
    print(row("Last Updated", last_updated))
    print("\n" + row("Total Entities to Process", f"{total_entities:,}"))
    print(row("Entities Cached", f"{entities_cached:,}"))
    print(row("Total API Fetches", f"{total_fetches:,}"))
    print(row("Progress", f"{progress_pct:.1f}% ({entities_cached}/{total_entities})"))
    print(row("Remaining", f"{remaining:,} entities"))

    # Estimate time
    if entities_cached > 0:
        # Assumed fetch rate in requests per second; per the original note
        # this mirrors the enrichment script's setting (not verifiable here).
        rate = 10
        est_minutes = remaining / rate / 60
        print(row("Estimated Time Remaining", f"{est_minutes:.1f} minutes"))

    # Progress bar
    print(f"\n[{_render_bar(progress_pct)}] {progress_pct:.1f}%")

    # Show sample cached entities
    if entities_cached > 0:
        print(f"\n{'Sample Cached Entities':<25s}:")
        for qid, entry in list(entities.items())[:5]:
            print(f"{qid}: {_english_label(entry)}")

    print("\n" + rule)
    print("💡 Tip: Run this script again to see updated progress")
    print(rule)
# Script entry point: print the progress report only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    monitor()