#!/usr/bin/env python3
"""
Monitor progress of the Wikidata enrichment script.

Shows the number of entities cached and the estimated time to completion.
"""

import json
import os
import subprocess
from datetime import datetime, timezone
from itertools import islice
from pathlib import Path


# JSON register that monitor() reads; the path is relative to the current
# working directory.
REGISTER_FILE = Path("data/wikidata/GLAMORCUBEPSXHFN/.fetch_register.json")

# Expected number of entities to process.
# From count: hypernym(1662) + entity(214) + entity_list(35) + collection(59)
TOTAL_ENTITIES = 1970


def _enrichment_running(pattern="enrich_hyponyms_with_wikidata"):
    """Return True when ``pgrep -f`` finds a process matching *pattern*.

    pgrep is invoked directly rather than through a shell: with ``sh -c`` the
    shell's own command line contains the pattern, so ``pgrep -f`` can match
    the wrapper shell and report a stopped job as running.
    """
    try:
        result = subprocess.run(
            ["pgrep", "-f", pattern],
            stdout=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # pgrep could not be started; treat it like a non-zero exit status.
        return False
    return result.returncode == 0


def _english_label(entry):
    """Return the English label stored under data/labels/en/value of *entry*.

    Falls back to 'No English label' when any level is missing or not a dict.
    """
    node = entry
    for key in ("data", "labels", "en"):
        node = node.get(key) if isinstance(node, dict) else None
    if isinstance(node, dict):
        return node.get("value", "No English label")
    return "No English label"


def monitor(register_file=None, total_entities=None):
    """Print a progress report for the Wikidata enrichment run.

    Reads the JSON fetch register, then prints the process status, cache
    counters, a time estimate, a progress bar and up to five sample entities.

    Args:
        register_file: Path of the register to read. Defaults to the
            module-level ``REGISTER_FILE``.
        total_entities: Expected number of entities. Defaults to the
            module-level ``TOTAL_ENTITIES``.

    Returns:
        None. All output goes to stdout; a missing or unreadable register is
        reported with a message instead of an exception.
    """
    register_path = Path(REGISTER_FILE if register_file is None else register_file)
    expected_total = TOTAL_ENTITIES if total_entities is None else total_entities

    print("=" * 70)
    print("📊 WIKIDATA ENRICHMENT PROGRESS MONITOR")
    print("=" * 70)

    # Load register
    try:
        with open(register_path, "r", encoding="utf-8") as f:
            register = json.load(f)
    except FileNotFoundError:
        print("❌ Register file not found")
        return
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, e.g.
        # when the file is truncated or not valid JSON.
        print(f"❌ Register file could not be read: {exc}")
        return

    if not isinstance(register, dict):
        print("❌ Register file could not be read: unexpected JSON structure")
        return

    entities = register.get("entities")
    if not isinstance(entities, dict):
        entities = {}

    entities_cached = len(entities)
    total_fetches = register.get("total_fetches", 0)
    last_updated = register.get("last_updated", "Unknown")

    # Calculate progress
    progress_pct = (entities_cached / expected_total * 100) if expected_total > 0 else 0
    # The expected total is a fixed estimate; never report a negative remainder.
    remaining = max(expected_total - entities_cached, 0)

    status = "Running" if _enrichment_running() else "Completed/Stopped"
    print(f"\n{'Status':<25s}: {status}")
    print(f"{'Last Updated':<25s}: {last_updated}")
    print(f"\n{'Total Entities to Process':<25s}: {expected_total:,}")
    print(f"{'Entities Cached':<25s}: {entities_cached:,}")
    print(f"{'Total API Fetches':<25s}: {total_fetches:,}")
    print(f"{'Progress':<25s}: {progress_pct:.1f}% ({entities_cached}/{expected_total})")
    print(f"{'Remaining':<25s}: {remaining:,} entities")

    # Estimate time
    if entities_cached > 0:
        rate = 10  # requests per second (configured in script)
        est_seconds = remaining / rate
        est_minutes = est_seconds / 60
        print(f"{'Estimated Time Remaining':<25s}: {est_minutes:.1f} minutes")

    # Progress bar (fill is clamped so the bar always has exactly bar_width cells)
    bar_width = 50
    filled = min(bar_width, max(0, int(bar_width * progress_pct / 100)))
    bar = "█" * filled + "░" * (bar_width - filled)
    print(f"\n[{bar}] {progress_pct:.1f}%")

    # Show sample cached entities
    if entities_cached > 0:
        print(f"\n{'Sample Cached Entities':<25s}:")
        for qid, entry in islice(entities.items(), 5):
            print(f" • {qid}: {_english_label(entry)}")

    print("\n" + "=" * 70)
    print("💡 Tip: Run this script again to see updated progress")
    print("=" * 70)


if __name__ == "__main__":
    # Print the progress report when the file is executed as a script.
    monitor()