glam/scripts/check_validation_progress.py
2025-11-19 23:25:22 +01:00

189 lines
6.6 KiB
Python

"""
Quick Statistics for Wikidata Validation Progress
Reads the review CSV and generates progress statistics.
Run this anytime during manual review to check progress.
"""
import csv
from pathlib import Path
from collections import Counter
def analyze_review_progress(csv_path: Path):
    """Analyze validation progress from the fuzzy-match review CSV.

    Args:
        csv_path: Path to the review CSV. Expected columns: ``priority``
            (int), ``validation_status`` (CORRECT / INCORRECT / UNCERTAIN
            or blank), ``institution_type``, ``match_score`` (float).

    Returns:
        A stats dict with totals, per-status / per-priority / per-type
        counters and score lists, or ``None`` when the CSV is missing.
    """
    if not csv_path.exists():
        print(f"❌ CSV not found: {csv_path}")
        print("Run: python scripts/generate_wikidata_review_report.py")
        # Explicit None (was a bare `return`) so callers' `if stats:` check
        # reads unambiguously.
        return None
    stats = {
        'total': 0,
        'reviewed': 0,
        'not_reviewed': 0,
        'by_status': Counter(),
        'by_priority': {i: {'total': 0, 'reviewed': 0} for i in range(1, 6)},
        'by_type': Counter(),
        'avg_score_correct': [],
        'avg_score_incorrect': []
    }
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            stats['total'] += 1
            priority = int(row['priority'])
            status = row['validation_status'].strip().upper()
            inst_type = row['institution_type']
            score = float(row['match_score'])
            # Tolerate priorities outside the expected 1-5 range instead of
            # crashing with KeyError on unexpected data.
            bucket = stats['by_priority'].setdefault(
                priority, {'total': 0, 'reviewed': 0})
            bucket['total'] += 1
            stats['by_type'][inst_type] += 1
            if status in ('CORRECT', 'INCORRECT', 'UNCERTAIN'):
                stats['reviewed'] += 1
                stats['by_status'][status] += 1
                bucket['reviewed'] += 1
                if status == 'CORRECT':
                    stats['avg_score_correct'].append(score)
                elif status == 'INCORRECT':
                    stats['avg_score_incorrect'].append(score)
            else:
                # Blank or unrecognized status counts as not yet reviewed.
                stats['not_reviewed'] += 1
    return stats
def print_statistics(stats):
    """Print formatted progress statistics from analyze_review_progress.

    Args:
        stats: Dict with keys ``total``, ``reviewed``, ``not_reviewed``,
            ``by_status`` (Counter), ``by_priority`` (dict of
            {'total', 'reviewed'}), ``by_type`` (Counter),
            ``avg_score_correct`` / ``avg_score_incorrect`` (score lists).
    """
    print("=" * 70)
    print("Wikidata Validation Progress Statistics")
    print("=" * 70)
    # Overall progress
    total = stats['total']
    reviewed = stats['reviewed']
    pct_reviewed = (reviewed / total * 100) if total > 0 else 0
    print(f"\n📊 Overall Progress")
    print(f" Total fuzzy matches: {total}")
    print(f" Reviewed: {reviewed} ({pct_reviewed:.1f}%)")
    print(f" Not reviewed: {stats['not_reviewed']} ({100-pct_reviewed:.1f}%)")
    # Progress bar. BUG FIX: the fill/empty glyphs had been lost
    # ('' * filled always rendered an empty bar), and the division by
    # `total` was unguarded — an empty CSV raised ZeroDivisionError.
    bar_length = 40
    filled = int(bar_length * reviewed / total) if total > 0 else 0
    bar = '█' * filled + '░' * (bar_length - filled)
    print(f" [{bar}] {pct_reviewed:.1f}%")
    # Review status breakdown (only meaningful once something is reviewed)
    if reviewed > 0:
        print(f"\n✅ Review Status Breakdown")
        for status, count in stats['by_status'].most_common():
            pct = count / reviewed * 100
            print(f" {status:12s}: {count:3d} ({pct:5.1f}% of reviewed)")
    # Average scores — guards double as "reviewed > 0" checks, since the
    # score lists are only populated for reviewed rows.
    if stats['avg_score_correct']:
        avg_correct = sum(stats['avg_score_correct']) / len(stats['avg_score_correct'])
        print(f"\n Average score (CORRECT): {avg_correct:.1f}%")
    if stats['avg_score_incorrect']:
        avg_incorrect = sum(stats['avg_score_incorrect']) / len(stats['avg_score_incorrect'])
        print(f" Average score (INCORRECT): {avg_incorrect:.1f}%")
    # Priority breakdown
    print(f"\n🎯 Progress by Priority")
    print(f" {'Priority':<12} {'Total':>6} {'Reviewed':>9} {'%':>7} {'Status'}")
    print(f" {'-'*50}")
    for priority in sorted(stats['by_priority'].keys()):
        data = stats['by_priority'][priority]
        total_p = data['total']
        reviewed_p = data['reviewed']
        pct_p = (reviewed_p / total_p * 100) if total_p > 0 else 0
        # Status indicator
        if pct_p == 100:
            status = "✅ Complete"
        elif pct_p >= 50:
            status = "🟡 In Progress"
        elif pct_p > 0:
            status = "🟠 Started"
        else:
            status = "⬜ Not Started"
        print(f" Priority {priority:<3} {total_p:6d} {reviewed_p:9d} {pct_p:6.1f}% {status}")
    # Institution type breakdown (empty Counter when total == 0, so the
    # division inside the loop is safe).
    print(f"\n📚 By Institution Type")
    for inst_type, count in stats['by_type'].most_common():
        pct = count / total * 100
        print(f" {inst_type:12s}: {count:3d} ({pct:5.1f}%)")
    # Recommendations
    print(f"\n💡 Recommendations")
    if stats['not_reviewed'] == 0:
        print(" 🎉 All matches reviewed! Ready to apply validation.")
        print(" Run: python scripts/apply_wikidata_validation.py")
    else:
        # Find the first priority bucket that still has unreviewed matches
        # (insertion order is 1..5, so lowest priority number wins).
        not_complete = [
            (p, data) for p, data in stats['by_priority'].items()
            if data['reviewed'] < data['total']
        ]
        if not_complete:
            next_priority, next_data = not_complete[0]
            remaining = next_data['total'] - next_data['reviewed']
            print(f" ➡️ Next focus: Priority {next_priority} ({remaining} matches remaining)")
            # Time estimate (2-3 minutes per match for P1-2, 1-2 for P3-5)
            if next_priority <= 2:
                est_time = remaining * 2.5 / 60  # hours
            else:
                est_time = remaining * 1.5 / 60  # hours
            print(f" ⏱️ Estimated time: {est_time:.1f} hours")
    # Quality warnings — only with a significant sample size.
    if reviewed > 20:
        incorrect_pct = stats['by_status'].get('INCORRECT', 0) / reviewed * 100
        uncertain_pct = stats['by_status'].get('UNCERTAIN', 0) / reviewed * 100
        print(f"\n⚠️ Quality Indicators")
        if incorrect_pct > 20:
            print(f" 🔴 HIGH INCORRECT RATE: {incorrect_pct:.1f}% (>20% threshold)")
            print(f" This may indicate algorithm issues. Review methodology.")
        elif incorrect_pct > 10:
            print(f" 🟡 ELEVATED INCORRECT RATE: {incorrect_pct:.1f}% (normal: 5-10%)")
        else:
            print(f" ✅ Incorrect rate within normal range: {incorrect_pct:.1f}%")
        if uncertain_pct > 10:
            print(f" 🟡 HIGH UNCERTAIN RATE: {uncertain_pct:.1f}% (>10%)")
            print(f" Consider requesting expert review for ambiguous cases.")
        else:
            print(f" ✅ Uncertain rate within normal range: {uncertain_pct:.1f}%")
    print("\n" + "=" * 70)
def main():
    """Entry point: load review progress from the CSV and display it."""
    report_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    print("Loading validation progress...\n")
    progress = analyze_review_progress(report_csv)
    if progress:
        print_statistics(progress)


if __name__ == '__main__':
    main()