189 lines
6.6 KiB
Python
189 lines
6.6 KiB
Python
"""
|
|
Quick Statistics for Wikidata Validation Progress
|
|
|
|
Reads the review CSV and generates progress statistics.
|
|
Run this anytime during manual review to check progress.
|
|
"""
|
|
|
|
import csv
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
|
|
|
|
def analyze_review_progress(csv_path: Path):
    """Analyze validation progress from the review CSV.

    Args:
        csv_path: Path to the fuzzy-match review CSV. Expected columns:
            'priority' (int), 'validation_status', 'institution_type',
            'match_score' (float).

    Returns:
        A stats dict with totals, per-status/priority/type counters and
        per-verdict score lists, or None when the CSV does not exist.
    """
    if not csv_path.exists():
        print(f"❌ CSV not found: {csv_path}")
        print("Run: python scripts/generate_wikidata_review_report.py")
        return None

    stats = {
        'total': 0,
        'reviewed': 0,
        'not_reviewed': 0,
        'by_status': Counter(),
        'by_priority': {i: {'total': 0, 'reviewed': 0} for i in range(1, 6)},
        'by_type': Counter(),
        'avg_score_correct': [],
        'avg_score_incorrect': []
    }

    # newline='' is the documented way to open files for the csv module.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)

        for row in reader:
            stats['total'] += 1

            priority = int(row['priority'])
            status = row['validation_status'].strip().upper()
            inst_type = row['institution_type']
            score = float(row['match_score'])

            # Tolerate priorities outside the pre-seeded 1-5 range instead
            # of raising KeyError on unexpected CSV data.
            bucket = stats['by_priority'].setdefault(
                priority, {'total': 0, 'reviewed': 0})
            bucket['total'] += 1
            stats['by_type'][inst_type] += 1

            if status in ('CORRECT', 'INCORRECT', 'UNCERTAIN'):
                stats['reviewed'] += 1
                stats['by_status'][status] += 1
                bucket['reviewed'] += 1

                # Collect scores per verdict for the average-score report.
                if status == 'CORRECT':
                    stats['avg_score_correct'].append(score)
                elif status == 'INCORRECT':
                    stats['avg_score_incorrect'].append(score)
            else:
                # Blank or unrecognized status counts as not yet reviewed.
                stats['not_reviewed'] += 1

    return stats
|
|
|
|
|
|
def print_statistics(stats):
    """Print a formatted progress report from analyze_review_progress output.

    Args:
        stats: Dict with keys 'total', 'reviewed', 'not_reviewed',
            'by_status' (Counter), 'by_priority' (dict of dicts),
            'by_type' (Counter), 'avg_score_correct', 'avg_score_incorrect'.
    """
    print("=" * 70)
    print("Wikidata Validation Progress Statistics")
    print("=" * 70)

    # Overall progress
    total = stats['total']
    reviewed = stats['reviewed']
    pct_reviewed = (reviewed / total * 100) if total > 0 else 0

    print(f"\n📊 Overall Progress")
    print(f"   Total fuzzy matches: {total}")
    print(f"   Reviewed: {reviewed} ({pct_reviewed:.1f}%)")
    print(f"   Not reviewed: {stats['not_reviewed']} ({100-pct_reviewed:.1f}%)")

    # Progress bar — guard against ZeroDivisionError on an empty CSV,
    # matching the pct_reviewed guard above.
    bar_length = 40
    filled = int(bar_length * reviewed / total) if total > 0 else 0
    bar = '█' * filled + '░' * (bar_length - filled)
    print(f"   [{bar}] {pct_reviewed:.1f}%")

    # Review status breakdown
    if reviewed > 0:
        print(f"\n✅ Review Status Breakdown")
        for status, count in stats['by_status'].most_common():
            pct = count / reviewed * 100
            print(f"   {status:12s}: {count:3d} ({pct:5.1f}% of reviewed)")

    # Average scores (lists are only populated for reviewed rows)
    if stats['avg_score_correct']:
        avg_correct = sum(stats['avg_score_correct']) / len(stats['avg_score_correct'])
        print(f"\n   Average score (CORRECT): {avg_correct:.1f}%")

    if stats['avg_score_incorrect']:
        avg_incorrect = sum(stats['avg_score_incorrect']) / len(stats['avg_score_incorrect'])
        print(f"   Average score (INCORRECT): {avg_incorrect:.1f}%")

    # Priority breakdown
    print(f"\n🎯 Progress by Priority")
    print(f"   {'Priority':<12} {'Total':>6} {'Reviewed':>9} {'%':>7} {'Status'}")
    print(f"   {'-'*50}")

    for priority in sorted(stats['by_priority'].keys()):
        data = stats['by_priority'][priority]
        total_p = data['total']
        reviewed_p = data['reviewed']
        pct_p = (reviewed_p / total_p * 100) if total_p > 0 else 0

        # Status indicator
        if pct_p == 100:
            status = "✅ Complete"
        elif pct_p >= 50:
            status = "🟡 In Progress"
        elif pct_p > 0:
            status = "🟠 Started"
        else:
            status = "⬜ Not Started"

        print(f"   Priority {priority:<3} {total_p:6d} {reviewed_p:9d} {pct_p:6.1f}% {status}")

    # Institution type breakdown (loop is empty when total == 0, so the
    # division inside is safe)
    print(f"\n📚 By Institution Type")
    for inst_type, count in stats['by_type'].most_common():
        pct = count / total * 100
        print(f"   {inst_type:12s}: {count:3d} ({pct:5.1f}%)")

    # Recommendations
    print(f"\n💡 Recommendations")

    if stats['not_reviewed'] == 0:
        print("   🎉 All matches reviewed! Ready to apply validation.")
        print("   Run: python scripts/apply_wikidata_validation.py")
    else:
        # Find the lowest-numbered priority that is not fully reviewed;
        # sorted() keeps this deterministic even if extra priorities were
        # added to the dict out of order.
        not_complete = [
            (p, data) for p, data in sorted(stats['by_priority'].items())
            if data['reviewed'] < data['total']
        ]

        if not_complete:
            next_priority, next_data = not_complete[0]
            remaining = next_data['total'] - next_data['reviewed']
            print(f"   ➡️ Next focus: Priority {next_priority} ({remaining} matches remaining)")

            # Time estimate (2-3 minutes per match for P1-2, 1-2 for P3-5)
            if next_priority <= 2:
                est_time = remaining * 2.5 / 60  # hours
            else:
                est_time = remaining * 1.5 / 60  # hours
            print(f"   ⏱️ Estimated time: {est_time:.1f} hours")

    # Quality warnings
    if reviewed > 20:  # Only check quality if significant sample
        incorrect_pct = stats['by_status'].get('INCORRECT', 0) / reviewed * 100
        uncertain_pct = stats['by_status'].get('UNCERTAIN', 0) / reviewed * 100

        print(f"\n⚠️ Quality Indicators")

        if incorrect_pct > 20:
            print(f"   🔴 HIGH INCORRECT RATE: {incorrect_pct:.1f}% (>20% threshold)")
            print(f"      This may indicate algorithm issues. Review methodology.")
        elif incorrect_pct > 10:
            print(f"   🟡 ELEVATED INCORRECT RATE: {incorrect_pct:.1f}% (normal: 5-10%)")
        else:
            print(f"   ✅ Incorrect rate within normal range: {incorrect_pct:.1f}%")

        if uncertain_pct > 10:
            print(f"   🟡 HIGH UNCERTAIN RATE: {uncertain_pct:.1f}% (>10%)")
            print(f"      Consider requesting expert review for ambiguous cases.")
        else:
            print(f"   ✅ Uncertain rate within normal range: {uncertain_pct:.1f}%")

    print("\n" + "=" * 70)
|
|
|
|
|
|
def main():
    """Entry point: analyze the Denmark review CSV and report progress."""
    review_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')

    print("Loading validation progress...\n")
    progress = analyze_review_progress(review_csv)

    # analyze_review_progress returns a falsy value when the CSV is missing;
    # skip the report in that case.
    if progress:
        print_statistics(progress)


if __name__ == '__main__':
    main()
|