"""
Quick Statistics for Wikidata Validation Progress

Reads the review CSV and generates progress statistics.
Run this anytime during manual review to check progress.
"""

import csv
from pathlib import Path
from collections import Counter


def analyze_review_progress(csv_path: Path):
    """Tally validation progress from the review CSV.

    Args:
        csv_path: Path to the fuzzy-match review CSV. Expected columns:
            ``priority`` (int), ``validation_status`` (free text; compared
            case-insensitively against CORRECT/INCORRECT/UNCERTAIN),
            ``institution_type``, ``match_score`` (float).

    Returns:
        A stats dict with totals, per-status/priority/type counters and
        per-outcome score lists, or ``None`` when the CSV does not exist.
    """
    if not csv_path.exists():
        print(f"āŒ CSV not found: {csv_path}")
        print("Run: python scripts/generate_wikidata_review_report.py")
        return

    stats = {
        'total': 0,
        'reviewed': 0,
        'not_reviewed': 0,
        'by_status': Counter(),
        'by_priority': {i: {'total': 0, 'reviewed': 0} for i in range(1, 6)},
        'by_type': Counter(),
        'avg_score_correct': [],
        'avg_score_incorrect': []
    }

    # newline='' is required by the csv module so quoted embedded
    # newlines inside fields are parsed correctly.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            stats['total'] += 1
            priority = int(row['priority'])
            status = row['validation_status'].strip().upper()
            inst_type = row['institution_type']
            score = float(row['match_score'])

            # Tolerate priorities outside the expected 1-5 range instead
            # of raising KeyError on unexpected data.
            bucket = stats['by_priority'].setdefault(
                priority, {'total': 0, 'reviewed': 0})
            bucket['total'] += 1
            stats['by_type'][inst_type] += 1

            if status in ('CORRECT', 'INCORRECT', 'UNCERTAIN'):
                stats['reviewed'] += 1
                stats['by_status'][status] += 1
                bucket['reviewed'] += 1
                if status == 'CORRECT':
                    stats['avg_score_correct'].append(score)
                elif status == 'INCORRECT':
                    stats['avg_score_incorrect'].append(score)
            else:
                stats['not_reviewed'] += 1

    return stats


def print_statistics(stats):
    """Print formatted statistics produced by analyze_review_progress."""
    print("=" * 70)
    print("Wikidata Validation Progress Statistics")
    print("=" * 70)

    # Overall progress
    total = stats['total']
    reviewed = stats['reviewed']
    pct_reviewed = (reviewed / total * 100) if total > 0 else 0

    print(f"\nšŸ“Š Overall Progress")
    print(f"   Total fuzzy matches: {total}")
    print(f"   Reviewed: {reviewed} ({pct_reviewed:.1f}%)")
    print(f"   Not reviewed: {stats['not_reviewed']} ({100-pct_reviewed:.1f}%)")

    # Progress bar — guard the division so an empty CSV (total == 0)
    # does not raise ZeroDivisionError.
    bar_length = 40
    filled = int(bar_length * reviewed / total) if total > 0 else 0
    bar = 'ā–ˆ' * filled + 'ā–‘' * (bar_length - filled)
    print(f"   [{bar}] {pct_reviewed:.1f}%")

    # Review status breakdown
    if reviewed > 0:
        print(f"\nāœ… Review Status Breakdown")
        for status, count in stats['by_status'].most_common():
            pct = count / reviewed * 100
            print(f"   {status:12s}: {count:3d} ({pct:5.1f}% of reviewed)")

        # Average scores per outcome (only printed when data exists)
        if stats['avg_score_correct']:
            avg_correct = sum(stats['avg_score_correct']) / len(stats['avg_score_correct'])
            print(f"\n   Average score (CORRECT): {avg_correct:.1f}%")
        if stats['avg_score_incorrect']:
            avg_incorrect = sum(stats['avg_score_incorrect']) / len(stats['avg_score_incorrect'])
            print(f"   Average score (INCORRECT): {avg_incorrect:.1f}%")

    # Priority breakdown
    print(f"\nšŸŽÆ Progress by Priority")
    print(f"   {'Priority':<12} {'Total':>6} {'Reviewed':>9} {'%':>7} {'Status'}")
    print(f"   {'-'*50}")
    for priority in sorted(stats['by_priority'].keys()):
        data = stats['by_priority'][priority]
        total_p = data['total']
        reviewed_p = data['reviewed']
        pct_p = (reviewed_p / total_p * 100) if total_p > 0 else 0

        # Status indicator
        if pct_p == 100:
            status = "āœ… Complete"
        elif pct_p >= 50:
            status = "🟔 In Progress"
        elif pct_p > 0:
            status = "🟠 Started"
        else:
            status = "⬜ Not Started"

        print(f"   Priority {priority:<3} {total_p:6d} {reviewed_p:9d} {pct_p:6.1f}% {status}")

    # Institution type breakdown
    print(f"\nšŸ“š By Institution Type")
    for inst_type, count in stats['by_type'].most_common():
        pct = count / total * 100
        print(f"   {inst_type:12s}: {count:3d} ({pct:5.1f}%)")

    # Recommendations
    print(f"\nšŸ’” Recommendations")
    if stats['not_reviewed'] == 0:
        print("   šŸŽ‰ All matches reviewed! Ready to apply validation.")
        print("   Run: python scripts/apply_wikidata_validation.py")
    else:
        # Find the lowest-numbered priority that still has unreviewed
        # matches; sorted() makes the pick deterministic even if extra
        # priority keys were added out of order.
        not_complete = [
            (p, data) for p, data in sorted(stats['by_priority'].items())
            if data['reviewed'] < data['total']
        ]
        if not_complete:
            next_priority, next_data = not_complete[0]
            remaining = next_data['total'] - next_data['reviewed']
            print(f"   āž”ļø Next focus: Priority {next_priority} ({remaining} matches remaining)")

            # Time estimate (2-3 minutes per match for P1-2, 1-2 for P3-5)
            if next_priority <= 2:
                est_time = remaining * 2.5 / 60  # hours
            else:
                est_time = remaining * 1.5 / 60  # hours
            print(f"   ā±ļø Estimated time: {est_time:.1f} hours")

    # Quality warnings — only meaningful on a significant sample
    if reviewed > 20:
        incorrect_pct = stats['by_status'].get('INCORRECT', 0) / reviewed * 100
        uncertain_pct = stats['by_status'].get('UNCERTAIN', 0) / reviewed * 100

        print(f"\nāš ļø Quality Indicators")
        if incorrect_pct > 20:
            print(f"   šŸ”“ HIGH INCORRECT RATE: {incorrect_pct:.1f}% (>20% threshold)")
            print(f"      This may indicate algorithm issues. Review methodology.")
        elif incorrect_pct > 10:
            print(f"   🟔 ELEVATED INCORRECT RATE: {incorrect_pct:.1f}% (normal: 5-10%)")
        else:
            print(f"   āœ… Incorrect rate within normal range: {incorrect_pct:.1f}%")

        if uncertain_pct > 10:
            print(f"   🟔 HIGH UNCERTAIN RATE: {uncertain_pct:.1f}% (>10%)")
            print(f"      Consider requesting expert review for ambiguous cases.")
        else:
            print(f"   āœ… Uncertain rate within normal range: {uncertain_pct:.1f}%")

    print("\n" + "=" * 70)


def main():
    """Entry point: load the review CSV and print progress statistics."""
    csv_path = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    print("Loading validation progress...\n")
    stats = analyze_review_progress(csv_path)
    if stats:
        print_statistics(stats)


if __name__ == '__main__':
    main()