glam/scripts/check_validation_progress.py
2025-11-19 23:25:22 +01:00

189 lines
6.6 KiB
Python

"""
Quick Statistics for Wikidata Validation Progress
Reads the review CSV and generates progress statistics.
Run this anytime during manual review to check progress.
"""
import csv
from pathlib import Path
from collections import Counter
def analyze_review_progress(csv_path: Path):
    """Analyze validation progress from the fuzzy-match review CSV.

    Args:
        csv_path: Path to the review CSV. Expected columns: ``priority``
            (int), ``validation_status`` (CORRECT / INCORRECT / UNCERTAIN
            or blank), ``institution_type``, ``match_score`` (float).

    Returns:
        A stats dict with totals, per-status / per-priority / per-type
        counters and score lists, or ``None`` when the CSV is missing.
    """
    if not csv_path.exists():
        print(f"❌ CSV not found: {csv_path}")
        print("Run: python scripts/generate_wikidata_review_report.py")
        # Explicit None (was a bare `return`) so callers' `if stats:` check
        # reads unambiguously.
        return None
    stats = {
        'total': 0,
        'reviewed': 0,
        'not_reviewed': 0,
        'by_status': Counter(),
        'by_priority': {i: {'total': 0, 'reviewed': 0} for i in range(1, 6)},
        'by_type': Counter(),
        'avg_score_correct': [],
        'avg_score_incorrect': []
    }
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            stats['total'] += 1
            priority = int(row['priority'])
            status = row['validation_status'].strip().upper()
            inst_type = row['institution_type']
            score = float(row['match_score'])
            # Tolerate priorities outside the expected 1-5 range instead of
            # crashing with KeyError on unexpected data.
            bucket = stats['by_priority'].setdefault(
                priority, {'total': 0, 'reviewed': 0})
            bucket['total'] += 1
            stats['by_type'][inst_type] += 1
            if status in ('CORRECT', 'INCORRECT', 'UNCERTAIN'):
                stats['reviewed'] += 1
                stats['by_status'][status] += 1
                bucket['reviewed'] += 1
                if status == 'CORRECT':
                    stats['avg_score_correct'].append(score)
                elif status == 'INCORRECT':
                    stats['avg_score_incorrect'].append(score)
            else:
                # Blank or unrecognized status counts as not yet reviewed.
                stats['not_reviewed'] += 1
    return stats
def print_statistics(stats):
    """Print formatted progress statistics from analyze_review_progress.

    Args:
        stats: Dict with keys ``total``, ``reviewed``, ``not_reviewed``,
            ``by_status`` (Counter), ``by_priority`` (dict of
            {'total', 'reviewed'}), ``by_type`` (Counter),
            ``avg_score_correct`` / ``avg_score_incorrect`` (score lists).
    """
    print("=" * 70)
    print("Wikidata Validation Progress Statistics")
    print("=" * 70)
    # Overall progress
    total = stats['total']
    reviewed = stats['reviewed']
    pct_reviewed = (reviewed / total * 100) if total > 0 else 0
    print(f"\n📊 Overall Progress")
    print(f" Total fuzzy matches: {total}")
    print(f" Reviewed: {reviewed} ({pct_reviewed:.1f}%)")
    print(f" Not reviewed: {stats['not_reviewed']} ({100-pct_reviewed:.1f}%)")
    # Progress bar. BUG FIX: the fill/empty glyphs had been lost
    # ('' * filled always rendered an empty bar), and the division by
    # `total` was unguarded — an empty CSV raised ZeroDivisionError.
    bar_length = 40
    filled = int(bar_length * reviewed / total) if total > 0 else 0
    bar = '█' * filled + '░' * (bar_length - filled)
    print(f" [{bar}] {pct_reviewed:.1f}%")
    # Review status breakdown (only meaningful once something is reviewed)
    if reviewed > 0:
        print(f"\n✅ Review Status Breakdown")
        for status, count in stats['by_status'].most_common():
            pct = count / reviewed * 100
            print(f" {status:12s}: {count:3d} ({pct:5.1f}% of reviewed)")
    # Average scores — guards double as "reviewed > 0" checks, since the
    # score lists are only populated for reviewed rows.
    if stats['avg_score_correct']:
        avg_correct = sum(stats['avg_score_correct']) / len(stats['avg_score_correct'])
        print(f"\n Average score (CORRECT): {avg_correct:.1f}%")
    if stats['avg_score_incorrect']:
        avg_incorrect = sum(stats['avg_score_incorrect']) / len(stats['avg_score_incorrect'])
        print(f" Average score (INCORRECT): {avg_incorrect:.1f}%")
    # Priority breakdown
    print(f"\n🎯 Progress by Priority")
    print(f" {'Priority':<12} {'Total':>6} {'Reviewed':>9} {'%':>7} {'Status'}")
    print(f" {'-'*50}")
    for priority in sorted(stats['by_priority'].keys()):
        data = stats['by_priority'][priority]
        total_p = data['total']
        reviewed_p = data['reviewed']
        pct_p = (reviewed_p / total_p * 100) if total_p > 0 else 0
        # Status indicator
        if pct_p == 100:
            status = "✅ Complete"
        elif pct_p >= 50:
            status = "🟡 In Progress"
        elif pct_p > 0:
            status = "🟠 Started"
        else:
            status = "⬜ Not Started"
        print(f" Priority {priority:<3} {total_p:6d} {reviewed_p:9d} {pct_p:6.1f}% {status}")
    # Institution type breakdown (empty Counter when total == 0, so the
    # division inside the loop is safe).
    print(f"\n📚 By Institution Type")
    for inst_type, count in stats['by_type'].most_common():
        pct = count / total * 100
        print(f" {inst_type:12s}: {count:3d} ({pct:5.1f}%)")
    # Recommendations
    print(f"\n💡 Recommendations")
    if stats['not_reviewed'] == 0:
        print(" 🎉 All matches reviewed! Ready to apply validation.")
        print(" Run: python scripts/apply_wikidata_validation.py")
    else:
        # Find the first priority bucket that still has unreviewed matches
        # (insertion order is 1..5, so lowest priority number wins).
        not_complete = [
            (p, data) for p, data in stats['by_priority'].items()
            if data['reviewed'] < data['total']
        ]
        if not_complete:
            next_priority, next_data = not_complete[0]
            remaining = next_data['total'] - next_data['reviewed']
            print(f" ➡️ Next focus: Priority {next_priority} ({remaining} matches remaining)")
            # Time estimate (2-3 minutes per match for P1-2, 1-2 for P3-5)
            if next_priority <= 2:
                est_time = remaining * 2.5 / 60  # hours
            else:
                est_time = remaining * 1.5 / 60  # hours
            print(f" ⏱️ Estimated time: {est_time:.1f} hours")
    # Quality warnings — only with a significant sample size.
    if reviewed > 20:
        incorrect_pct = stats['by_status'].get('INCORRECT', 0) / reviewed * 100
        uncertain_pct = stats['by_status'].get('UNCERTAIN', 0) / reviewed * 100
        print(f"\n⚠️ Quality Indicators")
        if incorrect_pct > 20:
            print(f" 🔴 HIGH INCORRECT RATE: {incorrect_pct:.1f}% (>20% threshold)")
            print(f" This may indicate algorithm issues. Review methodology.")
        elif incorrect_pct > 10:
            print(f" 🟡 ELEVATED INCORRECT RATE: {incorrect_pct:.1f}% (normal: 5-10%)")
        else:
            print(f" ✅ Incorrect rate within normal range: {incorrect_pct:.1f}%")
        if uncertain_pct > 10:
            print(f" 🟡 HIGH UNCERTAIN RATE: {uncertain_pct:.1f}% (>10%)")
            print(f" Consider requesting expert review for ambiguous cases.")
        else:
            print(f" ✅ Uncertain rate within normal range: {uncertain_pct:.1f}%")
    print("\n" + "=" * 70)
def main():
    """Entry point: load review progress from the CSV and display it."""
    report_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    print("Loading validation progress...\n")
    progress = analyze_review_progress(report_csv)
    if progress:
        print_statistics(progress)


if __name__ == '__main__':
    main()