# glam/scripts/prefill_obvious_errors.py
# Last modified: 2025-11-21 22:12:33 +01:00
# 238 lines, 8.4 KiB, Python
"""
Pre-fill Obvious Errors in Fuzzy Wikidata Matches
Automatically marks clear INCORRECT matches based on:
1. City mismatches (🚨 flag) - Different cities = different institutions
2. Combined with other strong indicators (low similarity + city mismatch)
Generates two outputs:
1. Updated CSV with pre-filled INCORRECT statuses
2. Streamlined "needs_review.csv" with only ambiguous cases
"""
import csv
import re
from pathlib import Path
from typing import Dict, List
def load_flagged_csv(csv_path: Path) -> List[Dict]:
    """Read the flagged fuzzy-match CSV and return its rows as dicts.

    Args:
        csv_path: Path to the flagged matches CSV (UTF-8, with header row).

    Returns:
        One dict per data row, keyed by the CSV header columns.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        return list(csv.DictReader(handle))
def is_obvious_incorrect(match: Dict) -> tuple[bool, str]:
    """
    Determine if match is obviously INCORRECT.

    Inspects the row's 'spot_check_issues' field against three rules,
    checked in priority order:
      1. City mismatch (🚨 flag) — different cities mean different institutions.
      2. Type mismatch mentioning a museum (library vs museum confusion).
      3. Very low name similarity (< 30%).

    Args:
        match: A CSV row dict; only 'spot_check_issues' is read.

    Returns:
        (is_incorrect, reason) — reason is '' when the match is not obviously wrong.
    """
    issues = match.get('spot_check_issues', '')

    # Rule 1: City mismatch (🚨 flag) = INCORRECT
    if '🚨 City mismatch:' in issues:
        # Pull the specific city-mismatch fragment out of the ' | '-joined
        # issue list so the audit note carries the city details.
        city_issue = next(i for i in issues.split(' | ') if '🚨 City mismatch:' in i)
        return (True, f"City mismatch detected. {city_issue.replace('🚨 ', '')}")

    # Rule 2: Type mismatch (museum vs library)
    if '⚠️ Type mismatch:' in issues and 'museum' in issues.lower():
        return (True, "Type mismatch: institution types fundamentally different (library vs museum)")

    # Rule 3: Very low name similarity (<30%) + other issues
    if 'Low name similarity' in issues:
        # Accept both "25%)" and "25.0%)" — the previous pattern required a
        # decimal point and silently skipped integer-valued percentages.
        match_sim = re.search(r'(\d+(?:\.\d+)?)%\)', issues)
        if match_sim:
            similarity = float(match_sim.group(1))
            if similarity < 30:
                return (True, f"Very low name similarity ({similarity:.1f}%) indicates different institutions")

    return (False, '')
def prefill_obvious_errors(matches: List[Dict]) -> tuple[List[Dict], int]:
    """
    Mark obviously-wrong matches as INCORRECT, mutating rows in place.

    Rows that already carry a validation_status are left untouched so a
    human verdict (or an earlier run) is never overwritten.

    Args:
        matches: All fuzzy-match rows loaded from the flagged CSV.

    Returns:
        (updated_matches, count_prefilled)
    """
    print("\nPre-filling obvious INCORRECT matches...")
    prefilled = 0
    for row in matches:
        # Skip anything already validated.
        if row.get('validation_status'):
            continue
        flagged, why = is_obvious_incorrect(row)
        if not flagged:
            continue
        row['validation_status'] = 'INCORRECT'
        row['validation_notes'] = f"[AUTO] {why}"
        prefilled += 1
    print(f" ✅ Pre-filled {prefilled} obvious INCORRECT matches")
    return matches, prefilled
def generate_needs_review_csv(matches: List[Dict], output_path: Path) -> int:
    """
    Generate streamlined CSV with only rows needing manual review.

    Included rows:
      - auto_flag == 'REVIEW_URGENT' with no pre-filled validation_status
        (ambiguous; needs human judgment)
      - auto_flag == 'OK' with priority 1-2 (spot-check safety net)

    Args:
        matches: All match rows (after pre-filling).
        output_path: Destination path for the streamlined CSV.

    Returns:
        Count of rows written to the needs-review CSV.
    """
    needs_review = []
    for match in matches:
        status = match.get('validation_status', '')
        flag = match.get('auto_flag', '')
        # CSV cells are strings: a present-but-empty 'priority' cell made the
        # original int('') raise ValueError. Treat blank/missing/garbage as
        # the lowest priority (5) instead of crashing the whole run.
        try:
            priority = int(match.get('priority') or 5)
        except ValueError:
            priority = 5
        # 1. Flagged but not pre-filled INCORRECT (needs judgment)
        if flag == 'REVIEW_URGENT' and not status:
            needs_review.append(match)
        # 2. OK but Priority 1-2 (spot check safety)
        elif flag == 'OK' and priority <= 2:
            needs_review.append(match)

    # Write streamlined CSV, preserving the original column order.
    fieldnames = list(matches[0].keys()) if matches else []
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(needs_review)
    print(f"\n ✅ Generated needs_review CSV: {len(needs_review)} rows")
    return len(needs_review)
def generate_summary(matches: List[Dict], prefilled_count: int, needs_review_count: int) -> None:
    """Print summary of prefilling results.

    Reports before/after counts, estimated review time, efficiency gains,
    and next-step instructions. Purely informational: prints to stdout and
    returns nothing.

    Args:
        matches: All match rows after pre-filling.
        prefilled_count: Number of rows auto-marked INCORRECT this run.
        needs_review_count: Number of rows written to the needs-review CSV.
    """
    total = len(matches)
    # NOTE(review): direct ['auto_flag'] access assumes the column always
    # exists in the flagged CSV — confirm against the upstream flagging script.
    flagged = sum(1 for m in matches if m['auto_flag'] == 'REVIEW_URGENT')
    ok = sum(1 for m in matches if m['auto_flag'] == 'OK')
    # Count validation statuses — only rows this tool marked (tagged "[AUTO]"),
    # so human-entered INCORRECT verdicts are not counted as pre-filled.
    incorrect_auto = sum(1 for m in matches if m.get('validation_status') == 'INCORRECT'
                         and '[AUTO]' in m.get('validation_notes', ''))
    print("\n" + "=" * 70)
    print("Pre-fill Summary")
    print("=" * 70)
    print(f"\n📊 Before Pre-fill")
    print(f" Total fuzzy matches: {total}")
    print(f" Flagged issues: {flagged}")
    print(f" No issues (OK): {ok}")
    print(f"\n✅ After Pre-fill")
    print(f" Pre-filled INCORRECT (auto): {incorrect_auto}")
    print(f" Needs manual review: {needs_review_count}")
    print(f" - Flagged (ambiguous): {flagged - incorrect_auto}")
    print(f" - OK (Priority 1-2 check): {needs_review_count - (flagged - incorrect_auto)}")
    # Time estimate
    review_time_min = needs_review_count * 2  # 2 min per ambiguous case
    print(f"\n⏱️ Estimated Review Time")
    print(f" Manual review needed: {needs_review_count} rows")
    print(f" Est. time (2 min/row): {review_time_min} min ({review_time_min/60:.1f} hours)")
    print(f" Time saved by pre-fill: {prefilled_count * 2} min ({prefilled_count * 2 / 60:.1f} hours)")
    # Original estimate (what a full pass with no pre-fill would have cost)
    original_time = total * 2.5  # Original: 2.5 min/row avg
    new_time = needs_review_count * 2  # Only ambiguous cases need deep review
    time_saved_percent = (1 - new_time / original_time) * 100
    print(f"\n📈 Efficiency Gains")
    print(f" Original est. time: {original_time:.0f} min ({original_time/60:.1f} hours)")
    print(f" New est. time: {new_time:.0f} min ({new_time/60:.1f} hours)")
    print(f" Time saved: {time_saved_percent:.1f}%")
    print("\n" + "=" * 70)
    print("Next Steps")
    print("=" * 70)
    print(f"""
1. ✅ AUTOMATIC: {incorrect_auto} obvious errors marked INCORRECT
- No action needed for these
2. 📝 MANUAL REVIEW REQUIRED: {needs_review_count} matches
Option A: Review streamlined CSV (recommended)
- File: data/review/denmark_wikidata_fuzzy_matches_needs_review.csv
- Contains ONLY rows needing your judgment
- Smaller, faster to review
Option B: Review full CSV
- File: data/review/denmark_wikidata_fuzzy_matches_prefilled.csv
- All {total} matches with pre-filled INCORRECT statuses
- Filter for empty validation_status to find remaining work
3. After manual review:
python scripts/apply_wikidata_validation.py
4. Check progress:
python scripts/check_validation_progress.py
""")
    print("=" * 70)
def main():
    """Run the pre-fill pipeline: load, pre-fill, write outputs, summarize."""
    banner = "=" * 70
    print(banner)
    print("Pre-fill Obvious Errors in Wikidata Fuzzy Matches")
    print(banner)

    # Load the flagged fuzzy matches produced by the upstream flagging step.
    input_path = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')
    print(f"\nLoading flagged CSV: {input_path}")
    matches = load_flagged_csv(input_path)
    print(f" ✅ Loaded {len(matches)} matches")

    # Auto-mark the obvious errors.
    updated_matches, prefilled_count = prefill_obvious_errors(matches)

    # Persist the full CSV with pre-filled statuses.
    output_path = Path('data/review/denmark_wikidata_fuzzy_matches_prefilled.csv')
    print(f"\nSaving updated CSV: {output_path}")
    with open(output_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=list(updated_matches[0].keys()))
        writer.writeheader()
        writer.writerows(updated_matches)
    size_kb = output_path.stat().st_size / 1024
    print(f" ✅ Saved prefilled CSV ({size_kb:.1f} KB)")

    # Emit the streamlined review-only CSV.
    needs_review_path = Path('data/review/denmark_wikidata_fuzzy_matches_needs_review.csv')
    print(f"\nGenerating streamlined needs_review CSV: {needs_review_path}")
    needs_review_count = generate_needs_review_csv(updated_matches, needs_review_path)
    size_kb = needs_review_path.stat().st_size / 1024
    print(f" ✅ Saved needs_review CSV ({size_kb:.1f} KB)")

    # Final report to the operator.
    generate_summary(updated_matches, prefilled_count, needs_review_count)
    print("\n✅ Pre-fill Complete")


if __name__ == '__main__':
    main()