# glam/scripts/prefill_obvious_errors.py
# Last modified: 2025-11-21 22:12:33 +01:00
# 238 lines, 8.4 KiB, Python
"""
Pre-fill Obvious Errors in Fuzzy Wikidata Matches
Automatically marks clear INCORRECT matches based on:
1. City mismatches (🚨 flag) - Different cities = different institutions
2. Combined with other strong indicators (low similarity + city mismatch)
Generates two outputs:
1. Updated CSV with pre-filled INCORRECT statuses
2. Streamlined "needs_review.csv" with only ambiguous cases
"""
import csv
import re
from pathlib import Path
from typing import Dict, List
def load_flagged_csv(csv_path: Path) -> List[Dict]:
    """Read the flagged fuzzy-match CSV and return its rows as dicts.

    Args:
        csv_path: Path to the flagged matches CSV (UTF-8, with header row).

    Returns:
        One dict per data row, keyed by the CSV header columns.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        return list(csv.DictReader(handle))
def is_obvious_incorrect(match: Dict) -> tuple[bool, str]:
    """
    Determine if match is obviously INCORRECT.

    Inspects the row's 'spot_check_issues' field against three rules,
    checked in priority order:
      1. City mismatch (🚨 flag) — different cities mean different institutions.
      2. Type mismatch mentioning a museum (library vs museum confusion).
      3. Very low name similarity (< 30%).

    Args:
        match: A CSV row dict; only 'spot_check_issues' is read.

    Returns:
        (is_incorrect, reason) — reason is '' when the match is not obviously wrong.
    """
    issues = match.get('spot_check_issues', '')

    # Rule 1: City mismatch (🚨 flag) = INCORRECT
    if '🚨 City mismatch:' in issues:
        # Pull the specific city-mismatch fragment out of the ' | '-joined
        # issue list so the audit note carries the city details.
        city_issue = next(i for i in issues.split(' | ') if '🚨 City mismatch:' in i)
        return (True, f"City mismatch detected. {city_issue.replace('🚨 ', '')}")

    # Rule 2: Type mismatch (museum vs library)
    if '⚠️ Type mismatch:' in issues and 'museum' in issues.lower():
        return (True, "Type mismatch: institution types fundamentally different (library vs museum)")

    # Rule 3: Very low name similarity (<30%) + other issues
    if 'Low name similarity' in issues:
        # Accept both "25%)" and "25.0%)" — the previous pattern required a
        # decimal point and silently skipped integer-valued percentages.
        match_sim = re.search(r'(\d+(?:\.\d+)?)%\)', issues)
        if match_sim:
            similarity = float(match_sim.group(1))
            if similarity < 30:
                return (True, f"Very low name similarity ({similarity:.1f}%) indicates different institutions")

    return (False, '')
def prefill_obvious_errors(matches: List[Dict]) -> tuple[List[Dict], int]:
    """
    Mark obviously-wrong matches as INCORRECT, mutating rows in place.

    Rows that already carry a validation_status are left untouched so a
    human verdict (or an earlier run) is never overwritten.

    Args:
        matches: All fuzzy-match rows loaded from the flagged CSV.

    Returns:
        (updated_matches, count_prefilled)
    """
    print("\nPre-filling obvious INCORRECT matches...")
    prefilled = 0
    for row in matches:
        # Skip anything already validated.
        if row.get('validation_status'):
            continue
        flagged, why = is_obvious_incorrect(row)
        if not flagged:
            continue
        row['validation_status'] = 'INCORRECT'
        row['validation_notes'] = f"[AUTO] {why}"
        prefilled += 1
    print(f" ✅ Pre-filled {prefilled} obvious INCORRECT matches")
    return matches, prefilled
def generate_needs_review_csv(matches: List[Dict], output_path: Path) -> int:
    """
    Generate streamlined CSV with only rows needing manual review.

    Included rows:
      - auto_flag == 'REVIEW_URGENT' with no pre-filled validation_status
        (ambiguous; needs human judgment)
      - auto_flag == 'OK' with priority 1-2 (spot-check safety net)

    Args:
        matches: All match rows (after pre-filling).
        output_path: Destination path for the streamlined CSV.

    Returns:
        Count of rows written to the needs-review CSV.
    """
    needs_review = []
    for match in matches:
        status = match.get('validation_status', '')
        flag = match.get('auto_flag', '')
        # CSV cells are strings: a present-but-empty 'priority' cell made the
        # original int('') raise ValueError. Treat blank/missing/garbage as
        # the lowest priority (5) instead of crashing the whole run.
        try:
            priority = int(match.get('priority') or 5)
        except ValueError:
            priority = 5
        # 1. Flagged but not pre-filled INCORRECT (needs judgment)
        if flag == 'REVIEW_URGENT' and not status:
            needs_review.append(match)
        # 2. OK but Priority 1-2 (spot check safety)
        elif flag == 'OK' and priority <= 2:
            needs_review.append(match)

    # Write streamlined CSV, preserving the original column order.
    fieldnames = list(matches[0].keys()) if matches else []
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(needs_review)
    print(f"\n ✅ Generated needs_review CSV: {len(needs_review)} rows")
    return len(needs_review)
def generate_summary(matches: List[Dict], prefilled_count: int, needs_review_count: int) -> None:
    """Print summary of prefilling results.

    Reports before/after counts, estimated review time, efficiency gains,
    and next-step instructions. Purely informational: prints to stdout and
    returns nothing.

    Args:
        matches: All match rows after pre-filling.
        prefilled_count: Number of rows auto-marked INCORRECT this run.
        needs_review_count: Number of rows written to the needs-review CSV.
    """
    total = len(matches)
    # NOTE(review): direct ['auto_flag'] access assumes the column always
    # exists in the flagged CSV — confirm against the upstream flagging script.
    flagged = sum(1 for m in matches if m['auto_flag'] == 'REVIEW_URGENT')
    ok = sum(1 for m in matches if m['auto_flag'] == 'OK')
    # Count validation statuses — only rows this tool marked (tagged "[AUTO]"),
    # so human-entered INCORRECT verdicts are not counted as pre-filled.
    incorrect_auto = sum(1 for m in matches if m.get('validation_status') == 'INCORRECT'
                         and '[AUTO]' in m.get('validation_notes', ''))
    print("\n" + "=" * 70)
    print("Pre-fill Summary")
    print("=" * 70)
    print(f"\n📊 Before Pre-fill")
    print(f" Total fuzzy matches: {total}")
    print(f" Flagged issues: {flagged}")
    print(f" No issues (OK): {ok}")
    print(f"\n✅ After Pre-fill")
    print(f" Pre-filled INCORRECT (auto): {incorrect_auto}")
    print(f" Needs manual review: {needs_review_count}")
    print(f" - Flagged (ambiguous): {flagged - incorrect_auto}")
    print(f" - OK (Priority 1-2 check): {needs_review_count - (flagged - incorrect_auto)}")
    # Time estimate
    review_time_min = needs_review_count * 2  # 2 min per ambiguous case
    print(f"\n⏱️ Estimated Review Time")
    print(f" Manual review needed: {needs_review_count} rows")
    print(f" Est. time (2 min/row): {review_time_min} min ({review_time_min/60:.1f} hours)")
    print(f" Time saved by pre-fill: {prefilled_count * 2} min ({prefilled_count * 2 / 60:.1f} hours)")
    # Original estimate (what a full pass with no pre-fill would have cost)
    original_time = total * 2.5  # Original: 2.5 min/row avg
    new_time = needs_review_count * 2  # Only ambiguous cases need deep review
    time_saved_percent = (1 - new_time / original_time) * 100
    print(f"\n📈 Efficiency Gains")
    print(f" Original est. time: {original_time:.0f} min ({original_time/60:.1f} hours)")
    print(f" New est. time: {new_time:.0f} min ({new_time/60:.1f} hours)")
    print(f" Time saved: {time_saved_percent:.1f}%")
    print("\n" + "=" * 70)
    print("Next Steps")
    print("=" * 70)
    print(f"""
1. ✅ AUTOMATIC: {incorrect_auto} obvious errors marked INCORRECT
- No action needed for these
2. 📝 MANUAL REVIEW REQUIRED: {needs_review_count} matches
Option A: Review streamlined CSV (recommended)
- File: data/review/denmark_wikidata_fuzzy_matches_needs_review.csv
- Contains ONLY rows needing your judgment
- Smaller, faster to review
Option B: Review full CSV
- File: data/review/denmark_wikidata_fuzzy_matches_prefilled.csv
- All {total} matches with pre-filled INCORRECT statuses
- Filter for empty validation_status to find remaining work
3. After manual review:
python scripts/apply_wikidata_validation.py
4. Check progress:
python scripts/check_validation_progress.py
""")
    print("=" * 70)
def main():
    """Run the pre-fill pipeline: load, pre-fill, write outputs, summarize."""
    banner = "=" * 70
    print(banner)
    print("Pre-fill Obvious Errors in Wikidata Fuzzy Matches")
    print(banner)

    # Load the flagged fuzzy matches produced by the upstream flagging step.
    input_path = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')
    print(f"\nLoading flagged CSV: {input_path}")
    matches = load_flagged_csv(input_path)
    print(f" ✅ Loaded {len(matches)} matches")

    # Auto-mark the obvious errors.
    updated_matches, prefilled_count = prefill_obvious_errors(matches)

    # Persist the full CSV with pre-filled statuses.
    output_path = Path('data/review/denmark_wikidata_fuzzy_matches_prefilled.csv')
    print(f"\nSaving updated CSV: {output_path}")
    with open(output_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=list(updated_matches[0].keys()))
        writer.writeheader()
        writer.writerows(updated_matches)
    size_kb = output_path.stat().st_size / 1024
    print(f" ✅ Saved prefilled CSV ({size_kb:.1f} KB)")

    # Emit the streamlined review-only CSV.
    needs_review_path = Path('data/review/denmark_wikidata_fuzzy_matches_needs_review.csv')
    print(f"\nGenerating streamlined needs_review CSV: {needs_review_path}")
    needs_review_count = generate_needs_review_csv(updated_matches, needs_review_path)
    size_kb = needs_review_path.stat().st_size / 1024
    print(f" ✅ Saved needs_review CSV ({size_kb:.1f} KB)")

    # Final report to the operator.
    generate_summary(updated_matches, prefilled_count, needs_review_count)
    print("\n✅ Pre-fill Complete")


if __name__ == '__main__':
    main()