""" Pre-fill Obvious Errors in Fuzzy Wikidata Matches Automatically marks clear INCORRECT matches based on: 1. City mismatches (🚨 flag) - Different cities = different institutions 2. Combined with other strong indicators (low similarity + city mismatch) Generates two outputs: 1. Updated CSV with pre-filled INCORRECT statuses 2. Streamlined "needs_review.csv" with only ambiguous cases """ import csv from pathlib import Path from typing import Dict, List def load_flagged_csv(csv_path: Path) -> List[Dict]: """Load the flagged fuzzy matches CSV.""" matches = [] with open(csv_path, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: matches.append(row) return matches def is_obvious_incorrect(match: Dict) -> tuple[bool, str]: """ Determine if match is obviously INCORRECT. Returns: (is_incorrect, reason) """ issues = match.get('spot_check_issues', '') # Rule 1: City mismatch (🚨 flag) = INCORRECT if '🚨 City mismatch:' in issues: # Extract city details from issues city_issue = [i for i in issues.split(' | ') if '🚨 City mismatch:' in i][0] return (True, f"City mismatch detected. {city_issue.replace('🚨 ', '')}") # Rule 2: Type mismatch (museum vs library) if 'āš ļø Type mismatch:' in issues and 'museum' in issues.lower(): return (True, "Type mismatch: institution types fundamentally different (library vs museum)") # Rule 3: Very low name similarity (<30%) + other issues if 'Low name similarity' in issues: # Extract similarity score import re match_sim = re.search(r'(\d+\.\d+)%\)', issues) if match_sim: similarity = float(match_sim.group(1)) if similarity < 30: return (True, f"Very low name similarity ({similarity:.1f}%) indicates different institutions") return (False, '') def prefill_obvious_errors(matches: List[Dict]) -> tuple[List[Dict], int]: """ Pre-fill validation_status for obvious INCORRECT matches. Returns: (updated_matches, count_prefilled) """ print("\nPre-filling obvious INCORRECT matches...") prefilled_count = 0 for match in matches: # Skip if already validated if match.get('validation_status'): continue is_incorrect, reason = is_obvious_incorrect(match) if is_incorrect: match['validation_status'] = 'INCORRECT' match['validation_notes'] = f"[AUTO] {reason}" prefilled_count += 1 print(f" āœ… Pre-filled {prefilled_count} obvious INCORRECT matches") return matches, prefilled_count def generate_needs_review_csv(matches: List[Dict], output_path: Path) -> int: """ Generate streamlined CSV with only rows needing manual review. Includes: - Flagged rows NOT pre-filled as INCORRECT - Priority 1-2 OK rows (spot check only) Returns: count of rows in needs_review """ needs_review = [] for match in matches: status = match.get('validation_status', '') flag = match.get('auto_flag', '') priority = int(match.get('priority', 5)) # Include if: # 1. Flagged but not pre-filled INCORRECT (needs judgment) if flag == 'REVIEW_URGENT' and not status: needs_review.append(match) # 2. OK but Priority 1-2 (spot check safety) elif flag == 'OK' and priority <= 2: needs_review.append(match) # Write streamlined CSV (use all original fields) fieldnames = list(matches[0].keys()) if matches else [] with open(output_path, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(needs_review) print(f"\n āœ… Generated needs_review CSV: {len(needs_review)} rows") return len(needs_review) def generate_summary(matches: List[Dict], prefilled_count: int, needs_review_count: int): """Print summary of prefilling results.""" total = len(matches) flagged = sum(1 for m in matches if m['auto_flag'] == 'REVIEW_URGENT') ok = sum(1 for m in matches if m['auto_flag'] == 'OK') # Count validation statuses incorrect_auto = sum(1 for m in matches if m.get('validation_status') == 'INCORRECT' and '[AUTO]' in m.get('validation_notes', '')) print("\n" + "=" * 70) print("Pre-fill Summary") print("=" * 70) print(f"\nšŸ“Š Before Pre-fill") print(f" Total fuzzy matches: {total}") print(f" Flagged issues: {flagged}") print(f" No issues (OK): {ok}") print(f"\nāœ… After Pre-fill") print(f" Pre-filled INCORRECT (auto): {incorrect_auto}") print(f" Needs manual review: {needs_review_count}") print(f" - Flagged (ambiguous): {flagged - incorrect_auto}") print(f" - OK (Priority 1-2 check): {needs_review_count - (flagged - incorrect_auto)}") # Time estimate review_time_min = needs_review_count * 2 # 2 min per ambiguous case print(f"\nā±ļø Estimated Review Time") print(f" Manual review needed: {needs_review_count} rows") print(f" Est. time (2 min/row): {review_time_min} min ({review_time_min/60:.1f} hours)") print(f" Time saved by pre-fill: {prefilled_count * 2} min ({prefilled_count * 2 / 60:.1f} hours)") # Original estimate original_time = total * 2.5 # Original: 2.5 min/row avg new_time = needs_review_count * 2 # Only ambiguous cases need deep review time_saved_percent = (1 - new_time / original_time) * 100 print(f"\nšŸ“ˆ Efficiency Gains") print(f" Original est. time: {original_time:.0f} min ({original_time/60:.1f} hours)") print(f" New est. time: {new_time:.0f} min ({new_time/60:.1f} hours)") print(f" Time saved: {time_saved_percent:.1f}%") print("\n" + "=" * 70) print("Next Steps") print("=" * 70) print(f""" 1. āœ… AUTOMATIC: {incorrect_auto} obvious errors marked INCORRECT - No action needed for these 2. šŸ“ MANUAL REVIEW REQUIRED: {needs_review_count} matches Option A: Review streamlined CSV (recommended) - File: data/review/denmark_wikidata_fuzzy_matches_needs_review.csv - Contains ONLY rows needing your judgment - Smaller, faster to review Option B: Review full CSV - File: data/review/denmark_wikidata_fuzzy_matches_prefilled.csv - All {total} matches with pre-filled INCORRECT statuses - Filter for empty validation_status to find remaining work 3. After manual review: python scripts/apply_wikidata_validation.py 4. Check progress: python scripts/check_validation_progress.py """) print("=" * 70) def main(): print("=" * 70) print("Pre-fill Obvious Errors in Wikidata Fuzzy Matches") print("=" * 70) # Load flagged CSV input_path = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv') print(f"\nLoading flagged CSV: {input_path}") matches = load_flagged_csv(input_path) print(f" āœ… Loaded {len(matches)} matches") # Pre-fill obvious errors updated_matches, prefilled_count = prefill_obvious_errors(matches) # Save updated full CSV output_path = Path('data/review/denmark_wikidata_fuzzy_matches_prefilled.csv') print(f"\nSaving updated CSV: {output_path}") fieldnames = list(updated_matches[0].keys()) # All original fields with open(output_path, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(updated_matches) size_kb = output_path.stat().st_size / 1024 print(f" āœ… Saved prefilled CSV ({size_kb:.1f} KB)") # Generate streamlined needs_review CSV needs_review_path = Path('data/review/denmark_wikidata_fuzzy_matches_needs_review.csv') print(f"\nGenerating streamlined needs_review CSV: {needs_review_path}") needs_review_count = generate_needs_review_csv(updated_matches, needs_review_path) size_kb = needs_review_path.stat().st_size / 1024 print(f" āœ… Saved needs_review CSV ({size_kb:.1f} KB)") # Print summary generate_summary(updated_matches, prefilled_count, needs_review_count) print("\nāœ… Pre-fill Complete") if __name__ == '__main__': main()