"""
Pre-fill Obvious Errors in Fuzzy Wikidata Matches

Automatically marks clear INCORRECT matches based on:
1. City mismatches (🚨 flag) - Different cities = different institutions
2. Combined with other strong indicators (low similarity + city mismatch)

Generates two outputs:
1. Updated CSV with pre-filled INCORRECT statuses
2. Streamlined "needs_review.csv" with only ambiguous cases
"""
|
import csv
import re
from pathlib import Path
from typing import Dict, List
|
def load_flagged_csv(csv_path: Path) -> List[Dict]:
    """Read the flagged fuzzy-matches CSV and return its rows as dicts.

    Args:
        csv_path: Path to the flagged CSV (first row is the header).

    Returns:
        One dict per data row, keyed by the header columns.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        return list(csv.DictReader(handle))
|
def is_obvious_incorrect(match: Dict) -> tuple[bool, str]:
    """
    Determine if match is obviously INCORRECT.

    Decision rules, applied in order against the 'spot_check_issues' text:
    1. City mismatch (🚨 flag) — different cities mean different institutions.
    2. Type mismatch (⚠️ flag) involving a museum (library vs museum).
    3. Very low name similarity (< 30%) reported in the issues text.

    Args:
        match: CSV row dict; only 'spot_check_issues' is inspected.

    Returns: (is_incorrect, reason) — reason is '' when not obviously incorrect.
    """
    issues = match.get('spot_check_issues', '')

    # Rule 1: City mismatch (🚨 flag) = INCORRECT
    if '🚨 City mismatch:' in issues:
        # Pull the specific city-mismatch fragment out of the ' | '-joined issue list
        city_issue = next(i for i in issues.split(' | ') if '🚨 City mismatch:' in i)
        return (True, f"City mismatch detected. {city_issue.replace('🚨 ', '')}")

    # Rule 2: Type mismatch (museum vs library)
    if '⚠️ Type mismatch:' in issues and 'museum' in issues.lower():
        return (True, "Type mismatch: institution types fundamentally different (library vs museum)")

    # Rule 3: Very low name similarity (<30%) + other issues
    if 'Low name similarity' in issues:
        # Accept both "27.5%)" and "27%)" — the previous pattern required a
        # decimal point and silently missed integer percentages.
        match_sim = re.search(r'(\d+(?:\.\d+)?)%\)', issues)
        if match_sim:
            similarity = float(match_sim.group(1))
            if similarity < 30:
                return (True, f"Very low name similarity ({similarity:.1f}%) indicates different institutions")

    return (False, '')
|
def prefill_obvious_errors(matches: List[Dict]) -> tuple[List[Dict], int]:
    """
    Pre-fill validation_status for obvious INCORRECT matches.

    Rows that already carry a validation_status are left untouched so a
    human decision is never overwritten. Auto-filled notes are tagged
    with "[AUTO]" so they can be distinguished later.

    Returns: (updated_matches, count_prefilled)
    """
    print("\nPre-filling obvious INCORRECT matches...")

    prefilled_count = 0
    for row in matches:
        # Skip rows a reviewer (or a prior run) has already judged
        if row.get('validation_status'):
            continue

        incorrect, why = is_obvious_incorrect(row)
        if not incorrect:
            continue

        row['validation_status'] = 'INCORRECT'
        row['validation_notes'] = f"[AUTO] {why}"
        prefilled_count += 1

    print(f"   ✅ Pre-filled {prefilled_count} obvious INCORRECT matches")

    return matches, prefilled_count
|
def generate_needs_review_csv(matches: List[Dict], output_path: Path) -> int:
    """
    Generate streamlined CSV with only rows needing manual review.

    Includes:
    - Flagged rows NOT pre-filled as INCORRECT
    - Priority 1-2 OK rows (spot check only)

    Args:
        matches: All match rows (after pre-filling).
        output_path: Destination path for the streamlined CSV.

    Returns: count of rows in needs_review
    """
    needs_review = []

    for match in matches:
        status = match.get('validation_status', '')
        flag = match.get('auto_flag', '')
        # CSV values are strings: a present-but-empty or malformed priority
        # would crash int(); fall back to 5 (lowest priority) instead.
        try:
            priority = int(match.get('priority') or 5)
        except ValueError:
            priority = 5

        # Include if:
        # 1. Flagged but not pre-filled INCORRECT (needs judgment)
        if flag == 'REVIEW_URGENT' and not status:
            needs_review.append(match)
        # 2. OK but Priority 1-2 (spot check safety)
        elif flag == 'OK' and priority <= 2:
            needs_review.append(match)

    # Write streamlined CSV (use all original fields)
    fieldnames = list(matches[0].keys()) if matches else []

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(needs_review)

    print(f"\n  ✅ Generated needs_review CSV: {len(needs_review)} rows")

    return len(needs_review)
|
def generate_summary(matches: List[Dict], prefilled_count: int, needs_review_count: int):
    """Print summary of prefilling results.

    Covers before/after counts, estimated review time, efficiency gains,
    and recommended next steps.

    Args:
        matches: All match rows after pre-filling.
        prefilled_count: Rows auto-marked INCORRECT in this run.
        needs_review_count: Rows written to the needs_review CSV.
    """
    total = len(matches)
    # .get() guards against rows that lack the auto_flag column entirely
    flagged = sum(1 for m in matches if m.get('auto_flag') == 'REVIEW_URGENT')
    ok = sum(1 for m in matches if m.get('auto_flag') == 'OK')

    # Count validation statuses ([AUTO] tag marks rows filled by this script)
    incorrect_auto = sum(1 for m in matches if m.get('validation_status') == 'INCORRECT'
                         and '[AUTO]' in m.get('validation_notes', ''))

    print("\n" + "=" * 70)
    print("Pre-fill Summary")
    print("=" * 70)

    print(f"\n📊 Before Pre-fill")
    print(f"   Total fuzzy matches: {total}")
    print(f"   Flagged issues: {flagged}")
    print(f"   No issues (OK): {ok}")

    print(f"\n✅ After Pre-fill")
    print(f"   Pre-filled INCORRECT (auto): {incorrect_auto}")
    print(f"   Needs manual review: {needs_review_count}")
    print(f"   - Flagged (ambiguous): {flagged - incorrect_auto}")
    print(f"   - OK (Priority 1-2 check): {needs_review_count - (flagged - incorrect_auto)}")

    # Time estimate
    review_time_min = needs_review_count * 2  # 2 min per ambiguous case
    print(f"\n⏱️ Estimated Review Time")
    print(f"   Manual review needed: {needs_review_count} rows")
    print(f"   Est. time (2 min/row): {review_time_min} min ({review_time_min/60:.1f} hours)")
    print(f"   Time saved by pre-fill: {prefilled_count * 2} min ({prefilled_count * 2 / 60:.1f} hours)")

    # Original estimate
    original_time = total * 2.5  # Original: 2.5 min/row avg
    new_time = needs_review_count * 2  # Only ambiguous cases need deep review
    # Guard: with zero matches original_time is 0 and the ratio is undefined
    time_saved_percent = (1 - new_time / original_time) * 100 if original_time else 0.0

    print(f"\n📈 Efficiency Gains")
    print(f"   Original est. time: {original_time:.0f} min ({original_time/60:.1f} hours)")
    print(f"   New est. time: {new_time:.0f} min ({new_time/60:.1f} hours)")
    print(f"   Time saved: {time_saved_percent:.1f}%")

    print("\n" + "=" * 70)
    print("Next Steps")
    print("=" * 70)
    print(f"""
1. ✅ AUTOMATIC: {incorrect_auto} obvious errors marked INCORRECT
   - No action needed for these

2. 📝 MANUAL REVIEW REQUIRED: {needs_review_count} matches

   Option A: Review streamlined CSV (recommended)
   - File: data/review/denmark_wikidata_fuzzy_matches_needs_review.csv
   - Contains ONLY rows needing your judgment
   - Smaller, faster to review

   Option B: Review full CSV
   - File: data/review/denmark_wikidata_fuzzy_matches_prefilled.csv
   - All {total} matches with pre-filled INCORRECT statuses
   - Filter for empty validation_status to find remaining work

3. After manual review:
   python scripts/apply_wikidata_validation.py

4. Check progress:
   python scripts/check_validation_progress.py
""")

    print("=" * 70)
|
def main():
    """Run the pre-fill pipeline: load, auto-mark, save, and summarize."""
    banner = "=" * 70
    print(banner)
    print("Pre-fill Obvious Errors in Wikidata Fuzzy Matches")
    print(banner)

    # Load the flagged CSV produced by the earlier spot-check step
    input_path = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')
    print(f"\nLoading flagged CSV: {input_path}")
    rows = load_flagged_csv(input_path)
    print(f"   ✅ Loaded {len(rows)} matches")

    # Auto-mark the obvious INCORRECT matches
    updated_rows, auto_count = prefill_obvious_errors(rows)

    # Persist the full CSV with pre-filled statuses
    prefilled_path = Path('data/review/denmark_wikidata_fuzzy_matches_prefilled.csv')
    print(f"\nSaving updated CSV: {prefilled_path}")

    columns = list(updated_rows[0].keys())  # All original fields

    with open(prefilled_path, 'w', newline='', encoding='utf-8') as handle:
        dict_writer = csv.DictWriter(handle, fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(updated_rows)

    size_kb = prefilled_path.stat().st_size / 1024
    print(f"   ✅ Saved prefilled CSV ({size_kb:.1f} KB)")

    # Write the streamlined review-only CSV
    review_path = Path('data/review/denmark_wikidata_fuzzy_matches_needs_review.csv')
    print(f"\nGenerating streamlined needs_review CSV: {review_path}")
    review_count = generate_needs_review_csv(updated_rows, review_path)
    size_kb = review_path.stat().st_size / 1024
    print(f"   ✅ Saved needs_review CSV ({size_kb:.1f} KB)")

    # Final summary report
    generate_summary(updated_rows, auto_count, review_count)

    print("\n✅ Pre-fill Complete")


if __name__ == '__main__':
    main()
|