"""
Fast Automated Spot Checks for Wikidata Fuzzy Matches

Pattern-based detection (no Wikidata queries needed for most checks):
1. City name mismatches (from CSV data)
2. Name pattern issues (branch suffixes, gymnasium libraries)
3. Low scores (<87%) without ISIL confirmation
4. Similar institution names in different cities

Generates prioritized review list with auto-detected issues.
"""

import csv
import json
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple

try:
    from rapidfuzz import fuzz
except ImportError:
    # Only check_name_patterns() needs rapidfuzz. Deferring the failure keeps
    # the stdlib-only checks importable; _require_rapidfuzz() reports the
    # missing dependency with an actionable message when it is really needed.
    fuzz = None


# Scores below this value are flagged when no ISIL code backs the match.
LOW_SCORE_THRESHOLD = 87

# Lower-cased city names searched for (as substrings) in Wikidata labels.
DANISH_CITIES = (
    'københavn', 'aarhus', 'odense', 'aalborg', 'frederiksberg',
    'esbjerg', 'randers', 'kolding', 'horsens', 'vejle',
    'roskilde', 'herning', 'helsingør', 'silkeborg', 'næstved',
    'fredericia', 'viborg', 'køge', 'holstebro', 'taastrup',
    'svendborg', 'hvidovre', 'hørsholm', 'greve', 'ballerup',
    'gladsaxe', 'gentofte', 'herlev', 'glostrup', 'albertslund',
)

# Lower-cased name fragments that hint at an institution type.
TYPE_KEYWORDS = {
    'LIBRARY': ['bibliotek', 'library'],
    'ARCHIVE': ['arkiv', 'archive', 'arkivet'],
    'MUSEUM': ['museum', 'museet'],
}

# Trailing type words stripped before comparing base names. Only our own
# names may carry a leading comma (", Biblioteket").
_OUR_SUFFIX_RE = re.compile(r',?\s*(Biblioteket|Bibliotek|Arkiv)$')
_WD_SUFFIX_RE = re.compile(r'\s*(Biblioteket|Bibliotek|Arkiv)$')

INPUT_CSV_PATH = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
FLAGGED_CSV_PATH = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')

# Column order of the flagged report; the two spot-check columns come first.
REPORT_FIELDNAMES = [
    'auto_flag',
    'spot_check_issues',
    'priority',
    'match_score',
    'institution_name',
    'wikidata_label',
    'city',
    'institution_type',
    'isil_code',
    'ghcid',
    'wikidata_qid',
    'wikidata_url',
    'validation_status',
    'validation_notes',
    'institution_id',
]


def _require_rapidfuzz() -> None:
    """Raise ImportError with install guidance if rapidfuzz is unavailable."""
    if fuzz is None:
        raise ImportError(
            "rapidfuzz is required for name-similarity checks; "
            "install it with 'pip install rapidfuzz'"
        )


def load_csv_matches(csv_path: Path) -> List[Dict]:
    """Load fuzzy matches from CSV.

    Args:
        csv_path: Path of a UTF-8 CSV file with a header row.

    Returns:
        One dict per data row, keyed by the header names.
    """
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields containing embedded newlines are parsed correctly.
    with open(csv_path, 'r', newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))


def check_name_patterns(inst_name: str, wd_label: str) -> List[str]:
    """Check for problematic name patterns.

    Args:
        inst_name: Our institution name.
        wd_label: Label of the matched Wikidata entity.

    Returns:
        Human-readable issue descriptions; empty when nothing looks wrong.

    Raises:
        ImportError: If rapidfuzz is not installed.
    """
    _require_rapidfuzz()
    issues = []

    # Pattern 1: Branch suffix in our name but not Wikidata
    if ', Biblioteket' in inst_name and 'Bibliotek' not in wd_label:
        issues.append("Branch suffix ', Biblioteket' in our name but not Wikidata (branch vs main?)")

    # Pattern 2: Gymnasium library vs public library
    if 'Gymnasium' in inst_name and 'Gymnasium' not in wd_label and 'Bibliotek' in wd_label:
        issues.append("Our 'Gymnasium' library matched to public library (school vs public?)")

    # Pattern 3: Kombi Bibliotek (different institutions)
    if 'Kombi' in inst_name:
        # Text before "Kombi" is treated as the location. When the name starts
        # with "Kombi" this is '' and the containment test below always passes.
        our_location = inst_name.split('Kombi')[0].strip()
        if our_location not in wd_label:
            issues.append(f"Kombi library location mismatch: '{our_location}' not in Wikidata label")

    # Pattern 4: Different base institution names (common suffixes removed)
    our_base = _OUR_SUFFIX_RE.sub('', inst_name).strip()
    wd_base = _WD_SUFFIX_RE.sub('', wd_label).strip()
    similarity = fuzz.ratio(our_base.lower(), wd_base.lower())
    if similarity < 60:
        issues.append(f"Low name similarity ({similarity:.1f}%) - possibly different institutions")

    # Pattern 5: City/location names differ (first word is often the city).
    # Splitting first avoids an IndexError on whitespace-only names.
    our_words = inst_name.split()
    wd_words = wd_label.split()
    our_first = our_words[0] if our_words else ""
    wd_first = wd_words[0] if wd_words else ""

    if len(our_first) > 3 and len(wd_first) > 3:  # Avoid short words
        if our_first.lower() != wd_first.lower() and similarity < 85:
            first_sim = fuzz.ratio(our_first.lower(), wd_first.lower())
            if first_sim < 70:
                issues.append(f"First word differs: '{our_first}' vs '{wd_first}' (city mismatch?)")

    return issues


def check_city_in_labels(our_city: str, inst_name: str, wd_label: str) -> List[str]:
    """Check if city names are consistent.

    Args:
        our_city: City recorded for our institution; empty disables the check.
        inst_name: Our institution name (not used by this check; kept so the
            call signature stays stable).
        wd_label: Label of the matched Wikidata entity.

    Returns:
        At most one issue, naming the first city from DANISH_CITIES that the
        Wikidata label contains and that differs from ``our_city``.
    """
    issues = []

    if not our_city:
        return issues

    our_city_lower = our_city.lower().strip()
    wd_label_lower = wd_label.lower()

    for city in DANISH_CITIES:
        if city not in wd_label_lower or city == our_city_lower:
            continue
        # Skip cities that merely overlap with ours as a substring.
        if our_city_lower not in city and city not in our_city_lower:
            issues.append(f"City mismatch: our '{our_city}' but Wikidata mentions '{city}'")
            break

    return issues


def check_low_score_no_isil(
    match_score: float,
    our_isil: str,
    threshold: float = LOW_SCORE_THRESHOLD,
) -> List[str]:
    """Flag low scores without ISIL confirmation.

    Args:
        match_score: Fuzzy match score as a percentage.
        our_isil: ISIL code of our institution; empty, None, or
            whitespace-only counts as missing.
        threshold: Scores strictly below this value are considered low.

    Returns:
        A single-item list when the score is low and no ISIL is present,
        otherwise an empty list.
    """
    has_isil = bool((our_isil or '').strip())
    if match_score < threshold and not has_isil:
        return [f"Low confidence ({match_score:.1f}%) with no ISIL to verify"]
    return []


def check_institution_type_hints(inst_name: str, wd_label: str, inst_type: str) -> List[str]:
    """Check for type mismatch hints in names.

    Args:
        inst_name: Our institution name.
        wd_label: Label of the matched Wikidata entity.
        inst_type: Our institution type; types missing from TYPE_KEYWORDS
            have no keywords and therefore never trigger the keyword check.

    Returns:
        Issue descriptions (zero, one, or two entries).
    """
    issues = []

    our_keywords = TYPE_KEYWORDS.get(inst_type, [])
    inst_lower = inst_name.lower()
    wd_lower = wd_label.lower()

    our_has_keyword = any(kw in inst_lower for kw in our_keywords)
    wd_has_keyword = any(kw in wd_lower for kw in our_keywords)

    if our_has_keyword and not wd_has_keyword:
        # Our name suggests library/archive/museum but Wikidata doesn't mention it
        issues.append(f"Type keyword mismatch: our name has {inst_type} keyword, Wikidata doesn't")

    # Museum/gallery wording in Wikidata when we're a library
    if inst_type == 'LIBRARY' and ('museum' in wd_lower or 'gallery' in wd_lower):
        issues.append("Type mismatch: we're LIBRARY but Wikidata mentions museum/gallery")

    return issues


def run_fast_spot_checks(matches: List[Dict]) -> List[Dict]:
    """
    Run fast pattern-based spot checks on all fuzzy matches.

    Each input row must provide 'institution_name', 'wikidata_label', 'city',
    'institution_type', 'isil_code' and a float-parsable 'match_score'.

    Returns list of matches with spot_check_issues field added (issues joined
    by " | ") plus an auto_flag field set to 'REVIEW_URGENT' or 'OK'. Input
    rows are copied, not modified.
    """
    total = len(matches)
    print(f"\nRunning fast automated spot checks on {total} fuzzy matches...")
    print("Using pattern-based detection (no Wikidata queries needed)\n")

    results = []
    issue_count = 0
    # Number of matches affected per check type (one increment per match,
    # regardless of how many issues a single check reported).
    issue_type_counts = Counter()

    for i, match in enumerate(matches, 1):
        if i % 50 == 0:
            print(f" Progress: {i}/{total} ({i/total*100:.1f}%)")

        inst_name = match['institution_name']
        wd_label = match['wikidata_label']
        our_city = match['city']
        our_type = match['institution_type']
        our_isil = match['isil_code']
        match_score = float(match['match_score'])

        # (breakdown label, display prefix, issues) in reporting order
        checks = (
            ('Low score no ISIL', '⚠️', check_low_score_no_isil(match_score, our_isil)),
            ('Name pattern', '🔍', check_name_patterns(inst_name, wd_label)),
            ('City mismatch', '🚨', check_city_in_labels(our_city, inst_name, wd_label)),
            ('Type hint', '⚠️', check_institution_type_hints(inst_name, wd_label, our_type)),
        )

        all_issues = []
        for label, icon, issues in checks:
            if issues:
                all_issues.extend(f"{icon} {issue}" for issue in issues)
                issue_type_counts[label] += 1

        match_with_issues = match.copy()
        match_with_issues['spot_check_issues'] = " | ".join(all_issues)
        match_with_issues['auto_flag'] = 'REVIEW_URGENT' if all_issues else 'OK'
        if all_issues:
            issue_count += 1

        results.append(match_with_issues)

    print(f"\n ✅ Spot checks complete: {issue_count}/{total} matches flagged\n")

    # Print issue breakdown, most frequent first
    print(" Issue type breakdown:")
    for issue_type, count in issue_type_counts.most_common():
        print(f" {issue_type:<25}: {count:3d}")

    return results


def generate_flagged_report(results: List[Dict], output_path: Path):
    """Generate CSV with spot check results.

    Rows are sorted by auto_flag (REVIEW_URGENT first), then priority, then
    match score, and written with the columns in REPORT_FIELDNAMES.

    Raises:
        ValueError: If a row's 'priority' or 'match_score' is not numeric.
    """
    def sort_key(r):
        flag_priority = 0 if r['auto_flag'] == 'REVIEW_URGENT' else 1
        return (flag_priority, int(r['priority']), float(r['match_score']))

    results_sorted = sorted(results, key=sort_key)

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        # extrasaction='ignore': rows come from an arbitrary input CSV and may
        # carry columns outside the report schema (or surplus cells stored
        # under the key None); those are dropped instead of raising.
        writer = csv.DictWriter(f, fieldnames=REPORT_FIELDNAMES, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(results_sorted)


def generate_summary(results: List[Dict], flagged_csv_path: Path = FLAGGED_CSV_PATH):
    """Print summary of spot check findings.

    Args:
        results: Rows as returned by run_fast_spot_checks().
        flagged_csv_path: Location of the flagged report, shown in the
            recommended-actions text.
    """
    total = len(results)
    flagged_records = [r for r in results if r['auto_flag'] == 'REVIEW_URGENT']
    flagged = len(flagged_records)
    ok = total - flagged

    def pct(n):
        # An empty result list must not raise ZeroDivisionError.
        return n / total * 100 if total else 0.0

    print("\n" + "=" * 70)
    print("Fast Automated Spot Check Summary")
    print("=" * 70)

    print(f"\n📊 Overall Results")
    print(f" Total fuzzy matches: {total}")
    print(f" Flagged issues: {flagged} ({pct(flagged):.1f}%)")
    print(f" No issues detected: {ok} ({pct(ok):.1f}%)")

    # Count by priority (priority values are kept as read from the CSV)
    flagged_by_priority = Counter(r['priority'] for r in flagged_records)

    print(f"\n🎯 Flagged Matches by Priority")
    for priority in sorted(flagged_by_priority):
        count = flagged_by_priority[priority]
        print(f" Priority {priority}: {count:3d} flagged")

    # Sample flagged records
    print(f"\n🔍 Sample Flagged Records (Top 10)")
    for i, record in enumerate(flagged_records[:10], 1):
        print(f"\n {i}. Priority {record['priority']} - Score {record['match_score']}%")
        print(f" Institution: {record['institution_name']}")
        print(f" Wikidata: {record['wikidata_label']}")

        # Show only the first 2 issues to keep the sample compact
        issues_list = record['spot_check_issues'].split(' | ')
        for issue in issues_list[:2]:
            print(f" {issue}")
        if len(issues_list) > 2:
            print(f" ... and {len(issues_list)-2} more issue(s)")

    print("\n" + "=" * 70)
    print("Interpretation Guide")
    print("=" * 70)
    print("""
🚨 City mismatch → Very likely INCORRECT (different cities)
⚠️ Type keyword mismatch → Likely INCORRECT (e.g., library vs museum)
🔍 Branch suffix → Probably INCORRECT (branch vs main library)
🔍 Gymnasium library → Likely INCORRECT (school vs public library)
🔍 Low name similarity → Uncertain, needs manual check
⚠️ Low score no ISIL → Uncertain, needs verification
""")

    print("=" * 70)
    print("Recommended Actions")
    print("=" * 70)
    print(f"""
1. Open flagged CSV: {Path(flagged_csv_path).as_posix()}
2. Focus on REVIEW_URGENT rows first ({flagged} matches)
   - Sort by auto_flag column
   - Start with 🚨 city/type mismatches (likely INCORRECT)
   - Then review 🔍 name pattern issues (needs judgment)
3. Fill validation_status for flagged rows:
   - City mismatch → INCORRECT
   - Type mismatch → INCORRECT
   - Branch vs main → INCORRECT (usually)
   - Name similarity issues → Needs manual judgment
4. Review OK rows ({ok} matches) - lower priority
   - These passed automated checks
   - Still review Priority 1-2 for safety
5. After review: python scripts/apply_wikidata_validation.py
""")
    print("=" * 70)


def main():
    """Load the fuzzy-match CSV, run the spot checks, write the flagged report and print a summary."""
    # Fail before doing any work if the fuzzy-matching dependency is missing.
    _require_rapidfuzz()

    print("=" * 70)
    print("Fast Automated Spot Checks for Wikidata Fuzzy Matches")
    print("=" * 70)

    # Load fuzzy matches
    csv_path = INPUT_CSV_PATH
    print(f"\nLoading fuzzy matches: {csv_path}")
    matches = load_csv_matches(csv_path)
    print(f" ✅ Loaded {len(matches)} matches")

    # Run fast spot checks (pattern-based, no Wikidata queries)
    results = run_fast_spot_checks(matches)

    # Generate flagged report
    output_path = FLAGGED_CSV_PATH
    print(f"\nGenerating flagged report: {output_path}")
    generate_flagged_report(results, output_path)
    size_kb = output_path.stat().st_size / 1024
    print(f" ✅ Saved flagged report ({size_kb:.1f} KB)")

    # Print summary
    generate_summary(results, output_path)

    print("\n✅ Fast Automated Spot Checks Complete")


if __name__ == '__main__':
    main()