glam/scripts/spot_check_fuzzy_matches_fast.py
2025-11-21 22:12:33 +01:00

365 lines
13 KiB
Python

"""
Fast Automated Spot Checks for Wikidata Fuzzy Matches
Pattern-based detection (no Wikidata queries needed for most checks):
1. City name mismatches (from CSV data)
2. Name pattern issues (branch suffixes, gymnasium libraries)
3. Low scores (<87%) without ISIL confirmation
4. Similar institution names in different cities
Generates prioritized review list with auto-detected issues.
"""
import json
import csv
import re
from pathlib import Path
from typing import Dict, List, Tuple
from rapidfuzz import fuzz
def load_csv_matches(csv_path: Path) -> List[Dict]:
    """Read every row of the fuzzy-match CSV into a list of dicts.

    Each row becomes one dict keyed by the CSV header columns.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        return list(csv.DictReader(handle))
def check_name_patterns(inst_name: str, wd_label: str) -> List[str]:
    """Flag suspicious naming patterns between our record and the Wikidata label.

    Runs five heuristics and returns a human-readable issue string for each
    one that fires (empty list when the pair looks consistent).
    """
    issues: List[str] = []

    # 1. Our name carries the ', Biblioteket' branch suffix that the
    #    Wikidata label lacks — possibly a branch matched to a main library.
    if ', Biblioteket' in inst_name and 'Bibliotek' not in wd_label:
        issues.append("Branch suffix ', Biblioteket' in our name but not Wikidata (branch vs main?)")

    # 2. A school (gymnasium) library matched against a public library.
    if 'Gymnasium' in inst_name and 'Gymnasium' not in wd_label and 'Bibliotek' in wd_label:
        issues.append("Our 'Gymnasium' library matched to public library (school vs public?)")

    # 3. "Kombi" libraries: the location prefix before "Kombi" should
    #    reappear somewhere in the Wikidata label.
    if 'Kombi' in inst_name:
        location_prefix = inst_name.split('Kombi')[0].strip()
        if location_prefix not in wd_label:
            issues.append(f"Kombi library location mismatch: '{location_prefix}' not in Wikidata label")

    # 4. Compare base names with common institutional suffixes stripped.
    our_base = re.sub(r',?\s*(Biblioteket|Bibliotek|Arkiv)$', '', inst_name).strip()
    wd_base = re.sub(r'\s*(Biblioteket|Bibliotek|Arkiv)$', '', wd_label).strip()
    similarity = fuzz.ratio(our_base.lower(), wd_base.lower())
    if similarity < 60:
        issues.append(f"Low name similarity ({similarity}%) - possibly different institutions")

    # 5. The leading word (often the city) differs between the two names.
    #    Only applied when overall similarity is already shaky (< 85).
    first_ours = inst_name.split()[0] if inst_name else ""
    first_wd = wd_label.split()[0] if wd_label else ""
    if len(first_ours) > 3 and len(first_wd) > 3:  # skip short words
        if first_ours.lower() != first_wd.lower() and similarity < 85:
            leading_similarity = fuzz.ratio(first_ours.lower(), first_wd.lower())
            if leading_similarity < 70:
                issues.append(f"First word differs: '{first_ours}' vs '{first_wd}' (city mismatch?)")

    return issues
def check_city_in_labels(our_city: str, inst_name: str, wd_label: str) -> List[str]:
    """Detect when the Wikidata label names a different Danish city than ours.

    Scans the label for well-known Danish city names; at most one mismatch
    is reported. Substring relations between the two city names are ignored
    to avoid false positives. (`inst_name` is currently unused but kept for
    a uniform checker signature.)
    """
    if not our_city:
        return []

    # Well-known Danish cities used as a mismatch vocabulary.
    known_cities = (
        'københavn', 'aarhus', 'odense', 'aalborg', 'frederiksberg',
        'esbjerg', 'randers', 'kolding', 'horsens', 'vejle',
        'roskilde', 'herning', 'helsingør', 'silkeborg', 'næstved',
        'fredericia', 'viborg', 'køge', 'holstebro', 'taastrup',
        'svendborg', 'hvidovre', 'hørsholm', 'greve', 'ballerup',
        'gladsaxe', 'gentofte', 'herlev', 'glostrup', 'albertslund',
    )

    city_ours = our_city.lower().strip()
    label_lower = wd_label.lower()
    issues: List[str] = []

    for candidate in known_cities:
        if candidate not in label_lower or candidate == city_ours:
            continue
        # Skip substring relations (e.g. one name contained in the other).
        if city_ours in candidate or candidate in city_ours:
            continue
        issues.append(f"City mismatch: our '{our_city}' but Wikidata mentions '{candidate}'")
        break

    return issues
def check_low_score_no_isil(match_score: float, our_isil: str) -> List[str]:
    """Flag a match scoring below 87% that has no ISIL code to verify against."""
    needs_flag = match_score < 87 and not our_isil
    return [f"Low confidence ({match_score:.1f}%) with no ISIL to verify"] if needs_flag else []
def check_institution_type_hints(inst_name: str, wd_label: str, inst_type: str) -> List[str]:
    """Spot hints that our record and the Wikidata entity differ in kind.

    NOTE(review): the original comment mentioned checking "vice versa"
    (keyword in Wikidata but not ours) but only this direction is
    implemented — preserved as-is.
    """
    issues: List[str] = []

    # Name keywords we expect for each of our institution types.
    keywords_by_type = {
        'LIBRARY': ['bibliotek', 'library'],
        'ARCHIVE': ['arkiv', 'archive', 'arkivet'],
        'MUSEUM': ['museum', 'museet'],
    }
    expected = keywords_by_type.get(inst_type, [])

    name_lower = inst_name.lower()
    label_lower = wd_label.lower()
    keyword_in_ours = any(word in name_lower for word in expected)
    keyword_in_wd = any(word in label_lower for word in expected)

    # Our name carries the type keyword but the Wikidata label does not.
    if keyword_in_ours and not keyword_in_wd:
        issues.append(f"Type keyword mismatch: our name has {inst_type} keyword, Wikidata doesn't")

    # A LIBRARY record matched to something Wikidata calls a museum/gallery.
    if inst_type == 'LIBRARY' and ('museum' in label_lower or 'gallery' in label_lower):
        issues.append("Type mismatch: we're LIBRARY but Wikidata mentions museum/gallery")

    return issues
def run_fast_spot_checks(matches: List[Dict]) -> List[Dict]:
    """
    Apply every pattern-based spot check to each fuzzy match.

    Returns a new list of match dicts, each copy extended with a
    'spot_check_issues' string (" | "-joined) and an 'auto_flag' value
    of 'REVIEW_URGENT' (issues found) or 'OK'.
    """
    print(f"\nRunning fast automated spot checks on {len(matches)} fuzzy matches...")
    print("Using pattern-based detection (no Wikidata queries needed)\n")

    annotated: List[Dict] = []
    flagged_total = 0
    counts_by_type: Dict[str, int] = {}

    for index, match in enumerate(matches, 1):
        # Lightweight progress ticker every 50 rows.
        if index % 50 == 0:
            print(f" Progress: {index}/{len(matches)} ({index/len(matches)*100:.1f}%)")

        name = match['institution_name']
        label = match['wikidata_label']
        city = match['city']
        institution_type = match['institution_type']
        isil = match['isil_code']
        score = float(match['match_score'])

        found: List[str] = []

        # Check 1: low score with no ISIL confirmation (counted once per row).
        hits = check_low_score_no_isil(score, isil)
        if hits:
            found.extend(f"⚠️ {hit}" for hit in hits)
            counts_by_type['Low score no ISIL'] = counts_by_type.get('Low score no ISIL', 0) + 1

        # Check 2: suspicious name patterns (counted per individual issue).
        hits = check_name_patterns(name, label)
        if hits:
            found.extend(f"🔍 {hit}" for hit in hits)
            counts_by_type['Name pattern'] = counts_by_type.get('Name pattern', 0) + len(hits)

        # Check 3: conflicting city names (counted once per row).
        hits = check_city_in_labels(city, name, label)
        if hits:
            found.extend(f"🚨 {hit}" for hit in hits)
            counts_by_type['City mismatch'] = counts_by_type.get('City mismatch', 0) + 1

        # Check 4: institution-type keyword hints (counted once per row).
        hits = check_institution_type_hints(name, label, institution_type)
        if hits:
            found.extend(f"⚠️ {hit}" for hit in hits)
            counts_by_type['Type hint'] = counts_by_type.get('Type hint', 0) + 1

        # Attach the verdict to a copy so the input rows stay untouched.
        annotated_match = match.copy()
        if found:
            annotated_match['spot_check_issues'] = " | ".join(found)
            annotated_match['auto_flag'] = 'REVIEW_URGENT'
            flagged_total += 1
        else:
            annotated_match['spot_check_issues'] = ''
            annotated_match['auto_flag'] = 'OK'
        annotated.append(annotated_match)

    print(f"\n ✅ Spot checks complete: {flagged_total}/{len(matches)} matches flagged\n")

    # Issue breakdown, most frequent first.
    print(" Issue type breakdown:")
    for issue_type, count in sorted(counts_by_type.items(), key=lambda item: -item[1]):
        print(f" {issue_type:<25}: {count:3d}")

    return annotated
def generate_flagged_report(results: List[Dict], output_path: Path):
    """Write the spot-check results to CSV, most urgent rows first.

    Sort order: REVIEW_URGENT before OK, then ascending priority,
    then ascending match score.
    """

    def order(row):
        # REVIEW_URGENT rows sort ahead of everything else.
        urgency = 0 if row['auto_flag'] == 'REVIEW_URGENT' else 1
        return (urgency, int(row['priority']), float(row['match_score']))

    ordered = sorted(results, key=order)

    # Column layout: verdict columns first, then the original match fields.
    columns = [
        'auto_flag',
        'spot_check_issues',
        'priority',
        'match_score',
        'institution_name',
        'wikidata_label',
        'city',
        'institution_type',
        'isil_code',
        'ghcid',
        'wikidata_qid',
        'wikidata_url',
        'validation_status',
        'validation_notes',
        'institution_id',
    ]

    with open(output_path, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        writer.writerows(ordered)
def generate_summary(results: List[Dict]):
    """Print a human-readable summary of the spot-check findings.

    Fix: the original divided by ``total`` unconditionally, raising
    ZeroDivisionError when ``results`` is empty; now it prints a short
    notice and returns instead.
    """
    total = len(results)
    if total == 0:
        # Guard: the percentage calculations below would divide by zero.
        print("\nNo fuzzy matches to summarize.")
        return
    flagged = sum(1 for r in results if r['auto_flag'] == 'REVIEW_URGENT')
    ok = total - flagged
    print("\n" + "=" * 70)
    print("Fast Automated Spot Check Summary")
    print("=" * 70)
    print(f"\n📊 Overall Results")
    print(f" Total fuzzy matches: {total}")
    print(f" Flagged issues: {flagged} ({flagged/total*100:.1f}%)")
    print(f" No issues detected: {ok} ({ok/total*100:.1f}%)")
    # Count flagged rows per priority bucket.
    flagged_by_priority = {}
    for result in results:
        if result['auto_flag'] == 'REVIEW_URGENT':
            priority = result['priority']
            flagged_by_priority[priority] = flagged_by_priority.get(priority, 0) + 1
    print(f"\n🎯 Flagged Matches by Priority")
    for priority in sorted(flagged_by_priority.keys()):
        count = flagged_by_priority[priority]
        print(f" Priority {priority}: {count:3d} flagged")
    # Show the first 10 flagged records for a quick eyeball check.
    print(f"\n🔍 Sample Flagged Records (Top 10)")
    flagged_records = [r for r in results if r['auto_flag'] == 'REVIEW_URGENT']
    for i, record in enumerate(flagged_records[:10], 1):
        print(f"\n {i}. Priority {record['priority']} - Score {record['match_score']}%")
        print(f" Institution: {record['institution_name']}")
        print(f" Wikidata: {record['wikidata_label']}")
        # Issues are stored " | "-joined; show at most the first two.
        issues_text = record['spot_check_issues']
        issues_list = issues_text.split(' | ')
        for issue in issues_list[:2]:  # Show first 2 issues
            print(f" {issue}")
        if len(issues_list) > 2:
            print(f" ... and {len(issues_list)-2} more issue(s)")
    print("\n" + "=" * 70)
    print("Interpretation Guide")
    print("=" * 70)
    print("""
🚨 City mismatch → Very likely INCORRECT (different cities)
⚠️ Type keyword mismatch → Likely INCORRECT (e.g., library vs museum)
🔍 Branch suffix → Probably INCORRECT (branch vs main library)
🔍 Gymnasium library → Likely INCORRECT (school vs public library)
🔍 Low name similarity → Uncertain, needs manual check
⚠️ Low score no ISIL → Uncertain, needs verification
""")
    print("=" * 70)
    print("Recommended Actions")
    print("=" * 70)
    print(f"""
1. Open flagged CSV: data/review/denmark_wikidata_fuzzy_matches_flagged.csv
2. Focus on REVIEW_URGENT rows first ({flagged} matches)
- Sort by auto_flag column
- Start with 🚨 city/type mismatches (likely INCORRECT)
- Then review 🔍 name pattern issues (needs judgment)
3. Fill validation_status for flagged rows:
- City mismatch → INCORRECT
- Type mismatch → INCORRECT
- Branch vs main → INCORRECT (usually)
- Name similarity issues → Needs manual judgment
4. Review OK rows ({ok} matches) - lower priority
- These passed automated checks
- Still review Priority 1-2 for safety
5. After review: python scripts/apply_wikidata_validation.py
""")
    print("=" * 70)
def main():
    """Run the full fast spot-check pipeline end to end."""
    input_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    flagged_csv = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')

    banner = "=" * 70
    print(banner)
    print("Fast Automated Spot Checks for Wikidata Fuzzy Matches")
    print(banner)

    # Load the fuzzy-match CSV produced by the matching pipeline.
    print(f"\nLoading fuzzy matches: {input_csv}")
    matches = load_csv_matches(input_csv)
    print(f" ✅ Loaded {len(matches)} matches")

    # Pattern-based checks only — no network calls to Wikidata.
    results = run_fast_spot_checks(matches)

    # Persist the flagged report next to the input file.
    print(f"\nGenerating flagged report: {flagged_csv}")
    generate_flagged_report(results, flagged_csv)
    size_kb = flagged_csv.stat().st_size / 1024
    print(f" ✅ Saved flagged report ({size_kb:.1f} KB)")

    generate_summary(results)
    print("\n✅ Fast Automated Spot Checks Complete")


if __name__ == '__main__':
    main()