#!/usr/bin/env python3
"""
|
|
Fast Automated Spot Checks for Wikidata Fuzzy Matches
|
|
|
|
Pattern-based detection (no Wikidata queries needed for most checks):
|
|
1. City name mismatches (from CSV data)
|
|
2. Name pattern issues (branch suffixes, gymnasium libraries)
|
|
3. Low scores (<87%) without ISIL confirmation
|
|
4. Similar institution names in different cities
|
|
|
|
Generates prioritized review list with auto-detected issues.
|
|
"""
|
|
|
|
import json
import csv
import re
from pathlib import Path
from typing import Dict, List, Tuple

from rapidfuzz import fuzz
|
|
|
|
|
|
def load_csv_matches(csv_path: Path) -> List[Dict]:
    """Read the fuzzy-match CSV and return its rows as dicts."""
    with open(csv_path, 'r', encoding='utf-8') as handle:
        # DictReader maps every row onto the header columns; materialize
        # the rows before the file is closed.
        return [dict(row) for row in csv.DictReader(handle)]
|
|
|
|
|
|
def check_name_patterns(inst_name: str, wd_label: str) -> List[str]:
    """Check a matched name pair for problematic patterns.

    Args:
        inst_name: Our institution name.
        wd_label: The matched Wikidata label.

    Returns:
        Human-readable issue descriptions (empty when nothing is flagged).
    """
    issues = []

    # Pattern 1: Branch suffix in our name but not Wikidata
    if ', Biblioteket' in inst_name and 'Bibliotek' not in wd_label:
        issues.append("Branch suffix ', Biblioteket' in our name but not Wikidata (branch vs main?)")

    # Pattern 2: Gymnasium library vs public library
    if 'Gymnasium' in inst_name and 'Gymnasium' not in wd_label:
        if 'Bibliotek' in wd_label:
            issues.append("Our 'Gymnasium' library matched to public library (school vs public?)")

    # Pattern 3: Kombi Bibliotek (different institutions)
    if 'Kombi' in inst_name:
        # Extract city/location before "Kombi"
        our_location = inst_name.split('Kombi')[0].strip()
        if our_location not in wd_label:
            issues.append(f"Kombi library location mismatch: '{our_location}' not in Wikidata label")

    # Pattern 4: Different base institution names.
    # Remove common suffixes for comparison; 'Biblioteket' must precede
    # 'Bibliotek' in the alternation so the longer form is stripped first.
    our_base = re.sub(r',?\s*(Biblioteket|Bibliotek|Arkiv)$', '', inst_name).strip()
    wd_base = re.sub(r'\s*(Biblioteket|Bibliotek|Arkiv)$', '', wd_label).strip()

    similarity = fuzz.ratio(our_base.lower(), wd_base.lower())
    if similarity < 60:
        issues.append(f"Low name similarity ({similarity}%) - possibly different institutions")

    # Pattern 5: City/location names differ — extract first word (often a
    # city name).
    # BUG FIX: split before indexing. The previous form
    # `inst_name.split()[0] if inst_name else ""` raised IndexError for a
    # whitespace-only name (truthy string, empty split result).
    our_words = inst_name.split()
    wd_words = wd_label.split()
    our_first = our_words[0] if our_words else ""
    wd_first = wd_words[0] if wd_words else ""

    if len(our_first) > 3 and len(wd_first) > 3:  # Avoid short words
        # `similarity` is the base-name score from Pattern 4: only flag a
        # first-word difference when the overall names are also dissimilar.
        if our_first.lower() != wd_first.lower() and similarity < 85:
            first_sim = fuzz.ratio(our_first.lower(), wd_first.lower())
            if first_sim < 70:
                issues.append(f"First word differs: '{our_first}' vs '{wd_first}' (city mismatch?)")

    return issues
|
|
|
|
|
|
def check_city_in_labels(our_city: str, inst_name: str, wd_label: str) -> List[str]:
    """Check if city names are consistent."""
    # NOTE: `inst_name` is accepted for signature parity with the other
    # checks but is not consulted here.
    if not our_city:
        return []

    city_norm = our_city.lower().strip()
    label_norm = wd_label.lower()

    # Common Danish cities to compare the Wikidata label against.
    known_cities = (
        'københavn', 'aarhus', 'odense', 'aalborg', 'frederiksberg',
        'esbjerg', 'randers', 'kolding', 'horsens', 'vejle',
        'roskilde', 'herning', 'helsingør', 'silkeborg', 'næstved',
        'fredericia', 'viborg', 'køge', 'holstebro', 'taastrup',
        'svendborg', 'hvidovre', 'hørsholm', 'greve', 'ballerup',
        'gladsaxe', 'gentofte', 'herlev', 'glostrup', 'albertslund',
    )

    found = []
    for candidate in known_cities:
        # Only cities that appear in the label and differ from ours matter.
        if candidate not in label_norm or candidate == city_norm:
            continue
        # Skip substring relationships (e.g. partial city spellings).
        if city_norm in candidate or candidate in city_norm:
            continue
        found.append(f"City mismatch: our '{our_city}' but Wikidata mentions '{candidate}'")
        break  # one mismatch is enough to flag the row

    return found
|
|
|
|
|
|
def check_low_score_no_isil(match_score: float, our_isil: str) -> List[str]:
    """Flag low scores without ISIL confirmation."""
    # Scores under 87% are only trusted when an ISIL code backs them up.
    if our_isil or match_score >= 87:
        return []
    return [f"Low confidence ({match_score:.1f}%) with no ISIL to verify"]
|
|
|
|
|
|
def check_institution_type_hints(inst_name: str, wd_label: str, inst_type: str) -> List[str]:
    """Check for type mismatch hints in names."""
    issues: List[str] = []

    # Keywords expected in a name for each institution type.
    keywords_by_type = {
        'LIBRARY': ['bibliotek', 'library'],
        'ARCHIVE': ['arkiv', 'archive', 'arkivet'],
        'MUSEUM': ['museum', 'museet'],
    }
    keywords = keywords_by_type.get(inst_type, [])

    name_lower = inst_name.lower()
    label_lower = wd_label.lower()

    # Our name carries a type keyword that the Wikidata label lacks.
    ours_typed = any(kw in name_lower for kw in keywords)
    label_typed = any(kw in label_lower for kw in keywords)
    if ours_typed and not label_typed:
        issues.append(f"Type keyword mismatch: our name has {inst_type} keyword, Wikidata doesn't")

    # A library matched to something Wikidata describes as a museum/gallery.
    if inst_type == 'LIBRARY' and ('museum' in label_lower or 'gallery' in label_lower):
        issues.append("Type mismatch: we're LIBRARY but Wikidata mentions museum/gallery")

    return issues
|
|
|
|
|
|
def run_fast_spot_checks(matches: List[Dict]) -> List[Dict]:
    """
    Run fast pattern-based spot checks on all fuzzy matches.

    Returns list of matches with spot_check_issues field added.
    """
    total = len(matches)
    print(f"\nRunning fast automated spot checks on {total} fuzzy matches...")
    print("Using pattern-based detection (no Wikidata queries needed)\n")

    results: List[Dict] = []
    flagged_total = 0
    counts_by_type: Dict[str, int] = {}

    def _tally(label: str, amount: int = 1) -> None:
        # Track how often each issue category fires, for the breakdown print.
        counts_by_type[label] = counts_by_type.get(label, 0) + amount

    for idx, match in enumerate(matches, 1):
        if idx % 50 == 0:
            print(f" Progress: {idx}/{total} ({idx/total*100:.1f}%)")

        inst_name = match['institution_name']
        wd_label = match['wikidata_label']
        record_issues: List[str] = []

        # Check 1: Low score without ISIL
        found = check_low_score_no_isil(float(match['match_score']), match['isil_code'])
        if found:
            record_issues.extend(f"⚠️ {issue}" for issue in found)
            _tally('Low score no ISIL')

        # Check 2: Name patterns (counted per individual issue)
        found = check_name_patterns(inst_name, wd_label)
        if found:
            record_issues.extend(f"🔍 {issue}" for issue in found)
            _tally('Name pattern', len(found))

        # Check 3: City names
        found = check_city_in_labels(match['city'], inst_name, wd_label)
        if found:
            record_issues.extend(f"🚨 {issue}" for issue in found)
            _tally('City mismatch')

        # Check 4: Institution type hints
        found = check_institution_type_hints(inst_name, wd_label, match['institution_type'])
        if found:
            record_issues.extend(f"⚠️ {issue}" for issue in found)
            _tally('Type hint')

        # Annotate a copy of the row with the spot-check outcome.
        annotated = match.copy()
        if record_issues:
            annotated['spot_check_issues'] = " | ".join(record_issues)
            annotated['auto_flag'] = 'REVIEW_URGENT'
            flagged_total += 1
        else:
            annotated['spot_check_issues'] = ''
            annotated['auto_flag'] = 'OK'
        results.append(annotated)

    print(f"\n ✅ Spot checks complete: {flagged_total}/{total} matches flagged\n")

    # Print issue breakdown, most frequent category first.
    print(" Issue type breakdown:")
    for issue_type, count in sorted(counts_by_type.items(), key=lambda item: -item[1]):
        print(f" {issue_type:<25}: {count:3d}")

    return results
|
|
|
|
|
|
def generate_flagged_report(results: List[Dict], output_path: Path):
    """Generate CSV with spot check results."""

    # Flagged rows first, then ascending priority, then ascending score.
    def row_rank(row: Dict) -> Tuple[int, int, float]:
        urgency = 0 if row['auto_flag'] == 'REVIEW_URGENT' else 1
        return (urgency, int(row['priority']), float(row['match_score']))

    ordered = sorted(results, key=row_rank)

    # Column order for the review spreadsheet (spot-check columns first).
    columns = [
        'auto_flag', 'spot_check_issues', 'priority', 'match_score',
        'institution_name', 'wikidata_label', 'city', 'institution_type',
        'isil_code', 'ghcid', 'wikidata_qid', 'wikidata_url',
        'validation_status', 'validation_notes', 'institution_id',
    ]

    with open(output_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(ordered)
|
|
|
|
|
|
def generate_summary(results: List[Dict]):
    """Print summary of spot check findings.

    Args:
        results: Rows produced by ``run_fast_spot_checks`` (each carries
            ``auto_flag``, ``priority``, ``match_score``,
            ``spot_check_issues`` etc.).
    """
    total = len(results)

    # BUG FIX: guard against an empty result set, which previously raised
    # ZeroDivisionError in the percentage calculations below.
    if total == 0:
        print("\nNo fuzzy matches to summarize.")
        return

    flagged = sum(1 for r in results if r['auto_flag'] == 'REVIEW_URGENT')
    ok = total - flagged

    print("\n" + "=" * 70)
    print("Fast Automated Spot Check Summary")
    print("=" * 70)

    print(f"\n📊 Overall Results")
    print(f" Total fuzzy matches: {total}")
    print(f" Flagged issues: {flagged} ({flagged/total*100:.1f}%)")
    print(f" No issues detected: {ok} ({ok/total*100:.1f}%)")

    # Count flagged rows per priority bucket
    flagged_by_priority = {}
    for result in results:
        if result['auto_flag'] == 'REVIEW_URGENT':
            priority = result['priority']
            flagged_by_priority[priority] = flagged_by_priority.get(priority, 0) + 1

    print(f"\n🎯 Flagged Matches by Priority")
    for priority in sorted(flagged_by_priority.keys()):
        count = flagged_by_priority[priority]
        print(f" Priority {priority}: {count:3d} flagged")

    # Sample flagged records
    print(f"\n🔍 Sample Flagged Records (Top 10)")
    flagged_records = [r for r in results if r['auto_flag'] == 'REVIEW_URGENT']

    for i, record in enumerate(flagged_records[:10], 1):
        print(f"\n {i}. Priority {record['priority']} - Score {record['match_score']}%")
        print(f" Institution: {record['institution_name']}")
        print(f" Wikidata: {record['wikidata_label']}")

        # Parse issues; limit display to the first two per record
        issues_text = record['spot_check_issues']
        issues_list = issues_text.split(' | ')
        for issue in issues_list[:2]:
            print(f" {issue}")
        if len(issues_list) > 2:
            print(f" ... and {len(issues_list)-2} more issue(s)")

    print("\n" + "=" * 70)
    print("Interpretation Guide")
    print("=" * 70)
    print("""
🚨 City mismatch → Very likely INCORRECT (different cities)
⚠️ Type keyword mismatch → Likely INCORRECT (e.g., library vs museum)
🔍 Branch suffix → Probably INCORRECT (branch vs main library)
🔍 Gymnasium library → Likely INCORRECT (school vs public library)
🔍 Low name similarity → Uncertain, needs manual check
⚠️ Low score no ISIL → Uncertain, needs verification
""")

    print("=" * 70)
    print("Recommended Actions")
    print("=" * 70)
    print(f"""
1. Open flagged CSV: data/review/denmark_wikidata_fuzzy_matches_flagged.csv

2. Focus on REVIEW_URGENT rows first ({flagged} matches)
 - Sort by auto_flag column
 - Start with 🚨 city/type mismatches (likely INCORRECT)
 - Then review 🔍 name pattern issues (needs judgment)

3. Fill validation_status for flagged rows:
 - City mismatch → INCORRECT
 - Type mismatch → INCORRECT
 - Branch vs main → INCORRECT (usually)
 - Name similarity issues → Needs manual judgment

4. Review OK rows ({ok} matches) - lower priority
 - These passed automated checks
 - Still review Priority 1-2 for safety

5. After review: python scripts/apply_wikidata_validation.py
""")

    print("=" * 70)
|
|
|
|
|
|
def main():
    """Entry point: load matches, run checks, write report, print summary."""
    print("=" * 70)
    print("Fast Automated Spot Checks for Wikidata Fuzzy Matches")
    print("=" * 70)

    # Load fuzzy matches from the review CSV.
    source_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    print(f"\nLoading fuzzy matches: {source_csv}")
    fuzzy_matches = load_csv_matches(source_csv)
    print(f" ✅ Loaded {len(fuzzy_matches)} matches")

    # Pattern-based checks only; no Wikidata queries are issued.
    checked = run_fast_spot_checks(fuzzy_matches)

    # Write the prioritized, flagged review CSV.
    report_csv = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')
    print(f"\nGenerating flagged report: {report_csv}")
    generate_flagged_report(checked, report_csv)
    report_kb = report_csv.stat().st_size / 1024
    print(f" ✅ Saved flagged report ({report_kb:.1f} KB)")

    # Console summary for the reviewer.
    generate_summary(checked)

    print("\n✅ Fast Automated Spot Checks Complete")


if __name__ == '__main__':
    main()
|