glam/scripts/spot_check_fuzzy_matches.py
2025-11-21 22:12:33 +01:00

384 lines
13 KiB
Python

"""
Automated Spot Checks for Wikidata Fuzzy Matches
Programmatically detects obvious errors in fuzzy matches to prioritize manual review:
1. City name mismatches (different cities = likely wrong match)
2. Institution type mismatches (detected from Wikidata)
3. ISIL code conflicts (if Wikidata has different ISIL)
4. Name pattern issues (branch suffixes, gymnasium libraries)
5. Very low scores (<87%) with no ISIL confirmation
Generates prioritized review list with auto-detected issues.
"""
import json
import csv
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from rapidfuzz import fuzz
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
import time
def load_csv_matches(csv_path: Path) -> List[Dict]:
    """Read the fuzzy-match CSV at *csv_path* and return its rows as dicts.

    Args:
        csv_path: Path to a UTF-8 CSV file with a header row.

    Returns:
        One dict per data row, keyed by the header columns.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        # Materialize inside the with-block; DictReader is lazy.
        return list(csv.DictReader(handle))
def query_wikidata_entity(qid: str) -> Optional[Dict]:
    """
    Query Wikidata for entity details.

    Args:
        qid: Wikidata Q-number (e.g. 'Q42') interpolated into the SPARQL query.

    Returns:
        Dict with keys 'types' (distinct P31 type labels), 'isil' (P791 value
        or None) and 'city' (P131 label or None); None when the entity has no
        P31 binding or the request fails.
    """
    # One result row per P31 value; ISIL and city are optional properties.
    # Labels are resolved by the wikibase label service, Danish before English.
    query = f"""
    SELECT ?type ?typeLabel ?isil ?city ?cityLabel WHERE {{
        wd:{qid} wdt:P31 ?type .
        OPTIONAL {{ wd:{qid} wdt:P791 ?isil }}
        OPTIONAL {{ wd:{qid} wdt:P131 ?city }}
        SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "da,en"
        }}
    }}
    LIMIT 5
    """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    endpoint.setQuery(query)
    endpoint.setReturnFormat(SPARQL_JSON)
    # Wikimedia endpoints expect a descriptive User-Agent; default UAs may be throttled.
    endpoint.addCustomHttpHeader('User-Agent', 'GLAM-Spot-Check/1.0')
    try:
        results = endpoint.query().convert()
        bindings = results['results']['bindings']
        if not bindings:
            return None
        # Aggregate results: collect every distinct type label across rows,
        # but take ISIL/city from the first row only (assumed single-valued).
        types = [b.get('typeLabel', {}).get('value') for b in bindings]
        isil = bindings[0].get('isil', {}).get('value') if bindings else None
        city = bindings[0].get('cityLabel', {}).get('value') if bindings else None
        return {
            'types': list(set(filter(None, types))),
            'isil': isil,
            'city': city
        }
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller treat
        # this entity as having no Wikidata data.
        print(f" ⚠️ Error querying {qid}: {e}")
        return None
def check_city_mismatch(our_city: str, wd_city: Optional[str]) -> Tuple[bool, str]:
    """Flag a match whose city disagrees with Wikidata's city label.

    Tolerates case/whitespace differences, substring containment and minor
    spelling variation; only clearly different cities are flagged.

    Returns:
        (True, message) when the cities look different, else (False, "").
    """
    # Without both values there is nothing to compare.
    if not (our_city and wd_city):
        return False, ""
    ours = our_city.lower().strip()
    theirs = wd_city.lower().strip()
    # Accept exact match, containment either way, or near-identical spelling.
    compatible = (
        ours == theirs
        or ours in theirs
        or theirs in ours
        or fuzz.ratio(ours, theirs) > 85
    )
    if compatible:
        return False, ""
    return True, f"City mismatch: '{our_city}' vs Wikidata '{wd_city}'"
def check_isil_conflict(our_isil: str, wd_isil: Optional[str]) -> Tuple[bool, str]:
    """Report a conflict when both sides carry an ISIL code and they differ.

    Returns:
        (True, message) on differing codes, else (False, "").
    """
    # Only comparable when both codes are present.
    if our_isil and wd_isil:
        if our_isil.strip() != wd_isil.strip():
            return True, f"ISIL conflict: our '{our_isil}' vs Wikidata '{wd_isil}'"
    return False, ""
def check_type_mismatch(our_type: str, wd_types: List[str]) -> Tuple[bool, str]:
    """Compare our coarse institution type against Wikidata's type labels.

    Returns:
        (True, message) when none of the Wikidata labels contains an expected
        keyword for *our_type*; (False, "") when any label matches or when
        Wikidata returned no types at all.
    """
    if not wd_types:
        return False, ""
    # Map our types to Wikidata type labels
    type_mappings = {
        'LIBRARY': ['library', 'public library', 'academic library',
                    'university library', 'national library'],
        'ARCHIVE': ['archive', 'archives', 'archival institution',
                    'state archive', 'national archives'],
        'MUSEUM': ['museum', 'art museum', 'history museum']
    }
    expected = type_mappings.get(our_type, [])
    # Substring test: any Wikidata label containing an expected keyword counts.
    compatible = any(
        keyword in label.lower()
        for label in wd_types
        for keyword in expected
    )
    if compatible:
        return False, ""
    return True, f"Type mismatch: our {our_type} vs Wikidata {', '.join(wd_types[:3])}"
def check_name_patterns(inst_name: str, wd_label: str) -> Tuple[bool, str]:
    """Detect name patterns suggesting the match pairs different institutions.

    Returns:
        (True, "; "-joined findings) when any pattern fires, else (False, "").
    """
    findings = []
    # Pattern 1: our record looks like a branch ("..., Biblioteket") while
    # the Wikidata label does not.
    if ', Biblioteket' in inst_name and ', Biblioteket' not in wd_label:
        findings.append("Our name has ', Biblioteket' suffix (branch?), Wikidata doesn't")
    # Pattern 2: our record names a gymnasium (school) library; Wikidata doesn't.
    if 'Gymnasium' in inst_name and 'Gymnasium' not in wd_label:
        findings.append("Our name has 'Gymnasium' (school library?), Wikidata doesn't")
    # Pattern 3: strip the known suffixes and fuzzily compare the base names;
    # a very low score hints at entirely different institutions.
    our_base = re.sub(r',\s*Biblioteket$', '', inst_name)
    wd_base = re.sub(r'\s+Bibliotek$', '', wd_label)
    similarity = fuzz.ratio(our_base.lower(), wd_base.lower())
    if similarity < 60:
        findings.append(f"Low name similarity ({similarity}%) - possibly different institutions")
    if findings:
        return True, "; ".join(findings)
    return False, ""
def check_low_score_no_isil(match_score: float, our_isil: str) -> Tuple[bool, str]:
    """Flag sub-87% matches that lack an ISIL code for independent verification.

    Returns:
        (True, message) when the score is below 87 and no ISIL exists,
        else (False, "").
    """
    needs_review = match_score < 87 and not our_isil
    if needs_review:
        return True, f"Low score ({match_score}%) with no ISIL to verify"
    return False, ""
def run_spot_checks(matches: List[Dict]) -> List[Dict]:
    """
    Run automated spot checks on all fuzzy matches.

    Queries Wikidata once per match (rate-limited), applies the five heuristic
    checks, and annotates each row with 'spot_check_issues' and 'auto_flag'
    ('REVIEW_URGENT' when any check fired, otherwise 'OK').

    Returns list of matches with spot_check_issues field added.
    """
    print(f"\nRunning automated spot checks on {len(matches)} fuzzy matches...")
    print("This will query Wikidata for each Q-number (may take ~5 minutes)\n")
    total = len(matches)
    annotated_rows = []
    flagged_count = 0
    for idx, row in enumerate(matches, 1):
        if idx % 20 == 0:
            print(f" Progress: {idx}/{total} ({idx/total*100:.1f}%)")
        score = float(row['match_score'])
        problems = []
        # Check 1: Low score without ISIL
        hit, msg = check_low_score_no_isil(score, row['isil_code'])
        if hit:
            problems.append(f"⚠️ {msg}")
        # Check 2: Name patterns
        hit, msg = check_name_patterns(row['institution_name'], row['wikidata_label'])
        if hit:
            problems.append(f"🔍 {msg}")
        # Remote checks need entity details from Wikidata.
        wd_data = query_wikidata_entity(row['wikidata_qid'])
        time.sleep(0.5)  # Rate limiting (2 req/sec)
        if wd_data:
            # Check 3: City mismatch
            hit, msg = check_city_mismatch(row['city'], wd_data.get('city'))
            if hit:
                problems.append(f"🚨 {msg}")
            # Check 4: ISIL conflict
            hit, msg = check_isil_conflict(row['isil_code'], wd_data.get('isil'))
            if hit:
                problems.append(f"🚨 {msg}")
            # Check 5: Type mismatch
            hit, msg = check_type_mismatch(row['institution_type'], wd_data.get('types', []))
            if hit:
                problems.append(f"⚠️ {msg}")
        # Annotate a copy so the caller's input rows stay untouched.
        annotated = dict(row)
        if problems:
            annotated['spot_check_issues'] = " | ".join(problems)
            annotated['auto_flag'] = 'REVIEW_URGENT'
            flagged_count += 1
        else:
            annotated['spot_check_issues'] = ''
            annotated['auto_flag'] = 'OK'
        annotated_rows.append(annotated)
    print(f"\n ✅ Spot checks complete: {flagged_count}/{total} matches flagged")
    return annotated_rows
def generate_flagged_report(results: List[Dict], output_path: Path):
    """Write spot-check results to a CSV, worst offenders first.

    Args:
        results: Annotated match rows from run_spot_checks().
        output_path: Destination CSV path (overwritten).
    """
    # Sort by: auto_flag (REVIEW_URGENT first), then priority, then score
    ordered = sorted(
        results,
        key=lambda row: (
            0 if row['auto_flag'] == 'REVIEW_URGENT' else 1,
            int(row['priority']),
            float(row['match_score']),
        ),
    )
    # Spot-check columns lead so reviewers see the flags immediately.
    fieldnames = [
        'auto_flag',
        'spot_check_issues',
        'priority',
        'match_score',
        'institution_name',
        'wikidata_label',
        'city',
        'institution_type',
        'isil_code',
        'ghcid',
        'wikidata_qid',
        'wikidata_url',
        'validation_status',
        'validation_notes',
        'institution_id'
    ]
    with open(output_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(ordered)
def generate_summary(results: List[Dict]):
    """Print a console summary of the spot-check findings.

    Shows overall flag counts, a per-issue-type breakdown (counted by
    substring markers in 'spot_check_issues'), the top five flagged records,
    and the recommended next steps.
    """
    total = len(results)
    flagged = len([r for r in results if r['auto_flag'] == 'REVIEW_URGENT'])
    ok = total - flagged
    print("\n" + "=" * 70)
    print("Automated Spot Check Summary")
    print("=" * 70)
    print(f"\n📊 Overall Results")
    print(f" Total fuzzy matches: {total}")
    print(f" Flagged issues: {flagged} ({flagged/total*100:.1f}%)")
    print(f" No issues detected: {ok} ({ok/total*100:.1f}%)")
    # Tally issue categories by the marker substrings each check emits.
    tallies = {
        'City mismatch': 0,
        'ISIL conflict': 0,
        'Type mismatch': 0,
        'Low score no ISIL': 0,
        'Name pattern issue': 0
    }
    for row in results:
        text = row.get('spot_check_issues', '')
        if 'City mismatch' in text:
            tallies['City mismatch'] += 1
        if 'ISIL conflict' in text:
            tallies['ISIL conflict'] += 1
        if 'Type mismatch' in text:
            tallies['Type mismatch'] += 1
        if 'Low score' in text and 'no ISIL' in text:
            tallies['Low score no ISIL'] += 1
        if any(marker in text for marker in ('Biblioteket', 'Gymnasium', 'similarity')):
            tallies['Name pattern issue'] += 1
    print(f"\n🚨 Issue Breakdown")
    for label, count in sorted(tallies.items(), key=lambda item: -item[1]):
        if count > 0:
            print(f" {label:<25}: {count:3d} matches")
    # Show the first few urgent rows so reviewers can sanity-check quickly.
    print(f"\n🔍 Sample Flagged Records (Top 5)")
    urgent = [r for r in results if r['auto_flag'] == 'REVIEW_URGENT']
    for rank, record in enumerate(urgent[:5], 1):
        print(f"\n {rank}. Priority {record['priority']} - Score {record['match_score']}%")
        print(f" Institution: {record['institution_name']}")
        print(f" Wikidata: {record['wikidata_label']}")
        print(f" Issues: {record['spot_check_issues']}")
    print("\n" + "=" * 70)
    print("Next Steps")
    print("=" * 70)
    print(f"""
1. Review flagged CSV: data/review/denmark_wikidata_fuzzy_matches_flagged.csv
2. Focus on REVIEW_URGENT rows first ({flagged} matches)
3. Fill validation_status for flagged rows:
- City/ISIL conflicts → Likely INCORRECT
- Type mismatches → Likely INCORRECT
- Name pattern issues → Needs manual judgment
4. Then review remaining OK rows ({ok} matches)
5. Run: python scripts/apply_wikidata_validation.py
""")
    print("=" * 70)
def main():
    """Entry point: load fuzzy matches, spot-check them, and write the report."""
    print("=" * 70)
    print("Automated Spot Checks for Wikidata Fuzzy Matches")
    print("=" * 70)
    # Input CSV produced by the earlier fuzzy-matching step.
    input_path = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    print(f"\nLoading fuzzy matches: {input_path}")
    rows = load_csv_matches(input_path)
    print(f" ✅ Loaded {len(rows)} matches")
    # Runs the heuristic checks; queries Wikidata once per match.
    checked = run_spot_checks(rows)
    # Write the prioritized, flag-annotated review CSV.
    report_path = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')
    print(f"\nGenerating flagged report: {report_path}")
    generate_flagged_report(checked, report_path)
    print(f" ✅ Saved flagged report")
    # Console summary for the operator.
    generate_summary(checked)
    print("\n✅ Automated Spot Checks Complete")


if __name__ == '__main__':
    main()