glam/scripts/spot_check_fuzzy_matches_fast.py
2025-11-21 22:12:33 +01:00

365 lines
13 KiB
Python

"""
Fast Automated Spot Checks for Wikidata Fuzzy Matches
Pattern-based detection (no Wikidata queries needed for most checks):
1. City name mismatches (from CSV data)
2. Name pattern issues (branch suffixes, gymnasium libraries)
3. Low scores (<87%) without ISIL confirmation
4. Similar institution names in different cities
Generates prioritized review list with auto-detected issues.
"""
import json
import csv
import re
from pathlib import Path
from typing import Dict, List, Tuple
from rapidfuzz import fuzz
def load_csv_matches(csv_path: Path) -> List[Dict]:
    """Read every row of the fuzzy-match CSV into a list of dicts.

    Each row becomes one dict keyed by the CSV header columns.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        return list(csv.DictReader(handle))
def check_name_patterns(inst_name: str, wd_label: str) -> List[str]:
    """Flag suspicious naming patterns between our record and the Wikidata label.

    Runs five heuristics and returns a human-readable issue string for each
    one that fires (empty list when the pair looks consistent).
    """
    issues: List[str] = []

    # 1. Our name carries the ', Biblioteket' branch suffix that the
    #    Wikidata label lacks — possibly a branch matched to a main library.
    if ', Biblioteket' in inst_name and 'Bibliotek' not in wd_label:
        issues.append("Branch suffix ', Biblioteket' in our name but not Wikidata (branch vs main?)")

    # 2. A school (gymnasium) library matched against a public library.
    if 'Gymnasium' in inst_name and 'Gymnasium' not in wd_label and 'Bibliotek' in wd_label:
        issues.append("Our 'Gymnasium' library matched to public library (school vs public?)")

    # 3. "Kombi" libraries: the location prefix before "Kombi" should
    #    reappear somewhere in the Wikidata label.
    if 'Kombi' in inst_name:
        location_prefix = inst_name.split('Kombi')[0].strip()
        if location_prefix not in wd_label:
            issues.append(f"Kombi library location mismatch: '{location_prefix}' not in Wikidata label")

    # 4. Compare base names with common institutional suffixes stripped.
    our_base = re.sub(r',?\s*(Biblioteket|Bibliotek|Arkiv)$', '', inst_name).strip()
    wd_base = re.sub(r'\s*(Biblioteket|Bibliotek|Arkiv)$', '', wd_label).strip()
    similarity = fuzz.ratio(our_base.lower(), wd_base.lower())
    if similarity < 60:
        issues.append(f"Low name similarity ({similarity}%) - possibly different institutions")

    # 5. The leading word (often the city) differs between the two names.
    #    Only applied when overall similarity is already shaky (< 85).
    first_ours = inst_name.split()[0] if inst_name else ""
    first_wd = wd_label.split()[0] if wd_label else ""
    if len(first_ours) > 3 and len(first_wd) > 3:  # skip short words
        if first_ours.lower() != first_wd.lower() and similarity < 85:
            leading_similarity = fuzz.ratio(first_ours.lower(), first_wd.lower())
            if leading_similarity < 70:
                issues.append(f"First word differs: '{first_ours}' vs '{first_wd}' (city mismatch?)")

    return issues
def check_city_in_labels(our_city: str, inst_name: str, wd_label: str) -> List[str]:
    """Detect when the Wikidata label names a different Danish city than ours.

    Scans the label for well-known Danish city names; at most one mismatch
    is reported. Substring relations between the two city names are ignored
    to avoid false positives. (`inst_name` is currently unused but kept for
    a uniform checker signature.)
    """
    if not our_city:
        return []

    # Well-known Danish cities used as a mismatch vocabulary.
    known_cities = (
        'københavn', 'aarhus', 'odense', 'aalborg', 'frederiksberg',
        'esbjerg', 'randers', 'kolding', 'horsens', 'vejle',
        'roskilde', 'herning', 'helsingør', 'silkeborg', 'næstved',
        'fredericia', 'viborg', 'køge', 'holstebro', 'taastrup',
        'svendborg', 'hvidovre', 'hørsholm', 'greve', 'ballerup',
        'gladsaxe', 'gentofte', 'herlev', 'glostrup', 'albertslund',
    )

    city_ours = our_city.lower().strip()
    label_lower = wd_label.lower()
    issues: List[str] = []

    for candidate in known_cities:
        if candidate not in label_lower or candidate == city_ours:
            continue
        # Skip substring relations (e.g. one name contained in the other).
        if city_ours in candidate or candidate in city_ours:
            continue
        issues.append(f"City mismatch: our '{our_city}' but Wikidata mentions '{candidate}'")
        break

    return issues
def check_low_score_no_isil(match_score: float, our_isil: str) -> List[str]:
    """Flag a match scoring below 87% that has no ISIL code to verify against."""
    needs_flag = match_score < 87 and not our_isil
    return [f"Low confidence ({match_score:.1f}%) with no ISIL to verify"] if needs_flag else []
def check_institution_type_hints(inst_name: str, wd_label: str, inst_type: str) -> List[str]:
    """Spot hints that our record and the Wikidata entity differ in kind.

    NOTE(review): the original comment mentioned checking "vice versa"
    (keyword in Wikidata but not ours) but only this direction is
    implemented — preserved as-is.
    """
    issues: List[str] = []

    # Name keywords we expect for each of our institution types.
    keywords_by_type = {
        'LIBRARY': ['bibliotek', 'library'],
        'ARCHIVE': ['arkiv', 'archive', 'arkivet'],
        'MUSEUM': ['museum', 'museet'],
    }
    expected = keywords_by_type.get(inst_type, [])

    name_lower = inst_name.lower()
    label_lower = wd_label.lower()
    keyword_in_ours = any(word in name_lower for word in expected)
    keyword_in_wd = any(word in label_lower for word in expected)

    # Our name carries the type keyword but the Wikidata label does not.
    if keyword_in_ours and not keyword_in_wd:
        issues.append(f"Type keyword mismatch: our name has {inst_type} keyword, Wikidata doesn't")

    # A LIBRARY record matched to something Wikidata calls a museum/gallery.
    if inst_type == 'LIBRARY' and ('museum' in label_lower or 'gallery' in label_lower):
        issues.append("Type mismatch: we're LIBRARY but Wikidata mentions museum/gallery")

    return issues
def run_fast_spot_checks(matches: List[Dict]) -> List[Dict]:
    """
    Apply every pattern-based spot check to each fuzzy match.

    Returns a new list of match dicts, each copy extended with a
    'spot_check_issues' string (" | "-joined) and an 'auto_flag' value
    of 'REVIEW_URGENT' (issues found) or 'OK'.
    """
    print(f"\nRunning fast automated spot checks on {len(matches)} fuzzy matches...")
    print("Using pattern-based detection (no Wikidata queries needed)\n")

    annotated: List[Dict] = []
    flagged_total = 0
    counts_by_type: Dict[str, int] = {}

    for index, match in enumerate(matches, 1):
        # Lightweight progress ticker every 50 rows.
        if index % 50 == 0:
            print(f" Progress: {index}/{len(matches)} ({index/len(matches)*100:.1f}%)")

        name = match['institution_name']
        label = match['wikidata_label']
        city = match['city']
        institution_type = match['institution_type']
        isil = match['isil_code']
        score = float(match['match_score'])

        found: List[str] = []

        # Check 1: low score with no ISIL confirmation (counted once per row).
        hits = check_low_score_no_isil(score, isil)
        if hits:
            found.extend(f"⚠️ {hit}" for hit in hits)
            counts_by_type['Low score no ISIL'] = counts_by_type.get('Low score no ISIL', 0) + 1

        # Check 2: suspicious name patterns (counted per individual issue).
        hits = check_name_patterns(name, label)
        if hits:
            found.extend(f"🔍 {hit}" for hit in hits)
            counts_by_type['Name pattern'] = counts_by_type.get('Name pattern', 0) + len(hits)

        # Check 3: conflicting city names (counted once per row).
        hits = check_city_in_labels(city, name, label)
        if hits:
            found.extend(f"🚨 {hit}" for hit in hits)
            counts_by_type['City mismatch'] = counts_by_type.get('City mismatch', 0) + 1

        # Check 4: institution-type keyword hints (counted once per row).
        hits = check_institution_type_hints(name, label, institution_type)
        if hits:
            found.extend(f"⚠️ {hit}" for hit in hits)
            counts_by_type['Type hint'] = counts_by_type.get('Type hint', 0) + 1

        # Attach the verdict to a copy so the input rows stay untouched.
        annotated_match = match.copy()
        if found:
            annotated_match['spot_check_issues'] = " | ".join(found)
            annotated_match['auto_flag'] = 'REVIEW_URGENT'
            flagged_total += 1
        else:
            annotated_match['spot_check_issues'] = ''
            annotated_match['auto_flag'] = 'OK'
        annotated.append(annotated_match)

    print(f"\n ✅ Spot checks complete: {flagged_total}/{len(matches)} matches flagged\n")

    # Issue breakdown, most frequent first.
    print(" Issue type breakdown:")
    for issue_type, count in sorted(counts_by_type.items(), key=lambda item: -item[1]):
        print(f" {issue_type:<25}: {count:3d}")

    return annotated
def generate_flagged_report(results: List[Dict], output_path: Path):
    """Write the spot-check results to CSV, most urgent rows first.

    Sort order: REVIEW_URGENT before OK, then ascending priority,
    then ascending match score.
    """

    def order(row):
        # REVIEW_URGENT rows sort ahead of everything else.
        urgency = 0 if row['auto_flag'] == 'REVIEW_URGENT' else 1
        return (urgency, int(row['priority']), float(row['match_score']))

    ordered = sorted(results, key=order)

    # Column layout: verdict columns first, then the original match fields.
    columns = [
        'auto_flag',
        'spot_check_issues',
        'priority',
        'match_score',
        'institution_name',
        'wikidata_label',
        'city',
        'institution_type',
        'isil_code',
        'ghcid',
        'wikidata_qid',
        'wikidata_url',
        'validation_status',
        'validation_notes',
        'institution_id',
    ]

    with open(output_path, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        writer.writerows(ordered)
def generate_summary(results: List[Dict]):
    """Print a human-readable summary of the spot-check findings.

    Fix: the original divided by ``total`` unconditionally, raising
    ZeroDivisionError when ``results`` is empty; now it prints a short
    notice and returns instead.
    """
    total = len(results)
    if total == 0:
        # Guard: the percentage calculations below would divide by zero.
        print("\nNo fuzzy matches to summarize.")
        return
    flagged = sum(1 for r in results if r['auto_flag'] == 'REVIEW_URGENT')
    ok = total - flagged
    print("\n" + "=" * 70)
    print("Fast Automated Spot Check Summary")
    print("=" * 70)
    print(f"\n📊 Overall Results")
    print(f" Total fuzzy matches: {total}")
    print(f" Flagged issues: {flagged} ({flagged/total*100:.1f}%)")
    print(f" No issues detected: {ok} ({ok/total*100:.1f}%)")
    # Count flagged rows per priority bucket.
    flagged_by_priority = {}
    for result in results:
        if result['auto_flag'] == 'REVIEW_URGENT':
            priority = result['priority']
            flagged_by_priority[priority] = flagged_by_priority.get(priority, 0) + 1
    print(f"\n🎯 Flagged Matches by Priority")
    for priority in sorted(flagged_by_priority.keys()):
        count = flagged_by_priority[priority]
        print(f" Priority {priority}: {count:3d} flagged")
    # Show the first 10 flagged records for a quick eyeball check.
    print(f"\n🔍 Sample Flagged Records (Top 10)")
    flagged_records = [r for r in results if r['auto_flag'] == 'REVIEW_URGENT']
    for i, record in enumerate(flagged_records[:10], 1):
        print(f"\n {i}. Priority {record['priority']} - Score {record['match_score']}%")
        print(f" Institution: {record['institution_name']}")
        print(f" Wikidata: {record['wikidata_label']}")
        # Issues are stored " | "-joined; show at most the first two.
        issues_text = record['spot_check_issues']
        issues_list = issues_text.split(' | ')
        for issue in issues_list[:2]:  # Show first 2 issues
            print(f" {issue}")
        if len(issues_list) > 2:
            print(f" ... and {len(issues_list)-2} more issue(s)")
    print("\n" + "=" * 70)
    print("Interpretation Guide")
    print("=" * 70)
    print("""
🚨 City mismatch → Very likely INCORRECT (different cities)
⚠️ Type keyword mismatch → Likely INCORRECT (e.g., library vs museum)
🔍 Branch suffix → Probably INCORRECT (branch vs main library)
🔍 Gymnasium library → Likely INCORRECT (school vs public library)
🔍 Low name similarity → Uncertain, needs manual check
⚠️ Low score no ISIL → Uncertain, needs verification
""")
    print("=" * 70)
    print("Recommended Actions")
    print("=" * 70)
    print(f"""
1. Open flagged CSV: data/review/denmark_wikidata_fuzzy_matches_flagged.csv
2. Focus on REVIEW_URGENT rows first ({flagged} matches)
- Sort by auto_flag column
- Start with 🚨 city/type mismatches (likely INCORRECT)
- Then review 🔍 name pattern issues (needs judgment)
3. Fill validation_status for flagged rows:
- City mismatch → INCORRECT
- Type mismatch → INCORRECT
- Branch vs main → INCORRECT (usually)
- Name similarity issues → Needs manual judgment
4. Review OK rows ({ok} matches) - lower priority
- These passed automated checks
- Still review Priority 1-2 for safety
5. After review: python scripts/apply_wikidata_validation.py
""")
    print("=" * 70)
def main():
    """Run the full fast spot-check pipeline end to end."""
    input_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    flagged_csv = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')

    banner = "=" * 70
    print(banner)
    print("Fast Automated Spot Checks for Wikidata Fuzzy Matches")
    print(banner)

    # Load the fuzzy-match CSV produced by the matching pipeline.
    print(f"\nLoading fuzzy matches: {input_csv}")
    matches = load_csv_matches(input_csv)
    print(f" ✅ Loaded {len(matches)} matches")

    # Pattern-based checks only — no network calls to Wikidata.
    results = run_fast_spot_checks(matches)

    # Persist the flagged report next to the input file.
    print(f"\nGenerating flagged report: {flagged_csv}")
    generate_flagged_report(results, flagged_csv)
    size_kb = flagged_csv.stat().st_size / 1024
    print(f" ✅ Saved flagged report ({size_kb:.1f} KB)")

    generate_summary(results)
    print("\n✅ Fast Automated Spot Checks Complete")


if __name__ == '__main__':
    main()