glam/scripts/spot_check_fuzzy_matches.py
2025-11-21 22:12:33 +01:00

384 lines
13 KiB
Python

"""
Automated Spot Checks for Wikidata Fuzzy Matches
Programmatically detects obvious errors in fuzzy matches to prioritize manual review:
1. City name mismatches (different cities = likely wrong match)
2. Institution type mismatches (detected from Wikidata)
3. ISIL code conflicts (if Wikidata has different ISIL)
4. Name pattern issues (branch suffixes, gymnasium libraries)
5. Very low scores (<87%) with no ISIL confirmation
Generates prioritized review list with auto-detected issues.
"""
import json
import csv
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from rapidfuzz import fuzz
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
import time
def load_csv_matches(csv_path: Path) -> List[Dict]:
    """Read the fuzzy-match CSV at *csv_path* and return its rows as dicts.

    Args:
        csv_path: Path to a UTF-8 CSV file with a header row.

    Returns:
        One dict per data row, keyed by the header columns.
    """
    with open(csv_path, 'r', encoding='utf-8') as handle:
        # Materialize inside the with-block; DictReader is lazy.
        return list(csv.DictReader(handle))
def query_wikidata_entity(qid: str) -> Optional[Dict]:
    """
    Query Wikidata for entity details.

    Args:
        qid: Wikidata Q-number (e.g. 'Q42') interpolated into the SPARQL query.

    Returns:
        Dict with keys 'types' (distinct P31 type labels), 'isil' (P791 value
        or None) and 'city' (P131 label or None); None when the entity has no
        P31 binding or the request fails.
    """
    # One result row per P31 value; ISIL and city are optional properties.
    # Labels are resolved by the wikibase label service, Danish before English.
    query = f"""
    SELECT ?type ?typeLabel ?isil ?city ?cityLabel WHERE {{
        wd:{qid} wdt:P31 ?type .
        OPTIONAL {{ wd:{qid} wdt:P791 ?isil }}
        OPTIONAL {{ wd:{qid} wdt:P131 ?city }}
        SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "da,en"
        }}
    }}
    LIMIT 5
    """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    endpoint.setQuery(query)
    endpoint.setReturnFormat(SPARQL_JSON)
    # Wikimedia endpoints expect a descriptive User-Agent; default UAs may be throttled.
    endpoint.addCustomHttpHeader('User-Agent', 'GLAM-Spot-Check/1.0')
    try:
        results = endpoint.query().convert()
        bindings = results['results']['bindings']
        if not bindings:
            return None
        # Aggregate results: collect every distinct type label across rows,
        # but take ISIL/city from the first row only (assumed single-valued).
        types = [b.get('typeLabel', {}).get('value') for b in bindings]
        isil = bindings[0].get('isil', {}).get('value') if bindings else None
        city = bindings[0].get('cityLabel', {}).get('value') if bindings else None
        return {
            'types': list(set(filter(None, types))),
            'isil': isil,
            'city': city
        }
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller treat
        # this entity as having no Wikidata data.
        print(f" ⚠️ Error querying {qid}: {e}")
        return None
def check_city_mismatch(our_city: str, wd_city: Optional[str]) -> Tuple[bool, str]:
    """Flag a match whose city disagrees with Wikidata's city label.

    Tolerates case/whitespace differences, substring containment and minor
    spelling variation; only clearly different cities are flagged.

    Returns:
        (True, message) when the cities look different, else (False, "").
    """
    # Without both values there is nothing to compare.
    if not (our_city and wd_city):
        return False, ""
    ours = our_city.lower().strip()
    theirs = wd_city.lower().strip()
    # Accept exact match, containment either way, or near-identical spelling.
    compatible = (
        ours == theirs
        or ours in theirs
        or theirs in ours
        or fuzz.ratio(ours, theirs) > 85
    )
    if compatible:
        return False, ""
    return True, f"City mismatch: '{our_city}' vs Wikidata '{wd_city}'"
def check_isil_conflict(our_isil: str, wd_isil: Optional[str]) -> Tuple[bool, str]:
    """Report a conflict when both sides carry an ISIL code and they differ.

    Returns:
        (True, message) on differing codes, else (False, "").
    """
    # Only comparable when both codes are present.
    if our_isil and wd_isil:
        if our_isil.strip() != wd_isil.strip():
            return True, f"ISIL conflict: our '{our_isil}' vs Wikidata '{wd_isil}'"
    return False, ""
def check_type_mismatch(our_type: str, wd_types: List[str]) -> Tuple[bool, str]:
    """Compare our coarse institution type against Wikidata's type labels.

    Returns:
        (True, message) when none of the Wikidata labels contains an expected
        keyword for *our_type*; (False, "") when any label matches or when
        Wikidata returned no types at all.
    """
    if not wd_types:
        return False, ""
    # Map our types to Wikidata type labels
    type_mappings = {
        'LIBRARY': ['library', 'public library', 'academic library',
                    'university library', 'national library'],
        'ARCHIVE': ['archive', 'archives', 'archival institution',
                    'state archive', 'national archives'],
        'MUSEUM': ['museum', 'art museum', 'history museum']
    }
    expected = type_mappings.get(our_type, [])
    # Substring test: any Wikidata label containing an expected keyword counts.
    compatible = any(
        keyword in label.lower()
        for label in wd_types
        for keyword in expected
    )
    if compatible:
        return False, ""
    return True, f"Type mismatch: our {our_type} vs Wikidata {', '.join(wd_types[:3])}"
def check_name_patterns(inst_name: str, wd_label: str) -> Tuple[bool, str]:
    """Detect name patterns suggesting the match pairs different institutions.

    Returns:
        (True, "; "-joined findings) when any pattern fires, else (False, "").
    """
    findings = []
    # Pattern 1: our record looks like a branch ("..., Biblioteket") while
    # the Wikidata label does not.
    if ', Biblioteket' in inst_name and ', Biblioteket' not in wd_label:
        findings.append("Our name has ', Biblioteket' suffix (branch?), Wikidata doesn't")
    # Pattern 2: our record names a gymnasium (school) library; Wikidata doesn't.
    if 'Gymnasium' in inst_name and 'Gymnasium' not in wd_label:
        findings.append("Our name has 'Gymnasium' (school library?), Wikidata doesn't")
    # Pattern 3: strip the known suffixes and fuzzily compare the base names;
    # a very low score hints at entirely different institutions.
    our_base = re.sub(r',\s*Biblioteket$', '', inst_name)
    wd_base = re.sub(r'\s+Bibliotek$', '', wd_label)
    similarity = fuzz.ratio(our_base.lower(), wd_base.lower())
    if similarity < 60:
        findings.append(f"Low name similarity ({similarity}%) - possibly different institutions")
    if findings:
        return True, "; ".join(findings)
    return False, ""
def check_low_score_no_isil(match_score: float, our_isil: str) -> Tuple[bool, str]:
    """Flag sub-87% matches that lack an ISIL code for independent verification.

    Returns:
        (True, message) when the score is below 87 and no ISIL exists,
        else (False, "").
    """
    needs_review = match_score < 87 and not our_isil
    if needs_review:
        return True, f"Low score ({match_score}%) with no ISIL to verify"
    return False, ""
def run_spot_checks(matches: List[Dict]) -> List[Dict]:
    """
    Run automated spot checks on all fuzzy matches.

    Queries Wikidata once per match (rate-limited), applies the five heuristic
    checks, and annotates each row with 'spot_check_issues' and 'auto_flag'
    ('REVIEW_URGENT' when any check fired, otherwise 'OK').

    Returns list of matches with spot_check_issues field added.
    """
    print(f"\nRunning automated spot checks on {len(matches)} fuzzy matches...")
    print("This will query Wikidata for each Q-number (may take ~5 minutes)\n")
    total = len(matches)
    annotated_rows = []
    flagged_count = 0
    for idx, row in enumerate(matches, 1):
        if idx % 20 == 0:
            print(f" Progress: {idx}/{total} ({idx/total*100:.1f}%)")
        score = float(row['match_score'])
        problems = []
        # Check 1: Low score without ISIL
        hit, msg = check_low_score_no_isil(score, row['isil_code'])
        if hit:
            problems.append(f"⚠️ {msg}")
        # Check 2: Name patterns
        hit, msg = check_name_patterns(row['institution_name'], row['wikidata_label'])
        if hit:
            problems.append(f"🔍 {msg}")
        # Remote checks need entity details from Wikidata.
        wd_data = query_wikidata_entity(row['wikidata_qid'])
        time.sleep(0.5)  # Rate limiting (2 req/sec)
        if wd_data:
            # Check 3: City mismatch
            hit, msg = check_city_mismatch(row['city'], wd_data.get('city'))
            if hit:
                problems.append(f"🚨 {msg}")
            # Check 4: ISIL conflict
            hit, msg = check_isil_conflict(row['isil_code'], wd_data.get('isil'))
            if hit:
                problems.append(f"🚨 {msg}")
            # Check 5: Type mismatch
            hit, msg = check_type_mismatch(row['institution_type'], wd_data.get('types', []))
            if hit:
                problems.append(f"⚠️ {msg}")
        # Annotate a copy so the caller's input rows stay untouched.
        annotated = dict(row)
        if problems:
            annotated['spot_check_issues'] = " | ".join(problems)
            annotated['auto_flag'] = 'REVIEW_URGENT'
            flagged_count += 1
        else:
            annotated['spot_check_issues'] = ''
            annotated['auto_flag'] = 'OK'
        annotated_rows.append(annotated)
    print(f"\n ✅ Spot checks complete: {flagged_count}/{total} matches flagged")
    return annotated_rows
def generate_flagged_report(results: List[Dict], output_path: Path):
    """Write spot-check results to a CSV, worst offenders first.

    Args:
        results: Annotated match rows from run_spot_checks().
        output_path: Destination CSV path (overwritten).
    """
    # Sort by: auto_flag (REVIEW_URGENT first), then priority, then score
    ordered = sorted(
        results,
        key=lambda row: (
            0 if row['auto_flag'] == 'REVIEW_URGENT' else 1,
            int(row['priority']),
            float(row['match_score']),
        ),
    )
    # Spot-check columns lead so reviewers see the flags immediately.
    fieldnames = [
        'auto_flag',
        'spot_check_issues',
        'priority',
        'match_score',
        'institution_name',
        'wikidata_label',
        'city',
        'institution_type',
        'isil_code',
        'ghcid',
        'wikidata_qid',
        'wikidata_url',
        'validation_status',
        'validation_notes',
        'institution_id'
    ]
    with open(output_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(ordered)
def generate_summary(results: List[Dict]):
    """Print a console summary of the spot-check findings.

    Shows overall flag counts, a per-issue-type breakdown (counted by
    substring markers in 'spot_check_issues'), the top five flagged records,
    and the recommended next steps.
    """
    total = len(results)
    flagged = len([r for r in results if r['auto_flag'] == 'REVIEW_URGENT'])
    ok = total - flagged
    print("\n" + "=" * 70)
    print("Automated Spot Check Summary")
    print("=" * 70)
    print(f"\n📊 Overall Results")
    print(f" Total fuzzy matches: {total}")
    print(f" Flagged issues: {flagged} ({flagged/total*100:.1f}%)")
    print(f" No issues detected: {ok} ({ok/total*100:.1f}%)")
    # Tally issue categories by the marker substrings each check emits.
    tallies = {
        'City mismatch': 0,
        'ISIL conflict': 0,
        'Type mismatch': 0,
        'Low score no ISIL': 0,
        'Name pattern issue': 0
    }
    for row in results:
        text = row.get('spot_check_issues', '')
        if 'City mismatch' in text:
            tallies['City mismatch'] += 1
        if 'ISIL conflict' in text:
            tallies['ISIL conflict'] += 1
        if 'Type mismatch' in text:
            tallies['Type mismatch'] += 1
        if 'Low score' in text and 'no ISIL' in text:
            tallies['Low score no ISIL'] += 1
        if any(marker in text for marker in ('Biblioteket', 'Gymnasium', 'similarity')):
            tallies['Name pattern issue'] += 1
    print(f"\n🚨 Issue Breakdown")
    for label, count in sorted(tallies.items(), key=lambda item: -item[1]):
        if count > 0:
            print(f" {label:<25}: {count:3d} matches")
    # Show the first few urgent rows so reviewers can sanity-check quickly.
    print(f"\n🔍 Sample Flagged Records (Top 5)")
    urgent = [r for r in results if r['auto_flag'] == 'REVIEW_URGENT']
    for rank, record in enumerate(urgent[:5], 1):
        print(f"\n {rank}. Priority {record['priority']} - Score {record['match_score']}%")
        print(f" Institution: {record['institution_name']}")
        print(f" Wikidata: {record['wikidata_label']}")
        print(f" Issues: {record['spot_check_issues']}")
    print("\n" + "=" * 70)
    print("Next Steps")
    print("=" * 70)
    print(f"""
1. Review flagged CSV: data/review/denmark_wikidata_fuzzy_matches_flagged.csv
2. Focus on REVIEW_URGENT rows first ({flagged} matches)
3. Fill validation_status for flagged rows:
- City/ISIL conflicts → Likely INCORRECT
- Type mismatches → Likely INCORRECT
- Name pattern issues → Needs manual judgment
4. Then review remaining OK rows ({ok} matches)
5. Run: python scripts/apply_wikidata_validation.py
""")
    print("=" * 70)
def main():
    """Entry point: load fuzzy matches, spot-check them, and write the report."""
    print("=" * 70)
    print("Automated Spot Checks for Wikidata Fuzzy Matches")
    print("=" * 70)
    # Input CSV produced by the earlier fuzzy-matching step.
    input_path = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    print(f"\nLoading fuzzy matches: {input_path}")
    rows = load_csv_matches(input_path)
    print(f" ✅ Loaded {len(rows)} matches")
    # Runs the heuristic checks; queries Wikidata once per match.
    checked = run_spot_checks(rows)
    # Write the prioritized, flag-annotated review CSV.
    report_path = Path('data/review/denmark_wikidata_fuzzy_matches_flagged.csv')
    print(f"\nGenerating flagged report: {report_path}")
    generate_flagged_report(checked, report_path)
    print(f" ✅ Saved flagged report")
    # Console summary for the operator.
    generate_summary(checked)
    print("\n✅ Automated Spot Checks Complete")


if __name__ == '__main__':
    main()