glam/scripts/review_batch11_matches.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

135 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""
Manual Review of Batch 11 Query Results
========================================
Review and filter the automated matches to identify:
1. Valid matches (high confidence)
2. Questionable matches (needs verification)
3. False positives (clearly wrong)
"""
import json
def review_matches(input_path='scripts/batch11_query_results.json',
                   output_path='scripts/batch11_reviewed_matches.json',
                   baseline_matched=55,
                   total_museums=90):
    """Review and categorize Batch 11 matches.

    Loads the automated query results, classifies every match as valid,
    questionable, or a false positive, prints a human-readable report,
    and writes the filtered result set to ``output_path``.

    Args:
        input_path: Path to the automated query-results JSON
            (must contain a top-level ``matches`` list).
        output_path: Path where the reviewed/filtered JSON is written.
        baseline_matched: Museums already confirmed before this batch;
            used only for the coverage estimate in the summary.
        total_museums: Total number of museums in the target list.

    Returns:
        dict: The filtered results structure that was written to
        ``output_path`` (batch number, per-category match lists, summary).
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("=" * 80)
    print("BATCH 11 MATCH REVIEW")
    print("=" * 80)

    valid_matches = []
    questionable_matches = []
    false_positives = []

    # Known false positive: Museo Di (Q112135646) - Virtual LGBT museum
    MUSEO_DI = "Q112135646"

    for match in data['matches']:
        museum_name = match['museum']['name']
        q_number = match['match']['q_number']
        wd_name = match['match']['wikidata_name']
        # location/description may be null in the source JSON; guard below.
        location = match['match']['location']
        description = match['match']['description']
        score = match['match']['match_score']

        print(f"\n{'=' * 80}")
        print(f"Target: {museum_name}")
        print(f"Match: {wd_name} ({q_number})")
        print(f"Location: {location}")
        print(f"Description: {description}")
        print(f"Score: {score:.2f}")

        # Filter obvious false positives first, then descend by confidence.
        if q_number == MUSEO_DI:
            print("❌ FALSE POSITIVE: Virtual LGBT museum (incorrect match)")
            false_positives.append(match)
        elif "Zoológico" in wd_name or "zoo" in (description or "").lower():
            print("❌ FALSE POSITIVE: Zoo, not a museum")
            false_positives.append(match)
        # High confidence matches (score > 0.95 and location matches)
        elif score > 0.95 and location and location.lower() in museum_name.lower():
            print("✅ VALID MATCH: High confidence")
            valid_matches.append(match)
        # Exact name matches (either name contains the other)
        elif museum_name.lower() in wd_name.lower() or wd_name.lower() in museum_name.lower():
            if location and (match['museum']['city'].lower() in location.lower() or
                             match['museum']['region'].lower() in location.lower()):
                print("✅ VALID MATCH: Name and location align")
                valid_matches.append(match)
            else:
                print("⚠️ QUESTIONABLE: Name matches but location unclear")
                questionable_matches.append(match)
        # Location mismatch
        elif location and location != match['museum']['city']:
            print("⚠️ QUESTIONABLE: Location mismatch")
            questionable_matches.append(match)
        else:
            print("⚠️ QUESTIONABLE: Needs manual verification")
            questionable_matches.append(match)

    # Summary
    print("\n" + "=" * 80)
    print("REVIEW SUMMARY")
    print("=" * 80)
    print(f"✅ Valid matches: {len(valid_matches)}")
    print(f"⚠️ Questionable matches: {len(questionable_matches)}")
    print(f"❌ False positives: {len(false_positives)}")

    if valid_matches:
        print("\n" + "-" * 80)
        print("VALID MATCHES (High Confidence):")
        print("-" * 80)
        for match in valid_matches:
            print(f"{match['museum']['name']}")
            print(f"{match['match']['wikidata_name']} ({match['match']['q_number']})")

    if questionable_matches:
        print("\n" + "-" * 80)
        print("QUESTIONABLE MATCHES (Need Verification):")
        print("-" * 80)
        for match in questionable_matches:
            print(f"{match['museum']['name']}")
            print(f"{match['match']['wikidata_name']} ({match['match']['q_number']})")
            print(f" Location: {match['match']['location']} vs {match['museum']['city']}")

    if false_positives:
        print("\n" + "-" * 80)
        print("FALSE POSITIVES (Rejected):")
        print("-" * 80)
        for match in false_positives:
            print(f"{match['museum']['name']}")
            print(f"{match['match']['wikidata_name']} ({match['match']['q_number']})")

    # Conservative coverage estimate: only confirmed valid matches count.
    covered = baseline_matched + len(valid_matches)
    coverage_pct = covered / total_museums * 100

    # Save filtered results
    filtered = {
        'batch': 11,
        'review_date': '2025-11-09',
        'valid_matches': valid_matches,
        'questionable_matches': questionable_matches,
        'false_positives': false_positives,
        'summary': {
            'valid': len(valid_matches),
            'questionable': len(questionable_matches),
            'false_positives': len(false_positives),
            'expected_coverage': f"{covered}/{total_museums}",
            'expected_coverage_percent': f"{coverage_pct:.1f}%"
        }
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(filtered, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Filtered results saved to: scripts/batch11_reviewed_matches.json")
    print(f"\n📊 Conservative coverage estimate: {covered}/{total_museums} = {coverage_pct:.1f}%")

    return filtered
# Entry point: run the full review when executed as a script.
if __name__ == "__main__":
    review_matches()