# glam/archive/scripts/brazil/diagnose_brazil_matching.py

#!/usr/bin/env python3
"""
Diagnose why Brazil fuzzy matching found 0 matches.

This script shows:
1. Sample Brazilian institution names from our dataset
2. Sample Brazilian institution names from Wikidata
3. Best similarity scores (even below 0.85 threshold)
"""

import sys
from pathlib import Path
from typing import Any
import yaml
from difflib import SequenceMatcher
import re

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    The name is lowercased, punctuation is replaced by spaces and whitespace
    is collapsed *before* the generic institution words are stripped. Doing
    the cleanup first means stray punctuation or padding (for example
    ``"Arquivo Público Estadual."`` or ``"  Museu de Arte"``) cannot prevent
    the anchored prefix/suffix patterns from matching.

    Args:
        name: Raw institution name.

    Returns:
        The normalized name: lowercase, punctuation-free, single-spaced, with
        at most one leading and one trailing generic word removed.
    """
    name = name.lower()

    # Replace punctuation with spaces (``\w`` is Unicode-aware for str
    # patterns, so accented letters are kept).
    name = re.sub(r'[^\w\s]', ' ', name)

    # Collapse whitespace runs and trim both ends so the ``^`` / ``$``
    # anchors below see the first and last real word.
    name = ' '.join(name.split())

    # Remove one common leading / trailing institution word (Portuguese).
    name = re.sub(r'^(fundação|museu|biblioteca|arquivo|centro|memorial)\s+', '', name)
    name = re.sub(r'\s+(museu|biblioteca|arquivo|nacional|estadual|municipal)$', '', name)

    return name


def similarity_score(name1: str, name2: str) -> float:
    """Return the similarity ratio of two institution names.

    Both names are passed through ``normalize_name`` and the results are
    compared with ``difflib.SequenceMatcher``.

    Args:
        name1: First raw name.
        name2: Second raw name.

    Returns:
        ``SequenceMatcher.ratio()`` of the normalized names, a float in
        the range 0 to 1.
    """
    left, right = (normalize_name(raw) for raw in (name1, name2))
    matcher = SequenceMatcher(a=left, b=right)
    return matcher.ratio()


def query_brazilian_institutions(sparql: SPARQLWrapper) -> list[dict[str, Any]]:
    """Fetch Brazilian heritage institutions from Wikidata.

    Args:
        sparql: Configured SPARQL client; the query is set on it and executed.

    Returns:
        One dict per result binding whose item URI ends in a ``Q…`` id, with
        the keys ``qid``, ``name``, ``description``, ``type`` and ``city``
        (missing values become empty strings). An empty list is returned if
        anything raises while querying or processing; the error and its
        traceback are printed.
    """

    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?cityLabel
    WHERE {
      VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 }

      ?item wdt:P31 ?type .           # instance of museum/library/archive
      ?item wdt:P17 wd:Q155 .         # country: Brazil

      OPTIONAL { ?item wdt:P131 ?city . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "pt,en" . }
    }
    LIMIT 2000
    """

    sparql.setQuery(query)

    def _value(row, key):
        # Each SPARQL JSON binding maps a variable name to {"value": ...}.
        return row.get(key, {}).get("value", "")

    try:
        payload = sparql.query().convert()

        # Anything other than a dict payload is treated as "no rows".
        rows = []
        if isinstance(payload, dict):
            rows = payload.get("results", {}).get("bindings", [])

        institutions = []
        for row in rows:
            uri = _value(row, "item")
            # The Q-id is the last path segment of the entity URI.
            qid = uri.split("/")[-1] if uri else None

            if qid and qid.startswith("Q"):
                institutions.append({
                    "qid": qid,
                    "name": _value(row, "itemLabel"),
                    "description": _value(row, "itemDescription"),
                    "type": _value(row, "typeLabel"),
                    "city": _value(row, "cityLabel"),
                })

        return institutions

    except Exception as e:
        # Best-effort diagnostic: report the failure and carry on with no data.
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return []


def _primary_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the first location entry of *inst*, or ``{}`` if there is none."""
    locations = inst.get('locations')
    # Guards against a missing key, ``locations: null`` and ``locations: []``.
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        return locations[0]
    return {}


def _has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """Return True if *inst* has a Wikidata ``Q<number>`` id below 100,000,000."""
    for id_obj in inst.get("identifiers") or []:
        if not isinstance(id_obj, dict) or id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value")
        if not isinstance(value, str) or not value.startswith("Q"):
            continue
        try:
            number = int(value[1:])
        except ValueError:
            # Malformed id such as "Q" or "Q12a": not a usable Wikidata id.
            continue
        # NOTE(review): ids >= 100,000,000 are presumably placeholders; confirm.
        if number < 100000000:
            return True
    return False


def main() -> None:
    """Print a diagnostic report for the Brazil fuzzy-matching step.

    Steps:
      1. Load the enriched YAML dataset and keep Brazilian institutions
         (first location has country ``BR``) lacking a Wikidata id below
         100,000,000.
      2. Print the first 20 local names with their normalized form.
      3. Query Wikidata and print the first 20 results the same way.
      4. For the first 30 local institutions, find the best-scoring Wikidata
         name and print the top 20 pairs.
      5. Print how many of those best matches clear each threshold.

    Raises:
        OSError: If the input YAML file cannot be opened.
    """
    # NOTE(review): base_dir is two levels above this file; confirm that the
    # data/ tree lives there for the script's current location.
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"

    print("="*80)
    print("🇧🇷 BRAZIL FUZZY MATCHING DIAGNOSTIC")
    print("="*80)
    print("\n📖 Loading dataset...\n")

    with open(input_file, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no institutions".
        institutions = yaml.safe_load(f) or []

    # Filter Brazilian institutions without Wikidata
    brazilian_insts = [
        inst for inst in institutions
        if isinstance(inst, dict)
        and _primary_location(inst).get('country') == 'BR'
        and not _has_real_wikidata_id(inst)
    ]

    print(f"✅ Found {len(brazilian_insts)} Brazilian institutions without Wikidata\n")

    # Show sample names
    print("="*80)
    print("📋 SAMPLE LOCAL BRAZILIAN INSTITUTION NAMES (First 20)")
    print("="*80)
    for i, inst in enumerate(brazilian_insts[:20], 1):
        # ``name: null`` in the YAML would otherwise crash normalize_name.
        name = inst.get('name') or ''
        city = _primary_location(inst).get('city', 'Unknown')
        normalized = normalize_name(name)
        print(f"{i:2}. {name}")
        print(f"    City: {city}")
        print(f"    Normalized: '{normalized}'\n")

    # Query Wikidata
    print("="*80)
    print("🔍 QUERYING WIKIDATA FOR BRAZILIAN INSTITUTIONS")
    print("="*80 + "\n")

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    wd_insts = query_brazilian_institutions(sparql)

    print(f"✅ Found {len(wd_insts)} Brazilian institutions in Wikidata\n")

    # Show sample Wikidata names
    print("="*80)
    print("📋 SAMPLE WIKIDATA BRAZILIAN INSTITUTION NAMES (First 20)")
    print("="*80)
    for i, inst in enumerate(wd_insts[:20], 1):
        name = inst.get('name', '')
        city = inst.get('city', 'Unknown')
        inst_type = inst.get('type', 'Unknown')
        normalized = normalize_name(name)
        print(f"{i:2}. {name} ({inst['qid']})")
        print(f"    City: {city}, Type: {inst_type}")
        print(f"    Normalized: '{normalized}'\n")

    # Find best matches for each local institution (regardless of threshold)
    print("="*80)
    print("📊 BEST MATCHES (Top 20, any score)")
    print("="*80 + "\n")

    best_matches = []

    for inst in brazilian_insts[:30]:  # Check first 30 local institutions
        inst_name = inst.get('name', '')
        if not inst_name:
            continue

        best_score = 0.0
        best_wd = None

        for wd_inst in wd_insts:
            wd_name = wd_inst.get('name', '')
            if not wd_name:
                continue

            # Strict ">" keeps the first candidate on ties and never records
            # a zero-score "match".
            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score = score
                best_wd = wd_inst

        if best_wd:
            best_matches.append((inst, best_wd, best_score))

    # Sort by score descending
    best_matches.sort(key=lambda x: x[2], reverse=True)

    for i, (local, wd, score) in enumerate(best_matches[:20], 1):
        local_city = _primary_location(local).get('city', 'Unknown')
        wd_city = wd.get('city', 'Unknown')

        print(f"{i:2}. Score: {score:.3f}")
        print(f"    Local:    {local.get('name')} ({local_city})")
        print(f"    Wikidata: {wd.get('name')} ({wd['qid']}, {wd_city})")

        # Show if would match at different thresholds
        if score >= 0.85:
            print("    ✅ Would match at 0.85 threshold")
        elif score >= 0.80:
            print("    ⚠️  Would match at 0.80 threshold")
        elif score >= 0.75:
            print("    ⚠️  Would match at 0.75 threshold")
        else:
            print("    ❌ Below reasonable threshold")
        print()

    # Threshold analysis (over the best matches of the sampled institutions)
    print("="*80)
    print("📊 THRESHOLD ANALYSIS")
    print("="*80 + "\n")

    thresholds = [0.95, 0.90, 0.85, 0.80, 0.75, 0.70]
    for threshold in thresholds:
        matches_at_threshold = sum(1 for m in best_matches if m[2] >= threshold)
        print(f"Threshold {threshold:.2f}: {matches_at_threshold:3} matches")

    print("\n" + "="*80 + "\n")


if __name__ == "__main__":
    main()