glam/archive/scripts/brazil/diagnose_brazil_matching.py
2025-11-19 23:25:22 +01:00

225 lines
7.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Diagnose why Brazil fuzzy matching found 0 matches.
This script shows:
1. Sample Brazilian institution names from our dataset
2. Sample Brazilian institution names from Wikidata
3. Best similarity scores (even below 0.85 threshold)
"""
import re
import sys
import unicodedata
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any

import yaml

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Steps, in order:

    1. Compose the text to Unicode NFC, lowercase it and trim surrounding
       whitespace.
    2. Drop one leading institution-type word (e.g. "museu", "fundação")
       when it is followed by whitespace.
    3. Drop one trailing type/jurisdiction word (e.g. "nacional",
       "municipal") when it is preceded by whitespace.
    4. Replace punctuation with spaces and collapse runs of whitespace.

    Args:
        name: Raw institution name.

    Returns:
        The normalized name. May be empty (e.g. for punctuation-only input).
    """
    # Compose accents first: in decomposed (NFD) text the combining marks are
    # not matched by ``\w``, so "fundação" would be torn into "fundac a o" by
    # the punctuation step and would never match the prefix pattern below.
    name = unicodedata.normalize("NFC", name).lower()
    # Trim before the anchored substitutions; otherwise stray leading/trailing
    # whitespace keeps ``^`` / ``$`` from lining up with the affix words.
    name = name.strip()
    # Remove common institution-type prefixes/suffixes (Portuguese word forms)
    name = re.sub(r'^(fundação|museu|biblioteca|arquivo|centro|memorial)\s+', '', name)
    name = re.sub(r'\s+(museu|biblioteca|arquivo|nacional|estadual|municipal)$', '', name)
    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)
    # Normalize whitespace
    return ' '.join(name.split())
def similarity_score(name1: str, name2: str) -> float:
    """Calculate the similarity between two institution names.

    Both names are passed through ``normalize_name`` and the results are
    compared with ``difflib.SequenceMatcher``.

    Args:
        name1: First raw name.
        name2: Second raw name.

    Returns:
        A ratio in the range 0-1. Returns 0.0 when either name normalizes
        to an empty string.
    """
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    # SequenceMatcher.ratio() yields 1.0 for two empty sequences, which would
    # report two blank / punctuation-only names as a perfect match.
    if not norm1 or not norm2:
        return 0.0
    return SequenceMatcher(None, norm1, norm2).ratio()
def query_brazilian_institutions(sparql: "SPARQLWrapper") -> list[dict[str, Any]]:
    """Query Wikidata for Brazilian heritage institutions.

    Sends a SPARQL query selecting items that are a direct instance (P31) of
    one of the three classes listed in the ``VALUES`` clause and whose
    country (P17) is Q155, with labels requested in "pt,en".

    Args:
        sparql: A configured endpoint wrapper; only ``setQuery()`` and
            ``query().convert()`` are used.

    Returns:
        One dict per distinct QID with the keys ``qid``, ``name``,
        ``description``, ``type`` and ``city`` (missing values become "").
        Returns an empty list if the query or the parsing raises; the error
        and a traceback are printed in that case.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?cityLabel
    WHERE {
      VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 }
      ?item wdt:P31 ?type . # instance of museum/library/archive
      ?item wdt:P17 wd:Q155 . # country: Brazil
      OPTIONAL { ?item wdt:P131 ?city . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "pt,en" . }
    }
    LIMIT 2000
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        if isinstance(raw_results, dict):
            bindings = raw_results.get("results", {}).get("bindings", [])
        else:
            bindings = []

        # DISTINCT applies to the whole row, so an item with several types or
        # several P131 values comes back once per combination. Keep only the
        # first row for each QID.
        seen_qids: set[str] = set()
        results: list[dict[str, Any]] = []
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.rsplit("/", 1)[-1]
            if not qid.startswith("Q") or qid in seen_qids:
                continue
            seen_qids.add(qid)
            results.append({
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("typeLabel", {}).get("value", ""),
                "city": binding.get("cityLabel", {}).get("value", ""),
            })
        return results
    except Exception as e:
        # Best-effort diagnostic: report the failure and let the caller carry
        # on with an empty result set instead of aborting the script.
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return []
# Wikidata identifiers whose numeric part is below this bound exclude an
# institution from the diagnostic.
# NOTE(review): presumably larger numbers are placeholder/synthetic QIDs -
# confirm against the enrichment pipeline.
_MAX_REAL_QID = 100000000


def _first_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the institution's first location record, or {} if it has none."""
    # ``or`` also covers ``locations: []`` and ``locations: null``, which a
    # plain ``.get('locations', [{}])[0]`` would crash on.
    locations = inst.get('locations') or [{}]
    first = locations[0]
    return first if isinstance(first, dict) else {}


def _has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """Return True if the record carries a Wikidata QID below _MAX_REAL_QID."""
    for id_obj in inst.get("identifiers") or []:
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = str(id_obj.get("identifier_value") or "")
        if not value.startswith("Q"):
            continue
        try:
            number = int(value[1:])
        except ValueError:
            # Malformed identifier such as "Q" or "Q12x": not a usable QID.
            continue
        if number < _MAX_REAL_QID:
            return True
    return False


def _best_wikidata_match(inst_name: str, wd_insts: list[dict[str, Any]]) -> tuple[Any, float]:
    """Return (best Wikidata record or None, its score) for one local name.

    The first record with the strictly highest score wins; None is returned
    when no record scores above 0.
    """
    best_score = 0.0
    best_wd = None
    for wd_inst in wd_insts:
        wd_name = wd_inst.get('name', '')
        if not wd_name:
            continue
        score = similarity_score(inst_name, wd_name)
        if score > best_score:
            best_score = score
            best_wd = wd_inst
    return best_wd, best_score


def _threshold_verdict(score: float) -> str:
    """Return the report line saying which threshold a score would pass."""
    if score >= 0.85:
        return " ✅ Would match at 0.85 threshold"
    if score >= 0.80:
        return " ⚠️ Would match at 0.80 threshold"
    if score >= 0.75:
        return " ⚠️ Would match at 0.75 threshold"
    return " ❌ Below reasonable threshold"


def main():
    """Run the Brazil fuzzy-matching diagnostic and print a report.

    The report contains, in order: sample local Brazilian institution names
    lacking a Wikidata QID, sample Brazilian institution names returned by
    Wikidata, the best Wikidata candidate for each of the first 30 local
    institutions (top 20 shown, any score), and the number of those best
    matches that clear each of several thresholds.

    The dataset is read from ``data/instances/global/`` under the parent of
    this script's directory.
    """
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    print("="*80)
    print("🇧🇷 BRAZIL FUZZY MATCHING DIAGNOSTIC")
    print("="*80)
    print("\n📖 Loading dataset...\n")
    with open(input_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        institutions = yaml.safe_load(f) or []

    # Filter Brazilian institutions without Wikidata
    brazilian_insts = [
        inst for inst in institutions
        if _first_location(inst).get('country') == 'BR'
        and not _has_real_wikidata_id(inst)
    ]
    print(f"✅ Found {len(brazilian_insts)} Brazilian institutions without Wikidata\n")

    # Show sample names
    print("="*80)
    print("📋 SAMPLE LOCAL BRAZILIAN INSTITUTION NAMES (First 20)")
    print("="*80)
    for i, inst in enumerate(brazilian_insts[:20], 1):
        name = inst.get('name', '')
        city = _first_location(inst).get('city', 'Unknown')
        normalized = normalize_name(name)
        print(f"{i:2}. {name}")
        print(f" City: {city}")
        print(f" Normalized: '{normalized}'\n")

    # Query Wikidata
    print("="*80)
    print("🔍 QUERYING WIKIDATA FOR BRAZILIAN INSTITUTIONS")
    print("="*80 + "\n")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")
    wd_insts = query_brazilian_institutions(sparql)
    print(f"✅ Found {len(wd_insts)} Brazilian institutions in Wikidata\n")

    # Show sample Wikidata names
    print("="*80)
    print("📋 SAMPLE WIKIDATA BRAZILIAN INSTITUTION NAMES (First 20)")
    print("="*80)
    for i, inst in enumerate(wd_insts[:20], 1):
        name = inst.get('name', '')
        city = inst.get('city', 'Unknown')
        inst_type = inst.get('type', 'Unknown')
        normalized = normalize_name(name)
        print(f"{i:2}. {name} ({inst['qid']})")
        print(f" City: {city}, Type: {inst_type}")
        print(f" Normalized: '{normalized}'\n")

    # Find best matches for each local institution (regardless of threshold)
    print("="*80)
    print("📊 BEST MATCHES (Top 20, any score)")
    print("="*80 + "\n")
    best_matches = []
    for inst in brazilian_insts[:30]:  # Check first 30 local institutions
        inst_name = inst.get('name', '')
        if not inst_name:
            continue
        best_wd, best_score = _best_wikidata_match(inst_name, wd_insts)
        if best_wd:
            best_matches.append((inst, best_wd, best_score))

    # Sort by score descending
    best_matches.sort(key=lambda x: x[2], reverse=True)
    for i, (local, wd, score) in enumerate(best_matches[:20], 1):
        local_city = _first_location(local).get('city', 'Unknown')
        wd_city = wd.get('city', 'Unknown')
        print(f"{i:2}. Score: {score:.3f}")
        print(f" Local: {local.get('name')} ({local_city})")
        print(f" Wikidata: {wd.get('name')} ({wd['qid']}, {wd_city})")
        # Show if would match at different thresholds
        print(_threshold_verdict(score))
        print()

    # Threshold analysis
    print("="*80)
    print("📊 THRESHOLD ANALYSIS")
    print("="*80 + "\n")
    thresholds = [0.95, 0.90, 0.85, 0.80, 0.75, 0.70]
    for threshold in thresholds:
        matches_at_threshold = sum(1 for m in best_matches if m[2] >= threshold)
        print(f"Threshold {threshold:.2f}: {matches_at_threshold:3} matches")
    print("\n" + "="*80 + "\n")
# Run the diagnostic only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()