225 lines
7.4 KiB
Python
Executable file
225 lines
7.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Diagnose why Brazil fuzzy matching found 0 matches.
|
|
|
|
This script shows:
|
|
1. Sample Brazilian institution names from our dataset
|
|
2. Sample Brazilian institution names from Wikidata
|
|
3. Best similarity scores (even below 0.85 threshold)
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
import yaml
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases, strips common Portuguese/Spanish institutional prefixes and
    suffixes, folds accented characters to plain ASCII (so "São" and "Sao"
    compare equal across datasets), replaces punctuation with spaces, and
    collapses whitespace.

    Args:
        name: Raw institution name.

    Returns:
        The normalized name; may be empty if everything was stripped.
    """
    import unicodedata  # local import: avoids touching the file's import block

    name = name.lower()

    # Remove common prefixes/suffixes (Portuguese + Spanish).  These run
    # BEFORE accent folding so the accented pattern ("fundação") still matches.
    name = re.sub(r'^(fundação|museu|biblioteca|arquivo|centro|memorial)\s+', '', name)
    name = re.sub(r'\s+(museu|biblioteca|arquivo|nacional|estadual|municipal)$', '', name)

    # Fold accents: NFKD-decompose, then drop combining marks.  Differing
    # accent usage between the local dataset and Wikidata labels is a likely
    # cause of the near-zero match rate this script diagnoses.
    name = ''.join(
        ch for ch in unicodedata.normalize('NFKD', name)
        if not unicodedata.combining(ch)
    )

    # Remove punctuation (replaced by spaces so hyphenated words still split)
    name = re.sub(r'[^\w\s]', ' ', name)

    # Normalize whitespace
    name = ' '.join(name.split())

    return name
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Return the similarity ratio in [0, 1] between two institution names.

    Both names are normalized first, then compared with difflib's
    SequenceMatcher ratio.
    """
    return SequenceMatcher(
        None,
        normalize_name(name1),
        normalize_name(name2),
    ).ratio()
|
|
|
|
|
|
def query_brazilian_institutions(sparql: SPARQLWrapper) -> list[dict[str, Any]]:
    """Query Wikidata for Brazilian heritage institutions.

    Fetches up to 2000 museums, libraries, and archives located in Brazil,
    with Portuguese/English labels.  Returns a list of dicts with keys
    qid/name/description/type/city; returns [] on any query failure
    (the error is printed with a traceback so the diagnostic can continue).
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?cityLabel
    WHERE {
      VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 }

      ?item wdt:P31 ?type .   # instance of museum/library/archive
      ?item wdt:P17 wd:Q155 . # country: Brazil

      OPTIONAL { ?item wdt:P131 ?city . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "pt,en" . }
    }
    LIMIT 2000
    """

    sparql.setQuery(query)

    def value_of(row: dict[str, Any], key: str) -> str:
        # SPARQL JSON bindings nest the payload under "value".
        return row.get(key, {}).get("value", "")

    try:
        response = sparql.query().convert()
        rows = response.get("results", {}).get("bindings", []) if isinstance(response, dict) else []

        institutions: list[dict[str, Any]] = []
        for row in rows:
            uri = value_of(row, "item")
            qid = uri.rsplit("/", 1)[-1] if uri else None

            # Skip rows whose entity URI doesn't yield a Q-identifier.
            if not (qid and qid.startswith("Q")):
                continue

            institutions.append({
                "qid": qid,
                "name": value_of(row, "itemLabel"),
                "description": value_of(row, "itemDescription"),
                "type": value_of(row, "typeLabel"),
                "city": value_of(row, "cityLabel"),
            })

        return institutions

    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return []
|
|
|
|
|
|
def _qid_number(value: str) -> int | None:
    """Return the numeric part of a Wikidata QID string, or None if malformed."""
    if not value.startswith("Q"):
        return None
    try:
        return int(value[1:])
    except ValueError:
        # e.g. "Q" alone or "Qabc": the old inline int(...) crashed main()
        # with an uncaught ValueError; treat it as "no valid QID" instead.
        return None


def _has_real_wikidata_qid(inst: dict[str, Any]) -> bool:
    """True if *inst* carries a plausible (non-placeholder) Wikidata QID.

    QIDs with a numeric part >= 100,000,000 are treated as placeholders,
    matching the original filter's threshold.
    """
    for id_obj in inst.get("identifiers", []):
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        num = _qid_number(id_obj.get("identifier_value", ""))
        if num is not None and num < 100000000:
            return True
    return False


def _first_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the first location dict of *inst*, or {} if there are none.

    Also guards an explicit empty `locations: []`, which raised IndexError
    under the previous `inst.get('locations', [{}])[0]` form.
    """
    locations = inst.get('locations') or [{}]
    return locations[0]


def main():
    """Run the Brazil fuzzy-matching diagnostic and print a report to stdout.

    Steps: load the enriched YAML dataset, select Brazilian institutions
    lacking a real Wikidata QID, query Wikidata for Brazilian heritage
    institutions, then print best fuzzy-match candidates and a threshold
    sensitivity table.
    """
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"

    print("="*80)
    print("🇧🇷 BRAZIL FUZZY MATCHING DIAGNOSTIC")
    print("="*80)
    print("\n📖 Loading dataset...\n")

    with open(input_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to [] so the
        # list comprehension below doesn't raise TypeError.
        institutions = yaml.safe_load(f) or []

    # Filter Brazilian institutions without a real Wikidata identifier
    brazilian_insts = [
        inst for inst in institutions
        if _first_location(inst).get('country') == 'BR'
        and not _has_real_wikidata_qid(inst)
    ]

    print(f"✅ Found {len(brazilian_insts)} Brazilian institutions without Wikidata\n")

    # Show sample local names (with their normalized forms, for eyeballing
    # why the matcher might be failing)
    print("="*80)
    print("📋 SAMPLE LOCAL BRAZILIAN INSTITUTION NAMES (First 20)")
    print("="*80)
    for i, inst in enumerate(brazilian_insts[:20], 1):
        name = inst.get('name', '')
        city = _first_location(inst).get('city', 'Unknown')
        normalized = normalize_name(name)
        print(f"{i:2}. {name}")
        print(f"    City: {city}")
        print(f"    Normalized: '{normalized}'\n")

    # Query Wikidata
    print("="*80)
    print("🔍 QUERYING WIKIDATA FOR BRAZILIAN INSTITUTIONS")
    print("="*80 + "\n")

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    wd_insts = query_brazilian_institutions(sparql)

    print(f"✅ Found {len(wd_insts)} Brazilian institutions in Wikidata\n")

    # Show sample Wikidata names
    print("="*80)
    print("📋 SAMPLE WIKIDATA BRAZILIAN INSTITUTION NAMES (First 20)")
    print("="*80)
    for i, inst in enumerate(wd_insts[:20], 1):
        name = inst.get('name', '')
        city = inst.get('city', 'Unknown')
        inst_type = inst.get('type', 'Unknown')
        normalized = normalize_name(name)
        print(f"{i:2}. {name} ({inst['qid']})")
        print(f"    City: {city}, Type: {inst_type}")
        print(f"    Normalized: '{normalized}'\n")

    # Find best matches for each local institution (regardless of threshold,
    # so near-misses below 0.85 are visible)
    print("="*80)
    print("📊 BEST MATCHES (Top 20, any score)")
    print("="*80 + "\n")

    best_matches = []

    for inst in brazilian_insts[:30]:  # Check first 30 local institutions
        inst_name = inst.get('name', '')
        if not inst_name:
            continue

        best_score = 0.0
        best_wd = None

        for wd_inst in wd_insts:
            wd_name = wd_inst.get('name', '')
            if not wd_name:
                continue

            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score = score
                best_wd = wd_inst

        if best_wd:
            best_matches.append((inst, best_wd, best_score))

    # Sort by score descending
    best_matches.sort(key=lambda x: x[2], reverse=True)

    for i, (local, wd, score) in enumerate(best_matches[:20], 1):
        local_city = _first_location(local).get('city', 'Unknown')
        wd_city = wd.get('city', 'Unknown')

        print(f"{i:2}. Score: {score:.3f}")
        print(f"    Local: {local.get('name')} ({local_city})")
        print(f"    Wikidata: {wd.get('name')} ({wd['qid']}, {wd_city})")

        # Show whether this pair would match at common thresholds
        if score >= 0.85:
            print("    ✅ Would match at 0.85 threshold")
        elif score >= 0.80:
            print("    ⚠️ Would match at 0.80 threshold")
        elif score >= 0.75:
            print("    ⚠️ Would match at 0.75 threshold")
        else:
            print("    ❌ Below reasonable threshold")
        print()

    # Threshold analysis: how many of the best matches clear each cutoff
    print("="*80)
    print("📊 THRESHOLD ANALYSIS")
    print("="*80 + "\n")

    thresholds = [0.95, 0.90, 0.85, 0.80, 0.75, 0.70]
    for threshold in thresholds:
        matches_at_threshold = len([m for m in best_matches if m[2] >= threshold])
        print(f"Threshold {threshold:.2f}: {matches_at_threshold:3} matches")

    print("\n" + "="*80 + "\n")
|
|
|
|
|
|
# Allow importing this module without side effects; run the diagnostic
# only when executed as a script.
if __name__ == "__main__":
    main()
|