#!/usr/bin/env python3
"""Diagnose fuzzy matching between Egyptian institutions and Wikidata.

Shows the top 3 Wikidata matches for each local institution together with
their similarity scores, so that matching-threshold problems can be
inspected by eye.
"""

import re
import sys
from difflib import SequenceMatcher
from pathlib import Path

# SPARQL: Egyptian (P17 = Q79) instances of museum/library/archive-like
# types, with English/Arabic labels.
WIKIDATA_QUERY = """
SELECT DISTINCT ?item ?itemLabel ?type WHERE {
  VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q1007870 wd:Q31855 }
  ?item wdt:P31 ?type .
  ?item wdt:P17 wd:Q79 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en,ar" }
}
"""


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy comparison.

    Lowercases, drops a leading English article and generic qualifier
    words, unifies common Arabic transliteration variants (dar/dār,
    mathaf variants, maktabat -> library), removes al-/el- prefixes,
    replaces punctuation with spaces, and collapses whitespace.  Type
    words such as "museum"/"library" are deliberately preserved.
    """
    name = name.lower()
    # A leading article carries no matching signal.
    name = re.sub(r'^(the|a|an)\s+', '', name)
    # Generic qualifiers appear inconsistently between the two sources.
    name = re.sub(r'\b(national|regional|central|public|state|royal|great)\b', '', name)
    # Unify Arabic transliteration variants.
    name = re.sub(r'\b(dar|dār)\b', 'dar', name)
    name = re.sub(r'\bmat?haf\b', 'mathaf', name)  # matches "mathaf" and "mahaf"
    name = re.sub(r'\bmaktabat\b', 'library', name)
    name = re.sub(r'\b(al-|el-)\b', '', name)
    # Remaining punctuation becomes a space, then runs are collapsed.
    name = re.sub(r'[^\w\s]', ' ', name)
    return ' '.join(name.split())


def similarity_score(name1: str, name2: str) -> float:
    """Return the SequenceMatcher ratio (0.0-1.0) of the normalized names."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def main() -> None:
    """Load local institutions, query Wikidata, and print top-3 matches."""
    # The src/ path insert precedes the third-party imports, preserving
    # the original resolution order; the imports live here (not at module
    # top) so the pure-stdlib helpers above remain importable for testing
    # without these packages installed.
    sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
    import yaml
    from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON

    # Load the locally curated institution list.
    with open('data/instances/egypt_institutions.yaml', 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # Fetch candidate institutions from Wikidata.
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setQuery(WIKIDATA_QUERY)
    results = sparql.query().convert()

    # qid -> label for every labelled Wikidata institution.
    wd_institutions = {}
    for result in results["results"]["bindings"]:
        qid = result["item"]["value"].split("/")[-1]
        name = result.get("itemLabel", {}).get("value", "")
        if name and qid:
            wd_institutions[qid] = name

    print(f"Found {len(wd_institutions)} Wikidata institutions\n")
    print("=" * 100)
    print("DIAGNOSTIC: Top 3 Wikidata matches for each institution")
    print("=" * 100)

    for i, inst in enumerate(institutions, 1):
        inst_name = inst.get("name", "NO NAME")
        print(f"\n{i:2d}. {inst_name}")
        print(f"    Normalized: '{normalize_name(inst_name)}'")

        # Score every Wikidata candidate; keep only promising matches.
        scores = []
        for qid, wd_name in wd_institutions.items():
            score = similarity_score(inst_name, wd_name)
            if score > 0.5:  # Only show promising matches
                scores.append((score, qid, wd_name))

        # Sort by score (descending) and show the top 3.
        scores.sort(reverse=True)
        for rank, (score, qid, wd_name) in enumerate(scores[:3], 1):
            print(f"   {rank}. {score:.3f} - {wd_name} ({qid})")
            print(f"      Normalized: '{normalize_name(wd_name)}'")
        if not scores:
            print("   ❌ No matches above 0.5 threshold")


if __name__ == "__main__":
    main()