#!/usr/bin/env python3
"""Analyze why Egyptian institutions didn't match with Wikidata."""
import sys
from pathlib import Path

# Make the project's src/ importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

import yaml
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON

from scripts.enrich_egypt_wikidata import (
    query_wikidata_institutions,
    similarity_score,
    normalize_name,
)

# Analysis knobs (previously hard-coded inline).
SAMPLE_SIZE = 10        # how many institutions to analyze
TOP_N = 5               # matches shown per institution
MATCH_THRESHOLD = 0.85  # score at/above which a match counts as good


def _load_institutions(path="data/instances/egypt_institutions.yaml"):
    """Load the institutions list from a YAML file.

    The file may carry front matter; only the final document after the
    last '---' separator is parsed.
    """
    with open(path) as f:
        content = f.read().split('---\n')[-1]
    return yaml.safe_load(content)


def _make_sparql_client():
    """Build a SPARQLWrapper configured for the Wikidata Query Service."""
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    # Wikimedia policy requires a descriptive User-Agent header.
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Analysis)")
    return sparql


def _report_matches(inst, wd_results):
    """Print the top-N fuzzy Wikidata matches for one institution."""
    name = inst.get("name", "")
    inst_type = inst.get("institution_type", "")
    print(f"\nšŸ›ļø {name} ({inst_type})")
    print(f" Normalized: '{normalize_name(name)}'")

    # Score every Wikidata candidate against this institution's name.
    matches = [
        (
            similarity_score(name, wd_data.get("name", "")),
            wd_data.get("name", ""),
            qid,
            wd_data.get("type", ""),
        )
        for qid, wd_data in wd_results.items()
    ]
    # Sort on the score only: sorting the whole tuple would break ties
    # by reverse string comparison of names, which is meaningless here.
    matches.sort(key=lambda m: m[0], reverse=True)

    for i, (score, wd_name, qid, wd_type) in enumerate(matches[:TOP_N], 1):
        marker = "āœ…" if score >= MATCH_THRESHOLD else "āŒ"
        print(f" {marker} {i}. {score:.3f} - {wd_name} ({qid}) [{wd_type}]")
        print(f" Normalized: '{normalize_name(wd_name)}'")


def main():
    """Run the fuzzy-match analysis over a sample of institutions."""
    institutions = _load_institutions()
    sparql = _make_sparql_client()

    print("Querying Wikidata...")
    wd_results = query_wikidata_institutions(sparql)
    print(f"Found {len(wd_results)} Wikidata institutions\n")

    print("="*80)
    print("FUZZY MATCH ANALYSIS (Top 5 matches per institution)")
    print("="*80)

    # Sample the first SAMPLE_SIZE institutions only.
    for inst in institutions[:SAMPLE_SIZE]:
        _report_matches(inst, wd_results)


# Guard so importing this module no longer triggers network I/O.
if __name__ == "__main__":
    main()