glam/scripts/analyze_egypt_matches.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
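The identifier-pattern coverage described above (ISIL, Wikidata, VIAF) can be sketched roughly as follows. This is an illustrative assumption only: the regexes, the `extract_identifiers` helper, and the sample values are made up here and are not the project's actual `InstitutionExtractor` API.

```python
import re

# Illustrative sketch only: these patterns and names are assumptions,
# not the project's actual InstitutionExtractor implementation.
PATTERNS = {
    "wikidata": re.compile(r"\bQ\d+\b"),                      # Wikidata QIDs, e.g. Q42
    "viaf": re.compile(r"\bviaf/(\d+)\b"),                    # VIAF paths, e.g. viaf/128572786
    "isil": re.compile(r"\b[A-Z]{1,4}-[A-Za-z0-9/:\-]+\b"),   # ISIL codes, e.g. DE-611
}

def extract_identifiers(text: str) -> dict[str, list[str]]:
    """Return all identifier matches found in free text, keyed by scheme."""
    return {scheme: pat.findall(text) for scheme, pat in PATTERNS.items()}

# Made-up sample values for demonstration:
sample = "The Egyptian Museum (Q5137709) is listed as viaf/128572786 with ISIL code EG-CAIEM."
ids = extract_identifiers(sample)
print(ids["wikidata"], ids["viaf"], ids["isil"])
# → ['Q5137709'] ['128572786'] ['EG-CAIEM']
```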
2025-11-19 23:20:47 +01:00


#!/usr/bin/env python3
"""Analyze why Egyptian institutions didn't match with Wikidata."""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
import yaml
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
from scripts.enrich_egypt_wikidata import (
    query_wikidata_institutions,
    similarity_score,
    normalize_name,
)
# Load institutions
with open("data/instances/egypt_institutions.yaml") as f:
    # Skip the YAML front matter and keep only the document body
    content = f.read().split('---\n')[-1]
institutions = yaml.safe_load(content)
# Query Wikidata
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(SPARQL_JSON)
sparql.setMethod('POST')
sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Analysis)")
print("Querying Wikidata...")
wd_results = query_wikidata_institutions(sparql)
print(f"Found {len(wd_results)} Wikidata institutions\n")
# Analyze each institution
print("="*80)
print("FUZZY MATCH ANALYSIS (Top 5 matches per institution)")
print("="*80)
for inst in institutions[:10]:  # Sample the first 10 institutions
    name = inst.get("name", "")
    inst_type = inst.get("institution_type", "")
    print(f"\n🏛️ {name} ({inst_type})")
    print(f"   Normalized: '{normalize_name(name)}'")
    # Score this institution against every Wikidata candidate
    matches = []
    for qid, wd_data in wd_results.items():
        wd_name = wd_data.get("name", "")
        score = similarity_score(name, wd_name)
        matches.append((score, wd_name, qid, wd_data.get("type", "")))
    matches.sort(reverse=True)
    # Report the top 5 candidates, flagging scores above the match threshold
    for i, (score, wd_name, qid, wd_type) in enumerate(matches[:5], 1):
        marker = "✓" if score >= 0.85 else " "
        print(f"  {marker} {i}. {score:.3f} - {wd_name} ({qid}) [{wd_type}]")
        print(f"       Normalized: '{normalize_name(wd_name)}'")