glam/scripts/diagnose_egypt_matching.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

88 lines
2.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Diagnose fuzzy matching between Egyptian institutions and Wikidata.
Shows top 3 Wikidata matches for each institution with similarity scores.
"""
import sys
from pathlib import Path
import yaml
import re
from difflib import SequenceMatcher
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy comparison.

    Lowercases the name, strips a leading English article, removes generic
    scope adjectives that carry no identity ("national", "royal", ...),
    folds romanized-Arabic variants to one canonical spelling
    (dar/dār -> "dar", mat(h)af -> "mathaf", maktabat -> "library"),
    drops the Arabic definite-article prefixes "al-"/"el-", and finally
    replaces punctuation with spaces and collapses runs of whitespace.

    Args:
        name: Raw institution name (English or romanized Arabic).

    Returns:
        Normalized lowercase string with single-space separators.
    """
    name = name.lower()
    # Leading English article only (anchored at the start).
    name = re.sub(r'^(the|a|an)\s+', '', name)
    # Generic scope adjectives are noise for identity matching.
    name = re.sub(r'\b(national|regional|central|public|state|royal|great)\b', '', name)
    # Canonicalize romanization variants.
    name = re.sub(r'\b(dar|dār)\b', 'dar', name)
    # 'mat?haf' matches both "mathaf" and "mahaf"; the original pattern's
    # extra 'mathaf' alternative was redundant and is dropped here.
    name = re.sub(r'\bmat?haf\b', 'mathaf', name)
    name = re.sub(r'\b(maktabat)\b', 'library', name)
    # Arabic definite-article prefix; the trailing \b means it is only
    # removed when a word character follows the hyphen.
    name = re.sub(r'\b(al-|el-)\b', '', name)
    # Punctuation -> space, then collapse whitespace.
    name = re.sub(r'[^\w\s]', ' ', name)
    name = ' '.join(name.split())
    return name
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0.0-1.0 fuzzy-match ratio between the two names.

    Both names are passed through ``normalize_name`` first, so the score
    reflects similarity of the canonical forms rather than raw spelling.
    """
    return SequenceMatcher(
        None, normalize_name(name1), normalize_name(name2)
    ).ratio()
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
INSTITUTIONS_FILE = Path("data/instances/egypt_institutions.yaml")
MATCH_THRESHOLD = 0.5  # minimum similarity to report a candidate
TOP_N = 3              # candidates shown per local institution

# NOTE(review): the VALUES QIDs are presumed to be GLAM institution classes
# (museum/library/archive etc.) and Q79 to be Egypt — confirm against Wikidata.
WIKIDATA_QUERY = """
SELECT DISTINCT ?item ?itemLabel ?type WHERE {
  VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q1007870 wd:Q31855 }
  ?item wdt:P31 ?type .
  ?item wdt:P17 wd:Q79 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en,ar" }
}
"""


def _load_institutions(path: Path) -> list:
    """Load the local institution records from the YAML file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _fetch_wikidata_institutions() -> dict:
    """Query Wikidata and return a {QID: label} dict of Egyptian institutions.

    Bindings without a usable label or QID are skipped.
    """
    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setQuery(WIKIDATA_QUERY)
    results = sparql.query().convert()

    wd_institutions = {}
    for binding in results["results"]["bindings"]:
        # Item URI looks like http://www.wikidata.org/entity/Q123 -> "Q123".
        qid = binding["item"]["value"].split("/")[-1]
        label = binding.get("itemLabel", {}).get("value", "")
        if label and qid:
            wd_institutions[qid] = label
    return wd_institutions


def _top_matches(inst_name: str, wd_institutions: dict) -> list:
    """Return the best (score, qid, label) tuples for *inst_name*.

    Only candidates scoring above MATCH_THRESHOLD are considered; the list
    is sorted best-first and truncated to TOP_N entries.
    """
    scores = []
    for qid, wd_name in wd_institutions.items():
        score = similarity_score(inst_name, wd_name)
        if score > MATCH_THRESHOLD:  # Only show promising matches
            scores.append((score, qid, wd_name))
    scores.sort(reverse=True)
    return scores[:TOP_N]


def main() -> None:
    """Print the top Wikidata candidates for every local Egyptian institution."""
    institutions = _load_institutions(INSTITUTIONS_FILE)
    wd_institutions = _fetch_wikidata_institutions()

    print(f"Found {len(wd_institutions)} Wikidata institutions\n")
    print("=" * 100)
    print("DIAGNOSTIC: Top 3 Wikidata matches for each institution")
    print("=" * 100)

    for i, inst in enumerate(institutions, 1):
        inst_name = inst.get("name", "NO NAME")
        print(f"\n{i:2d}. {inst_name}")
        print(f" Normalized: '{normalize_name(inst_name)}'")

        matches = _top_matches(inst_name, wd_institutions)
        for rank, (score, qid, wd_name) in enumerate(matches, 1):
            print(f" {rank}. {score:.3f} - {wd_name} ({qid})")
            print(f" Normalized: '{normalize_name(wd_name)}'")
        if not matches:
            print(" ❌ No matches above 0.5 threshold")


if __name__ == "__main__":
    main()