- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
88 lines · 2.9 KiB · Python · Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Diagnose fuzzy matching between Egyptian institutions and Wikidata.
|
|
Shows top 3 Wikidata matches for each institution with similarity scores.
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
import yaml
|
|
import re
|
|
from difflib import SequenceMatcher
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
|
|
|
|
def normalize_name(name: str) -> str:
    """Improved normalization that preserves type words.

    Lowercases the name, strips a leading English article, drops
    non-discriminating qualifiers, folds Arabic transliteration
    variants onto canonical forms, removes the Arabic definite-article
    prefix, replaces punctuation with spaces, and squeezes whitespace.
    """
    # Ordered substitution rules; order matters (e.g. 'al-' must be
    # removed before punctuation is blanked, while the hyphen exists).
    rules = (
        (r'^(the|a|an)\s+', ''),        # leading English article
        (r'\b(national|regional|central|public|state|royal|great)\b', ''),
        (r'\b(dar|dār)\b', 'dar'),      # transliteration variants of "dar"
        (r'\b(mathaf|mat?haf)\b', 'mathaf'),
        (r'\b(maktabat)\b', 'library'),  # Arabic word for "library"
        (r'\b(al-|el-)\b', ''),         # Arabic definite article
        (r'[^\w\s]', ' '),              # punctuation -> space
    )
    text = name.lower()
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    # Collapse runs of whitespace left behind by the removals.
    return ' '.join(text.split())
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names.

    Both names are normalized first so that articles, qualifiers and
    transliteration variants do not depress the ratio.
    Returns a float in [0.0, 1.0] from difflib.SequenceMatcher.
    """
    left, right = normalize_name(name1), normalize_name(name2)
    return SequenceMatcher(None, left, right).ratio()
|
|
|
|
# Load local institutions
with open('data/instances/egypt_institutions.yaml', 'r', encoding='utf-8') as handle:
    institutions = yaml.safe_load(handle)

# Query Wikidata for institutions of selected types (see VALUES clause)
# located in Egypt (wd:Q79), with English/Arabic labels.
endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
endpoint.setReturnFormat(SPARQL_JSON)
endpoint.setQuery("""
SELECT DISTINCT ?item ?itemLabel ?type WHERE {
VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q1007870 wd:Q31855 }
?item wdt:P31 ?type .
?item wdt:P17 wd:Q79 .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en,ar" }
}
""")
response = endpoint.query().convert()

# Build Wikidata institutions dict: QID -> label (skip unlabeled rows).
wd_institutions = {}
for binding in response["results"]["bindings"]:
    qid = binding["item"]["value"].rsplit("/", 1)[-1]
    label = binding.get("itemLabel", {}).get("value", "")
    if label and qid:
        wd_institutions[qid] = label

print(f"Found {len(wd_institutions)} Wikidata institutions\n")
print("=" * 100)
print("DIAGNOSTIC: Top 3 Wikidata matches for each institution")
print("=" * 100)

for idx, record in enumerate(institutions, 1):
    local_name = record.get("name", "NO NAME")
    print(f"\n{idx:2d}. {local_name}")
    print(f"    Normalized: '{normalize_name(local_name)}'")

    # Score every Wikidata candidate against this local institution,
    # keeping only promising matches (similarity above 0.5).
    candidates = []
    for qid, wd_name in wd_institutions.items():
        score = similarity_score(local_name, wd_name)
        if score > 0.5:
            candidates.append((score, qid, wd_name))

    # Report the three best candidates, best first.
    for rank, (score, qid, wd_name) in enumerate(sorted(candidates, reverse=True)[:3], 1):
        print(f"   {rank}. {score:.3f} - {wd_name} ({qid})")
        print(f"      Normalized: '{normalize_name(wd_name)}'")

    if not candidates:
        print("   ❌ No matches above 0.5 threshold")
|
|