- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Analyze why Egyptian institutions didn't match with Wikidata."""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
import yaml
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
|
|
from scripts.enrich_egypt_wikidata import (
|
|
query_wikidata_institutions,
|
|
similarity_score,
|
|
normalize_name
|
|
)
|
|
|
|
# Load the local institution records from the YAML instance file.
# The file may carry a frontmatter header; everything after the final
# '---\n' separator is treated as the YAML document proper.
# encoding="utf-8" is explicit: institution names contain non-ASCII
# characters, and the platform default encoding is not guaranteed.
with open("data/instances/egypt_institutions.yaml", encoding="utf-8") as f:
    content = f.read().split('---\n')[-1]
institutions = yaml.safe_load(content)
|
|
|
|
# Configure a client for the public Wikidata Query Service.
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"

sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)
# Identify this tool per the WDQS usage policy.
sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Analysis)")
# POST keeps long SPARQL queries out of the request URL.
sparql.setMethod('POST')
sparql.setReturnFormat(SPARQL_JSON)
|
|
|
|
# Fetch candidate institution records from Wikidata, keyed by QID.
# query_wikidata_institutions is the same helper the enrichment script
# uses, so the candidate pool here matches what matching actually saw.
print("Querying Wikidata...")
wd_results = query_wikidata_institutions(sparql)
print(f"Found {len(wd_results)} Wikidata institutions\n")
|
|
|
|
# Emit the report banner for the per-institution match analysis.
separator = "=" * 80
print(separator)
print("FUZZY MATCH ANALYSIS (Top 5 matches per institution)")
print(separator)
|
|
|
|
# Walk a sample of local institutions and show, for each, the five
# highest-scoring Wikidata candidates so near-misses can be inspected.
SAMPLE_SIZE = 10
MATCH_THRESHOLD = 0.85  # cutoff used to accept a match downstream

for record in institutions[:SAMPLE_SIZE]:
    local_name = record.get("name", "")
    local_type = record.get("institution_type", "")

    print(f"\n🏛️ {local_name} ({local_type})")
    print(f" Normalized: '{normalize_name(local_name)}'")

    # Score every Wikidata candidate against this institution's name.
    scored = [
        (
            similarity_score(local_name, candidate.get("name", "")),
            candidate.get("name", ""),
            qid,
            candidate.get("type", ""),
        )
        for qid, candidate in wd_results.items()
    ]

    # Highest score first; ties fall back to the remaining tuple fields
    # (candidate name, then QID), matching a plain reverse tuple sort.
    top_five = sorted(scored, reverse=True)[:5]
    for rank, (score, cand_name, qid, cand_type) in enumerate(top_five, 1):
        marker = "✅" if score >= MATCH_THRESHOLD else "❌"
        print(f" {marker} {rank}. {score:.3f} - {cand_name} ({qid}) [{cand_type}]")
        print(f" Normalized: '{normalize_name(cand_name)}'")
|
|
|