- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
134 lines · 4.6 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Test the actual SPARQL query used in the enrichment script for LIBRARY type.
|
|
"""
|
|
|
|
import requests
|
|
import time
|
|
from rapidfuzz import fuzz
|
|
|
|
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
|
|
USER_AGENT = "GLAM-Tunisia-Debug/1.0"
|
|
|
|
# LIBRARY type mapping from the script
|
|
valid_types = {
|
|
'Q7075', # Library
|
|
'Q2668072', # National library
|
|
'Q570116', # Public library
|
|
'Q5193377', # University library
|
|
'Q28564', # Academic library
|
|
'Q1479716', # Regional library
|
|
'Q1622062', # Digital library
|
|
'Q17297735', # Diocesan library
|
|
'Q105338594', # Bibliothèque diocésaine (specific diocesan library subtype)
|
|
}
|
|
|
|
type_values = " ".join([f"wd:{qid}" for qid in valid_types])
|
|
|
|
query = f"""
|
|
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
|
|
?viaf ?isil ?website ?coords ?inception ?itemAltLabel
|
|
?location ?locationLabel
|
|
WHERE {{
|
|
# Must be in Tunisia
|
|
?item wdt:P17 wd:Q948 .
|
|
|
|
# Must have an instance-of type matching our institution type
|
|
?item wdt:P31 ?type .
|
|
|
|
# Filter to relevant types for this institution (server-side filtering)
|
|
VALUES ?type {{ {type_values} }}
|
|
|
|
# Add location (P131: located in administrative territorial entity)
|
|
OPTIONAL {{ ?item wdt:P131 ?location . }}
|
|
|
|
OPTIONAL {{ ?item wdt:P214 ?viaf . }}
|
|
OPTIONAL {{ ?item wdt:P791 ?isil . }}
|
|
OPTIONAL {{ ?item wdt:P856 ?website . }}
|
|
OPTIONAL {{ ?item wdt:P625 ?coords . }}
|
|
OPTIONAL {{ ?item wdt:P571 ?inception . }}
|
|
OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}
|
|
|
|
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
|
|
}}
|
|
LIMIT 200
|
|
"""
|
|
|
|
print("Testing SPARQL query for LIBRARY institutions in Tunisia")
|
|
print("=" * 60)
|
|
print(f"Valid types in mapping: {len(valid_types)}")
|
|
print(f"Includes Q105338594: {'Q105338594' in valid_types}")
|
|
print("\nExecuting query...")
|
|
|
|
headers = {'User-Agent': USER_AGENT}
|
|
params = {
|
|
'query': query,
|
|
'format': 'json'
|
|
}
|
|
|
|
time.sleep(1.5)
|
|
response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
|
|
response.raise_for_status()
|
|
|
|
results = response.json()
|
|
bindings = results.get("results", {}).get("bindings", [])
|
|
|
|
print(f"Found {len(bindings)} LIBRARY institutions in Tunisia\n")
|
|
|
|
# Look for Diocesan Library
|
|
search_name = "Diocesan Library of Tunis"
|
|
name_lower = search_name.lower()
|
|
|
|
print(f"Searching for: '{search_name}'")
|
|
print("-" * 60)
|
|
|
|
found_diocesan = False
|
|
for binding in bindings:
|
|
item_uri = binding.get("item", {}).get("value", "")
|
|
qid = item_uri.split("/")[-1] if item_uri else ""
|
|
item_label = binding.get("itemLabel", {}).get("value", "")
|
|
type_uri = binding.get("type", {}).get("value", "")
|
|
type_qid = type_uri.split("/")[-1] if type_uri else ""
|
|
type_label = binding.get("typeLabel", {}).get("value", "")
|
|
|
|
# Check if this is the Diocesan Library
|
|
if qid == "Q28149782":
|
|
found_diocesan = True
|
|
print(f"✅ FOUND Q28149782 in query results!")
|
|
print(f" Label: {item_label}")
|
|
print(f" Type: {type_qid} ({type_label})")
|
|
print(f" Location: {binding.get('locationLabel', {}).get('value', 'N/A')}")
|
|
|
|
# Test fuzzy matching
|
|
label_score = fuzz.ratio(name_lower, item_label.lower())
|
|
partial_score = fuzz.partial_ratio(name_lower, item_label.lower())
|
|
token_score = fuzz.token_set_ratio(name_lower, item_label.lower())
|
|
best_score = max(label_score, partial_score, token_score)
|
|
|
|
print(f"\n Fuzzy match scores:")
|
|
print(f" Ratio: {label_score}%")
|
|
print(f" Partial: {partial_score}%")
|
|
print(f" Token set: {token_score}%")
|
|
print(f" BEST: {best_score}%")
|
|
|
|
if best_score >= 65:
|
|
print(f" ✅ Would match (>= 65% threshold)")
|
|
else:
|
|
print(f" ❌ Below 65% threshold")
|
|
|
|
if not found_diocesan:
|
|
print("❌ Q28149782 NOT in query results")
|
|
print("\nPossible reasons:")
|
|
print(" 1. Type filtering excluded it (but Q105338594 is in VALUES)")
|
|
print(" 2. Country filter issue (but we confirmed P17 = Q948 Tunisia)")
|
|
print(" 3. SPARQL service issue")
|
|
|
|
print("\n" + "=" * 60)
|
|
print("All libraries found:")
|
|
for binding in bindings:
|
|
item_uri = binding.get("item", {}).get("value", "")
|
|
qid = item_uri.split("/")[-1] if item_uri else ""
|
|
item_label = binding.get("itemLabel", {}).get("value", "")
|
|
type_uri = binding.get("type", {}).get("value", "")
|
|
type_qid = type_uri.split("/")[-1] if type_uri else ""
|
|
print(f" {qid}: {item_label} [{type_qid}]")
|