- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
222 lines
7.3 KiB
Python
222 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Debug script to investigate why University of Sousse (Q3551673) isn't being found.
|
|
|
|
Tests:
|
|
1. Direct query for Q3551673 to see what location labels Wikidata returns
|
|
2. Check if Q3551673 is in our main SPARQL query results
|
|
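3. List all Tunisian universities and pick out the Sousse-related ones
4. Inspect the P131 location hierarchy of Q3551673 (fuzzy matching of the city name against location labels is exercised in test 1)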
|
|
"""
|
|
|
|
import requests
|
|
import time
|
|
from rapidfuzz import fuzz
|
|
|
|
# Public Wikidata Query Service endpoint that every SPARQL request is sent to.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Value of the User-Agent header attached to each request.
USER_AGENT = "GLAM-Tunisia-Debug/1.0"
|
|
|
|
def query_wikidata(query: str, description: str, *, delay: float = 1.5, timeout: float = 60) -> list:
    """Run a SPARQL query against SPARQL_ENDPOINT and return its result bindings.

    Prints a banner containing ``description``, sleeps for ``delay`` seconds,
    sends ``query`` as an HTTP GET asking for JSON, and prints how many
    bindings came back.

    Args:
        query: SPARQL query text.
        description: Human-readable label printed in the banner.
        delay: Seconds to sleep before the request (rate limiting).
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The list found under ``results.bindings`` in the JSON response, or an
        empty list when the request fails or the body is not valid JSON.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Test: {description}")
    print(banner)

    # Rate limiting: pause before every request so back-to-back tests do not
    # hit the endpoint in rapid succession.
    time.sleep(delay)

    try:
        response = requests.get(
            SPARQL_ENDPOINT,
            params={"query": query, "format": "json"},
            headers={"User-Agent": USER_AGENT},
            timeout=timeout,
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP-status errors;
        # ValueError covers a response body that cannot be decoded as JSON.
        # Anything else is a programming error and should surface.
        print(f"❌ Error: {e}")
        return []

    bindings = payload.get("results", {}).get("bindings", [])
    print(f"Results found: {len(bindings)}")
    return bindings
|
|
|
|
def test_1_direct_query():
    """Test 1: Query Q3551673 directly and fuzzy-match its location labels.

    For every (location, type) row returned, prints the institution label,
    the P131 location label with its QID, the P31 type label, and the
    ``fuzz.ratio`` score of the location label against the city "sousse",
    including whether that score reaches the 70% threshold.
    """
    # ?item is bound explicitly through VALUES: the label service can only
    # produce ?itemLabel for a variable named ?item that is actually bound.
    # With the QID written inline in the triples, ?itemLabel stays empty and
    # the institution is always printed as "N/A".
    query = """
    SELECT ?itemLabel ?locationLabel ?location ?typeLabel WHERE {
      VALUES ?item { wd:Q3551673 }
      ?item wdt:P131 ?location .
      ?item wdt:P31 ?type .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    """

    bindings = query_wikidata(query, "Direct query for Q3551673 (University of Sousse)")

    city_test = "sousse"

    for binding in bindings:
        item_label = binding.get("itemLabel", {}).get("value", "N/A")
        location_label = binding.get("locationLabel", {}).get("value", "N/A")
        # The location value is an entity URI; its last path segment is the QID.
        location_qid = binding.get("location", {}).get("value", "").split("/")[-1]
        type_label = binding.get("typeLabel", {}).get("value", "N/A")

        print(f"\n Institution: {item_label}")
        print(f" Location: {location_label} ({location_qid})")
        print(f" Type: {type_label}")

        # Test fuzzy matching (case-insensitive, whole-string ratio)
        match_score = fuzz.ratio(city_test.lower(), location_label.lower())
        print("\n Fuzzy match test:")
        print(f" City search: '{city_test}'")
        print(f" Location label: '{location_label}'")
        print(f" Match score: {match_score}%")
        print(f" Passes 70% threshold: {'✅ YES' if match_score >= 70 else '❌ NO'}")
|
|
|
|
def test_2_main_query():
    """Test 2: Check if Q3551673 appears in our main query.

    Runs the Tunisia/university query restricted to items whose label
    contains "sousse", prints every row, and reports whether Q3551673 is
    among them.
    """
    # The name filter matches on rdfs:label rather than ?itemLabel.
    # ?itemLabel is filled in by the label service only after the rest of the
    # query has been evaluated, so a FILTER on it sees an unbound variable and
    # rejects every row.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?location ?locationLabel ?typeLabel
    WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type
      ?item wdt:P31 ?type .

      # Limit to universities
      VALUES ?type {
        wd:Q3918    # University
        wd:Q875538  # Public university
        wd:Q38723   # Higher education institution
      }

      # Add location
      OPTIONAL { ?item wdt:P131 ?location . }

      # Filter for institutions with "Sousse" in name
      ?item rdfs:label ?name .
      FILTER(REGEX(?name, "sousse", "i"))

      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 50
    """

    bindings = query_wikidata(query, "Main query filtered for 'Sousse' universities")

    found_q3551673 = False

    for binding in bindings:
        # The item value is an entity URI; its last path segment is the QID.
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1]
        item_label = binding.get("itemLabel", {}).get("value", "N/A")
        location_label = binding.get("locationLabel", {}).get("value", "N/A")
        type_label = binding.get("typeLabel", {}).get("value", "N/A")

        if qid == "Q3551673":
            found_q3551673 = True
            print("\n ✅ FOUND Q3551673!")

        print(f"\n {qid}: {item_label}")
        print(f" Location: {location_label}")
        print(f" Type: {type_label}")

    if not found_q3551673:
        print("\n ❌ Q3551673 NOT found in results")
|
|
|
|
def test_3_all_tunisia_universities():
    """Test 3: List ALL Tunisian universities to see what we're getting.

    Prints the number of distinct institutions returned, then every result
    row whose name or location label mentions "sousse" (Q3551673 is marked
    with a check mark), and finally a warning when Q3551673 is absent.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?location ?locationLabel ?typeLabel
    WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must be a university
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q3918    # University
        wd:Q875538  # Public university
        wd:Q38723   # Higher education institution
      }

      # Add location
      OPTIONAL { ?item wdt:P131 ?location . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 100
    """

    bindings = query_wikidata(query, "ALL Tunisian universities")

    # One institution yields one row per (location, type) combination, so the
    # number of institutions is the number of distinct QIDs, not of rows.
    seen_qids = set()
    sousse_institutions = []

    for binding in bindings:
        # The item value is an entity URI; its last path segment is the QID.
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1]
        item_label = binding.get("itemLabel", {}).get("value", "N/A")
        location_label = binding.get("locationLabel", {}).get("value", "N/A")
        type_label = binding.get("typeLabel", {}).get("value", "N/A")

        seen_qids.add(qid)

        # Check for Sousse-related institutions
        if "sousse" in item_label.lower() or "sousse" in location_label.lower():
            sousse_institutions.append({
                'qid': qid,
                'name': item_label,
                'location': location_label,
                'type': type_label,
            })

    print(f"\n Total universities found: {len(seen_qids)}")
    print(f"\n Sousse-related institutions ({len(sousse_institutions)}):")

    for inst in sousse_institutions:
        marker = "✅" if inst['qid'] == "Q3551673" else " "
        print(f"\n {marker} {inst['qid']}: {inst['name']}")
        print(f" Location: {inst['location']}")
        print(f" Type: {inst['type']}")

    if "Q3551673" not in seen_qids:
        print("\n ❌ Q3551673 NOT in results - may be outside LIMIT or not classified as university")
|
|
|
|
def test_4_location_hierarchy():
    """Test 4: Print each P131 location of Q3551673 with that location's P131 parent.

    A missing label (for example when the location has no parent) is shown
    as "N/A".
    """
    sparql = """
    SELECT ?locationLabel ?parentLabel WHERE {
      wd:Q3551673 wdt:P131 ?location .
      OPTIONAL { ?location wdt:P131 ?parent . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    """

    def label_of(row, key):
        # Each binding maps a variable name to a dict carrying a "value" entry.
        return row.get(key, {}).get("value", "N/A")

    for row in query_wikidata(sparql, "Location hierarchy for Q3551673"):
        direct = label_of(row, "locationLabel")
        parent = label_of(row, "parentLabel")
        print(f"\n Direct location (P131): {direct}")
        print(f" Parent location: {parent}")
|
|
|
|
def main():
    """Run the four debug tests in order, framed by a header and a footer."""
    rule = "=" * 60

    print("DEBUGGING: Why University of Sousse (Q3551673) isn't found")
    print(rule)

    # Run all tests, in the same order as their numbering.
    steps = (
        test_1_direct_query,
        test_2_main_query,
        test_3_all_tunisia_universities,
        test_4_location_hierarchy,
    )
    for step in steps:
        step()

    print("\n" + rule)
    print("DEBUGGING COMPLETE")
    print(rule)
|
|
|
|
# Only run the debug session when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|