glam/scripts/debug_sousse_wikidata.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

222 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""
Debug script to investigate why University of Sousse (Q3551673) isn't being found.
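Tests:
1. Direct query for Q3551673 to see what location and type labels Wikidata returns,
   plus a fuzzy-match check of each location label against "sousse"
2. Check if Q3551673 is in our main SPARQL query results (filtered on "Sousse")
3. List all Tunisian universities and pick out the Sousse-related institutions
4. Inspect the location hierarchy (P131 and its parent location) for Q3551673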
"""
import requests
import time
from rapidfuzz import fuzz
# URL that query_wikidata() sends every SPARQL request to.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# Value sent as the User-Agent header on every request made by query_wikidata().
USER_AGENT = "GLAM-Tunisia-Debug/1.0"
def query_wikidata(query: str, description: str) -> list:
    """Execute a SPARQL query and return its result bindings.

    Prints a banner containing *description*, sleeps 1.5 seconds as a simple
    rate limit, then sends *query* to ``SPARQL_ENDPOINT`` as a GET request
    asking for JSON output.

    Args:
        query: SPARQL query text.
        description: Human-readable label printed in the banner.

    Returns:
        The list stored under ``results -> bindings`` in the JSON response
        (empty if those keys are absent). An empty list is also returned,
        after printing the error, when the request fails, the server replies
        with an HTTP error status, or the body cannot be decoded as JSON.
    """
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json',
    }

    print(f"\n{'='*60}")
    print(f"Test: {description}")
    print(f"{'='*60}")

    time.sleep(1.5)  # Rate limiting

    # Keep the try body limited to the calls that can fail for external
    # reasons, so that genuine programming errors are not silently swallowed.
    try:
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection, timeout and HTTP-status errors.
        # ValueError: response body that is not valid JSON.
        print(f"❌ Error: {e}")
        return []

    bindings = results.get("results", {}).get("bindings", [])
    print(f"Results found: {len(bindings)}")
    return bindings
def test_1_direct_query():
    """Test 1: Query Q3551673 directly to see its location data.

    For every (location, type) row returned, prints the institution label,
    the location label with its QID, and the type label, then reports the
    ``fuzz.ratio`` score between "sousse" and the lower-cased location label
    and whether that score reaches the 70% threshold.
    """
    # ?item has to be a bound variable: the label service derives ?itemLabel
    # from ?item, so writing wd:Q3551673 inline in the triples would leave
    # ?itemLabel unbound and the institution name would always show as "N/A".
    query = """
    SELECT ?itemLabel ?locationLabel ?location ?typeLabel WHERE {
      VALUES ?item { wd:Q3551673 }
      ?item wdt:P131 ?location .
      ?item wdt:P31 ?type .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    """

    bindings = query_wikidata(query, "Direct query for Q3551673 (University of Sousse)")

    if not bindings:
        # Both triples are mandatory, so a missing P131 or P31 statement
        # (or a failed request) yields no rows at all.
        print("\n No rows returned - Q3551673 lacks a P131 or P31 statement, or the query failed")
        return

    city_test = "sousse"

    for binding in bindings:
        item_label = binding.get("itemLabel", {}).get("value", "N/A")
        location_label = binding.get("locationLabel", {}).get("value", "N/A")
        # The location value is an entity URI; its last path segment is the QID.
        location_qid = binding.get("location", {}).get("value", "").split("/")[-1]
        type_label = binding.get("typeLabel", {}).get("value", "N/A")

        print(f"\n Institution: {item_label}")
        print(f" Location: {location_label} ({location_qid})")
        print(f" Type: {type_label}")

        # Test fuzzy matching
        match_score = fuzz.ratio(city_test.lower(), location_label.lower())
        print(f"\n Fuzzy match test:")
        print(f" City search: '{city_test}'")
        print(f" Location label: '{location_label}'")
        print(f" Match score: {match_score}%")
        print(f" Passes 70% threshold: {'✅ YES' if match_score >= 70 else '❌ NO'}")
def test_2_main_query():
    """Test 2: Check if Q3551673 appears in our main query.

    Runs the university query for Tunisia restricted to items whose French,
    Arabic or English label contains "sousse" (case-insensitive), prints
    every returned row, and reports whether Q3551673 is among them.
    """
    # The name filter works on an explicit rdfs:label binding. Labels produced
    # by the automatic wikibase:label service are filled in only after the
    # rest of the query has run, so a FILTER on ?itemLabel sees an unbound
    # variable and discards every row.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?location ?locationLabel ?typeLabel
    WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .
      # Must have an instance-of type
      ?item wdt:P31 ?type .
      # Limit to universities
      VALUES ?type {
        wd:Q3918    # University
        wd:Q875538  # Public university
        wd:Q38723   # Higher education institution
      }
      # Add location
      OPTIONAL { ?item wdt:P131 ?location . }
      # Filter for institutions with "Sousse" in name
      ?item rdfs:label ?name .
      FILTER(LANG(?name) IN ("fr", "ar", "en"))
      FILTER(REGEX(STR(?name), "sousse", "i"))
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 50
    """

    bindings = query_wikidata(query, "Main query filtered for 'Sousse' universities")

    found_q3551673 = False
    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        # The item value is an entity URI; its last path segment is the QID.
        qid = item_uri.split("/")[-1]
        item_label = binding.get("itemLabel", {}).get("value", "N/A")
        location_label = binding.get("locationLabel", {}).get("value", "N/A")
        type_label = binding.get("typeLabel", {}).get("value", "N/A")

        if qid == "Q3551673":
            found_q3551673 = True
            print(f"\n ✅ FOUND Q3551673!")

        print(f"\n {qid}: {item_label}")
        print(f" Location: {location_label}")
        print(f" Type: {type_label}")

    if not found_q3551673:
        print(f"\n ❌ Q3551673 NOT found in results")
def test_3_all_tunisia_universities():
    """Test 3: List ALL Tunisian universities to see what we're getting.

    Runs the university query for Tunisia without any name filter (capped at
    100 rows), collects the rows whose item label or location label contains
    "sousse", prints them with the target Q3551673 visibly flagged, and
    reports when Q3551673 is absent from the result set.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?location ?locationLabel ?typeLabel
    WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .
      # Must be a university
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q3918    # University
        wd:Q875538  # Public university
        wd:Q38723   # Higher education institution
      }
      # Add location
      OPTIONAL { ?item wdt:P131 ?location . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 100
    """

    bindings = query_wikidata(query, "ALL Tunisian universities")

    found_q3551673 = False
    sousse_institutions = []

    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        # The item value is an entity URI; its last path segment is the QID.
        qid = item_uri.split("/")[-1]
        item_label = binding.get("itemLabel", {}).get("value", "N/A")
        location_label = binding.get("locationLabel", {}).get("value", "N/A")
        type_label = binding.get("typeLabel", {}).get("value", "N/A")

        if qid == "Q3551673":
            found_q3551673 = True

        # Check for Sousse-related institutions
        if "sousse" in item_label.lower() or "sousse" in location_label.lower():
            sousse_institutions.append({
                'qid': qid,
                'name': item_label,
                'location': location_label,
                'type': type_label,
            })

    print(f"\n Total universities found: {len(bindings)}")
    print(f"\n Sousse-related institutions ({len(sousse_institutions)}):")

    for inst in sousse_institutions:
        # Flag the target institution so it stands out in the listing; an
        # empty marker would make it indistinguishable from the other rows.
        marker = "✅" if inst['qid'] == "Q3551673" else "  "
        print(f"\n {marker} {inst['qid']}: {inst['name']}")
        print(f" Location: {inst['location']}")
        print(f" Type: {inst['type']}")

    if not found_q3551673:
        print(f"\n ❌ Q3551673 NOT in results - may be outside LIMIT or not classified as university")
def test_4_location_hierarchy():
    """Test 4: Show where Q3551673 is located and what that place belongs to.

    Fetches the P131 value of Q3551673 together with that location's own
    optional P131 value, then prints both labels for each returned row,
    using "N/A" for any label missing from a row.
    """
    # Query text is kept verbatim, hence the unindented lines.
    hierarchy_query = """
SELECT ?locationLabel ?parentLabel WHERE {
wd:Q3551673 wdt:P131 ?location .
OPTIONAL { ?location wdt:P131 ?parent . }
SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
}
"""

    def label_of(row, key):
        """Return the value stored under *key* in a result row, or "N/A"."""
        return row.get(key, {}).get("value", "N/A")

    rows = query_wikidata(hierarchy_query, "Location hierarchy for Q3551673")
    for row in rows:
        direct = label_of(row, "locationLabel")
        parent = label_of(row, "parentLabel")
        print(f"\n Direct location (P131): {direct}")
        print(f" Parent location: {parent}")
def main():
    """Run the four diagnostic tests in order between start and end banners."""
    rule = "=" * 60

    print("DEBUGGING: Why University of Sousse (Q3551673) isn't found")
    print(rule)

    # Run all tests
    for run_test in (
        test_1_direct_query,
        test_2_main_query,
        test_3_all_tunisia_universities,
        test_4_location_hierarchy,
    ):
        run_test()

    print(f"\n{rule}")
    print("DEBUGGING COMPLETE")
    print(rule)


# Only run the tests when executed as a script, not when imported.
if __name__ == "__main__":
    main()