#!/usr/bin/env python3
"""
Debug script to investigate why University of Sousse (Q3551673) isn't being found.

Tests:
1. Direct query for Q3551673 to see what location/type labels Wikidata returns,
   plus a fuzzy-match check of the location label against "sousse"
2. Check if Q3551673 is in our main SPARQL query results (filtered by name)
3. List all Tunisian universities and pick out the Sousse-related ones
4. Inspect the P131 location hierarchy of Q3551673
"""

import time

import requests
from rapidfuzz import fuzz

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Debug/1.0"


def _value(binding, key, default="N/A"):
    """Return the ``value`` field of *key* in a SPARQL JSON binding, or *default*."""
    return binding.get(key, {}).get("value", default)


def _qid(uri):
    """Return the last path segment of an entity URI (the QID for Wikidata URIs)."""
    return uri.split("/")[-1]


def query_wikidata(query, description):
    """Execute a SPARQL query against the Wikidata endpoint and return its bindings.

    Args:
        query: SPARQL query text.
        description: Human-readable test name printed in the banner.

    Returns:
        The list of result bindings (dicts keyed by variable name), or an
        empty list when the request fails or the response is not valid JSON.
    """
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json',
    }

    print(f"\n{'='*60}")
    print(f"Test: {description}")
    print(f"{'='*60}")

    time.sleep(1.5)  # Rate limiting

    try:
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and undecodable JSON are reported, not raised,
        # so the remaining tests still run. Other exceptions indicate a bug in
        # this script and are allowed to propagate.
        print(f"❌ Error: {e}")
        return []

    bindings = results.get("results", {}).get("bindings", [])
    print(f"Results found: {len(bindings)}")
    return bindings


def test_1_direct_query():
    """Test 1: Query Q3551673 directly to see its location data."""
    # The entity is bound to ?item via VALUES: the label service can only
    # produce ?itemLabel for a variable named ?item that is actually bound.
    query = """
    SELECT ?itemLabel ?locationLabel ?location ?typeLabel WHERE {
      VALUES ?item { wd:Q3551673 }
      ?item wdt:P131 ?location .
      ?item wdt:P31 ?type .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    """

    bindings = query_wikidata(query, "Direct query for Q3551673 (University of Sousse)")

    for binding in bindings:
        item_label = _value(binding, "itemLabel")
        location_label = _value(binding, "locationLabel")
        location_qid = _qid(_value(binding, "location", ""))
        type_label = _value(binding, "typeLabel")

        print(f"\n Institution: {item_label}")
        print(f" Location: {location_label} ({location_qid})")
        print(f" Type: {type_label}")

        # Test fuzzy matching
        city_test = "sousse"
        match_score = fuzz.ratio(city_test.lower(), location_label.lower())
        print("\n Fuzzy match test:")
        print(f" City search: '{city_test}'")
        print(f" Location label: '{location_label}'")
        print(f" Match score: {match_score}%")
        print(f" Passes 70% threshold: {'✅ YES' if match_score >= 70 else '❌ NO'}")


def test_2_main_query():
    """Test 2: Check if Q3551673 appears in our main query."""
    # The name filter runs against rdfs:label rather than ?itemLabel: variables
    # produced by the label service are filled in after the graph pattern is
    # evaluated, so a FILTER on ?itemLabel sees an unbound value and drops
    # every row.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?location ?locationLabel ?typeLabel WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type
      ?item wdt:P31 ?type .

      # Limit to universities
      VALUES ?type {
        wd:Q3918    # University
        wd:Q875538  # Public university
        wd:Q38723   # Higher education institution
      }

      # Add location
      OPTIONAL { ?item wdt:P131 ?location . }

      # Filter for institutions with "Sousse" in name
      ?item rdfs:label ?searchName .
      FILTER(LANG(?searchName) IN ("fr", "ar", "en"))
      FILTER(REGEX(STR(?searchName), "sousse", "i"))

      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 50
    """

    bindings = query_wikidata(query, "Main query filtered for 'Sousse' universities")

    found_q3551673 = False
    for binding in bindings:
        qid = _qid(_value(binding, "item", ""))
        item_label = _value(binding, "itemLabel")
        location_label = _value(binding, "locationLabel")
        type_label = _value(binding, "typeLabel")

        if qid == "Q3551673":
            found_q3551673 = True
            print("\n ✅ FOUND Q3551673!")

        print(f"\n {qid}: {item_label}")
        print(f" Location: {location_label}")
        print(f" Type: {type_label}")

    if not found_q3551673:
        print("\n ❌ Q3551673 NOT found in results")


def test_3_all_tunisia_universities():
    """Test 3: List ALL Tunisian universities to see what we're getting."""
    query = """
    SELECT DISTINCT ?item ?itemLabel ?location ?locationLabel ?typeLabel WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must be a university
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q3918    # University
        wd:Q875538  # Public university
        wd:Q38723   # Higher education institution
      }

      # Add location
      OPTIONAL { ?item wdt:P131 ?location . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 100
    """

    bindings = query_wikidata(query, "ALL Tunisian universities")

    found_q3551673 = False
    sousse_institutions = []

    for binding in bindings:
        qid = _qid(_value(binding, "item", ""))
        item_label = _value(binding, "itemLabel")
        location_label = _value(binding, "locationLabel")
        type_label = _value(binding, "typeLabel")

        if qid == "Q3551673":
            found_q3551673 = True

        # Check for Sousse-related institutions
        if "sousse" in item_label.lower() or "sousse" in location_label.lower():
            sousse_institutions.append({
                'qid': qid,
                'name': item_label,
                'location': location_label,
                'type': type_label,
            })

    print(f"\n Total universities found: {len(bindings)}")
    print(f"\n Sousse-related institutions ({len(sousse_institutions)}):")

    for inst in sousse_institutions:
        marker = "✅" if inst['qid'] == "Q3551673" else " "
        print(f"\n {marker} {inst['qid']}: {inst['name']}")
        print(f" Location: {inst['location']}")
        print(f" Type: {inst['type']}")

    if not found_q3551673:
        print("\n ❌ Q3551673 NOT in results - may be outside LIMIT or not classified as university")


def test_4_location_hierarchy():
    """Test 4: Check location hierarchy for Q3551673."""
    query = """
    SELECT ?locationLabel ?parentLabel WHERE {
      wd:Q3551673 wdt:P131 ?location .
      OPTIONAL { ?location wdt:P131 ?parent . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    """

    bindings = query_wikidata(query, "Location hierarchy for Q3551673")

    for binding in bindings:
        location_label = _value(binding, "locationLabel")
        parent_label = _value(binding, "parentLabel")

        print(f"\n Direct location (P131): {location_label}")
        print(f" Parent location: {parent_label}")


def main():
    """Run every diagnostic test in order, printing results to stdout."""
    print("DEBUGGING: Why University of Sousse (Q3551673) isn't found")
    print("="*60)

    # Run all tests
    test_1_direct_query()
    test_2_main_query()
    test_3_all_tunisia_universities()
    test_4_location_hierarchy()

    print("\n" + "="*60)
    print("DEBUGGING COMPLETE")
    print("="*60)


if __name__ == '__main__':
    main()