#!/usr/bin/env python3
"""
Debug version with verbose logging to see exactly what's happening.

Runs one SPARQL query against the Wikidata endpoint, looks for a single
target item in the results and then replays the type / location / name
matching steps for that item, printing every intermediate value.
"""

import time

import requests
from rapidfuzz import fuzz

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Debug/1.0"

# Valid types for UNIVERSITY
UNIVERSITY_TYPES = {
    'Q3918',      # University
    'Q875538',    # Public university
    'Q2467461',   # Private university
    'Q15936437',  # Research university
    'Q38723',     # Higher education institution
    'Q3354859',   # Technical university
}

# Defaults reproduce the case this script was written to debug.
DEFAULT_NAME = "University of Sousse"
DEFAULT_CITY = "Sousse"
DEFAULT_TARGET_QID = "Q3551673"
INSTITUTION_TYPE = "UNIVERSITY"

# Minimum fuzzy score (0-100) for both the location and the name check.
MATCH_THRESHOLD = 70

# NOTE: the line breaks matter. SPARQL '#' comments run to end of line, so
# this query must not be collapsed onto a single line.
# NOTE(review): the VALUES list below is narrower than UNIVERSITY_TYPES;
# left as-is because widening it changes which rows come back.
SPARQL_QUERY = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
           ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
           ?location ?locationLabel
    WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type
      ?item wdt:P31 ?type .

      # Limit to universities
      VALUES ?type {
        wd:Q3918     # University
        wd:Q875538   # Public university
        wd:Q38723    # Higher education institution
      }

      # Add location (P131: located in administrative territorial entity)
      OPTIONAL { ?item wdt:P131 ?location . }

      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL {
        ?item skos:altLabel ?itemAltLabel .
        FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en"))
      }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "fr,ar,en" .
      }
    }
    LIMIT 200
"""


def _binding_value(binding, key, default=""):
    """Return ``binding[key]["value"]``, or *default* if either is missing."""
    return (binding.get(key) or {}).get("value", default)


def _fetch_bindings():
    """Execute SPARQL_QUERY and return the list of result bindings."""
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': SPARQL_QUERY,
        'format': 'json',
    }

    print("Executing SPARQL query...")
    time.sleep(1.5)

    response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
    response.raise_for_status()

    return response.json().get("results", {}).get("bindings", [])


def _find_binding(bindings, qid):
    """Return the first binding whose item URI ends in *qid*, else None."""
    # The query selects ?type, ?location and ?itemAltLabel alongside ?item,
    # so one item may occupy several rows; only the first row is inspected.
    return next(
        (
            row for row in bindings
            if _binding_value(row, "item").split("/")[-1] == qid
        ),
        None,
    )


def _validate_entity_type(binding):
    """Step 1: report whether the binding's type QID is in UNIVERSITY_TYPES."""
    print("\n--- STEP 1: Entity Type Validation ---")
    entity_type_uri = _binding_value(binding, "type")
    entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None
    type_matches = entity_type_qid in UNIVERSITY_TYPES

    print(f" Entity type QID: {entity_type_qid}")
    print(f" Valid types: {UNIVERSITY_TYPES}")
    print(f" Type matches: {type_matches}")

    if not type_matches:
        print(" ❌ REJECTED: Entity type mismatch")
        return False
    print(" ✅ PASSED: Entity type validated")
    return True


def _validate_location(binding, city_lower, requires_city_match):
    """Step 2: report whether the binding's location label matches the city.

    The check is skipped (and counts as passed) when *city_lower* is empty
    or *requires_city_match* is false.
    """
    print("\n--- STEP 2: Geographic Validation ---")
    print(f" Requires city match: {requires_city_match}")

    if not (city_lower and requires_city_match):
        return True

    location_label = _binding_value(binding, "locationLabel").lower()
    print(f" Location label from Wikidata: '{location_label}'")
    print(f" Expected city: '{city_lower}'")

    if not location_label:
        print(" ❌ REJECTED: No location data")
        return False
    print(" ✅ Has location data")

    location_match = fuzz.ratio(city_lower, location_label)
    print(f" Location fuzzy match: {location_match}%")
    print(f" Threshold: {MATCH_THRESHOLD}%")

    if location_match < MATCH_THRESHOLD:
        print(" ❌ REJECTED: Location match below threshold")
        return False
    print(" ✅ PASSED: Location validated")
    return True


def _validate_name(binding, name_lower):
    """Step 3: report whether the binding's label fuzzily matches the name."""
    print("\n--- STEP 3: Name Fuzzy Matching ---")
    item_label = _binding_value(binding, "itemLabel").lower()
    print(f" Search name: '{name_lower}'")
    print(f" Wikidata label: '{item_label}'")

    label_score = fuzz.ratio(name_lower, item_label)
    partial_score = fuzz.partial_ratio(name_lower, item_label)
    token_score = fuzz.token_set_ratio(name_lower, item_label)

    print("\n Fuzzy match scores:")
    print(f" Label score: {label_score}%")
    print(f" Partial score: {partial_score}%")
    print(f" Token set score: {token_score}%")

    # The best of the three scorers decides the outcome.
    best_score = max(label_score, partial_score, token_score)
    print(f" Best score: {best_score}%")
    print(f" Threshold: {MATCH_THRESHOLD}%")

    if best_score < MATCH_THRESHOLD:
        print(" ❌ REJECTED: Match score below threshold")
        return False
    print(" ✅ PASSED: Name match validated")
    return True


def debug_search(name=DEFAULT_NAME, city=DEFAULT_CITY, target_qid=DEFAULT_TARGET_QID):
    """Search with detailed logging.

    Args:
        name: Institution name to fuzzy-match against the item label.
        city: City to fuzzy-match against the item's location label; an
            empty string skips the geographic check.
        target_qid: QID of the Wikidata item to trace through the steps.

    Returns:
        None. All findings are printed; the function stops at the first
        rejected step.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    inst_type = INSTITUTION_TYPE
    requires_city_match = True  # UNIVERSITY requires city match

    bindings = _fetch_bindings()
    print(f"Total results: {len(bindings)}")

    # Look specifically for the target item
    binding = _find_binding(bindings, target_qid)
    if binding is None:
        print(f"\n❌ {target_qid} NOT in the results")
        return

    print(f"\n✅ {target_qid} IS in the results!")
    print(f"\nData for {target_qid}:")
    for key in ("itemLabel", "typeLabel", "locationLabel"):
        print(f" {key}: {_binding_value(binding, key, 'N/A')}")

    # Now simulate the matching logic
    rule = "=" * 60
    print(f"\n{rule}")
    print("SIMULATING MATCHING LOGIC")
    print(rule)

    print("\nSearch parameters:")
    print(f" Name: '{name}'")
    print(f" Institution Type: {inst_type}")
    print(f" City: '{city}'")

    if not _validate_entity_type(binding):
        return
    if not _validate_location(binding, city.lower(), requires_city_match):
        return
    if _validate_name(binding, name.lower()):
        print(f"\n🎉 {target_qid} SHOULD BE MATCHED!")


if __name__ == '__main__':
    debug_search()