#!/usr/bin/env python3
"""
Test the actual SPARQL query used in the enrichment script for LIBRARY type.

Runs the query against the Wikidata Query Service, reports whether the
target item (Q28149782) is present in the results, shows how its label
scores against the search name with rapidfuzz, and lists the libraries
that were returned.
"""

import time

import requests
from rapidfuzz import fuzz

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Debug/1.0"

# Item this script is looking for, and the name it is matched against.
TARGET_QID = "Q28149782"
SEARCH_NAME = "Diocesan Library of Tunis"

# Minimum best fuzzy score (percent) reported as "would match".
MATCH_THRESHOLD = 65

# Row cap sent to the endpoint. It limits result ROWS, not items: one item
# yields one row per type / alt label / location combination.
RESULT_LIMIT = 200

# LIBRARY type mapping from the script
valid_types = {
    'Q7075',       # Library
    'Q2668072',    # National library
    'Q570116',     # Public library
    'Q5193377',    # University library
    'Q28564',      # Academic library
    'Q1479716',    # Regional library
    'Q1622062',    # Digital library
    'Q17297735',   # Diocesan library
    'Q105338594',  # Bibliothèque diocésaine (specific diocesan library subtype)
}

# Sorted so the generated query text is the same on every run; plain set
# iteration order for strings varies between interpreter invocations.
type_values = " ".join(f"wd:{qid}" for qid in sorted(valid_types))

query = f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?location ?locationLabel
WHERE {{
  # Must be in Tunisia
  ?item wdt:P17 wd:Q948 .

  # Must have an instance-of type matching our institution type
  ?item wdt:P31 ?type .

  # Filter to relevant types for this institution (server-side filtering)
  VALUES ?type {{ {type_values} }}

  # Add location (P131: located in administrative territorial entity)
  OPTIONAL {{ ?item wdt:P131 ?location . }}

  OPTIONAL {{ ?item wdt:P214 ?viaf . }}
  OPTIONAL {{ ?item wdt:P791 ?isil . }}
  OPTIONAL {{ ?item wdt:P856 ?website . }}
  OPTIONAL {{ ?item wdt:P625 ?coords . }}
  OPTIONAL {{ ?item wdt:P571 ?inception . }}
  OPTIONAL {{
    ?item skos:altLabel ?itemAltLabel .
    FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en"))
  }}

  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "fr,ar,en" .
  }}
}}
LIMIT {RESULT_LIMIT}
"""


def _value(binding, name, default=""):
    """Return the ``value`` of variable *name* in a result row, or *default*."""
    return binding.get(name, {}).get("value", default)


def _entity_id(uri):
    """Return the last path segment of an entity URI ('' for an empty URI)."""
    return uri.split("/")[-1] if uri else ""


def fetch_bindings():
    """Execute ``query`` against the endpoint and return its result rows.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }
    # Pause before the request, as the original script did.
    time.sleep(1.5)
    response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
    response.raise_for_status()
    results = response.json()
    return results.get("results", {}).get("bindings", [])


def _report_target(target_rows):
    """Print label, types, location and fuzzy scores for the target item.

    *target_rows* holds every result row for the target; the item is
    reported once even when the query returned several rows for it.
    """
    item_label = _value(target_rows[0], "itemLabel")

    print(f"✅ FOUND {TARGET_QID} in query results!")
    print(f" Label: {item_label}")

    # dict.fromkeys drops repeated values while keeping first-seen order.
    types = dict.fromkeys(
        (_entity_id(_value(row, "type")), _value(row, "typeLabel"))
        for row in target_rows
    )
    for type_qid, type_label in types:
        print(f" Type: {type_qid} ({type_label})")

    locations = dict.fromkeys(
        _value(row, "locationLabel", "N/A") for row in target_rows
    )
    print(f" Location: {', '.join(locations)}")

    # Test fuzzy matching
    name_lower = SEARCH_NAME.lower()
    label_lower = item_label.lower()
    label_score = fuzz.ratio(name_lower, label_lower)
    partial_score = fuzz.partial_ratio(name_lower, label_lower)
    token_score = fuzz.token_set_ratio(name_lower, label_lower)
    best_score = max(label_score, partial_score, token_score)

    print("\n Fuzzy match scores:")
    print(f" Ratio: {label_score}%")
    print(f" Partial: {partial_score}%")
    print(f" Token set: {token_score}%")
    print(f" BEST: {best_score}%")

    if best_score >= MATCH_THRESHOLD:
        print(f" ✅ Would match (>= {MATCH_THRESHOLD}% threshold)")
    else:
        print(f" ❌ Below {MATCH_THRESHOLD}% threshold")


def main():
    """Run the query and print the diagnostic report."""
    print("Testing SPARQL query for LIBRARY institutions in Tunisia")
    print("=" * 60)
    print(f"Valid types in mapping: {len(valid_types)}")
    print(f"Includes Q105338594: {'Q105338594' in valid_types}")
    print("\nExecuting query...")

    bindings = fetch_bindings()
    rows = [(_entity_id(_value(row, "item")), row) for row in bindings]

    # Count items, not rows: each item appears once per combination of its
    # optional values, so the raw row count overstates the institutions.
    distinct_items = {qid for qid, _ in rows}
    print(
        f"Found {len(distinct_items)} LIBRARY institutions in Tunisia "
        f"({len(bindings)} result rows)\n"
    )

    truncated = len(bindings) >= RESULT_LIMIT
    if truncated:
        print(
            f"⚠️ Result rows reached LIMIT {RESULT_LIMIT}; "
            "some institutions may be missing\n"
        )

    # Look for Diocesan Library
    print(f"Searching for: '{SEARCH_NAME}'")
    print("-" * 60)

    target_rows = [row for qid, row in rows if qid == TARGET_QID]
    if target_rows:
        _report_target(target_rows)
    else:
        print(f"❌ {TARGET_QID} NOT in query results")
        print("\nPossible reasons:")
        print(" 1. Type filtering excluded it (but Q105338594 is in VALUES)")
        print(" 2. Country filter issue (but we confirmed P17 = Q948 Tunisia)")
        print(" 3. SPARQL service issue")
        if truncated:
            print(f" 4. Result rows were truncated by LIMIT {RESULT_LIMIT}")

    # NOTE(review): the source's indentation was lost, so it is unclear
    # whether this listing was meant only for the not-found case. It is
    # printed unconditionally here — confirm.
    print("\n" + "=" * 60)
    print("All libraries found:")
    listed = dict.fromkeys(
        (qid, _value(row, "itemLabel"), _entity_id(_value(row, "type")))
        for qid, row in rows
    )
    for qid, item_label, type_qid in listed:
        print(f" {qid}: {item_label} [{type_qid}]")


if __name__ == "__main__":
    main()