# glam/scripts/test_library_query.py

#!/usr/bin/env python3
"""
Test the actual SPARQL query used in the enrichment script for LIBRARY type.
"""

import requests
import time
from rapidfuzz import fuzz

# Endpoint the query is sent to, and the User-Agent header sent with it.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Debug/1.0"

# LIBRARY type mapping from the script: the instance-of (P31) classes that
# count as a library. Kept as a set because it is used for membership tests
# and for its size later in this script.
valid_types = {
    'Q7075',       # Library
    'Q2668072',    # National library
    'Q570116',     # Public library
    'Q5193377',    # University library
    'Q28564',      # Academic library
    'Q1479716',    # Regional library
    'Q1622062',    # Digital library
    'Q17297735',   # Diocesan library
    'Q105338594',  # Bibliothèque diocésaine (specific diocesan library subtype)
}

# Space-separated "wd:Q…" terms for the VALUES clause below.
# Iterating a set of strings directly yields a different order on each
# interpreter run (string hashing is randomized), which would make the query
# text differ between runs. Sorting keeps the generated query reproducible;
# the order of terms inside VALUES does not change which rows match.
type_values = " ".join(f"wd:{qid}" for qid in sorted(valid_types))

# The SPARQL query under test. Doubled braces are literal braces in the
# f-string; only {type_values} is interpolated.
query = f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
                ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
                ?location ?locationLabel
WHERE {{
  # Must be in Tunisia
  ?item wdt:P17 wd:Q948 .

  # Must have an instance-of type matching our institution type
  ?item wdt:P31 ?type .

  # Filter to relevant types for this institution (server-side filtering)
  VALUES ?type {{ {type_values} }}

  # Add location (P131: located in administrative territorial entity)
  OPTIONAL {{ ?item wdt:P131 ?location . }}

  OPTIONAL {{ ?item wdt:P214 ?viaf . }}
  OPTIONAL {{ ?item wdt:P791 ?isil . }}
  OPTIONAL {{ ?item wdt:P856 ?website . }}
  OPTIONAL {{ ?item wdt:P625 ?coords . }}
  OPTIONAL {{ ?item wdt:P571 ?inception . }}
  OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}

  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
}}
LIMIT 200
"""

# Announce the run and echo the type mapping that was interpolated into the
# query, so the log shows what was actually asked for.
print("Testing SPARQL query for LIBRARY institutions in Tunisia")
print("=" * 60)
print(f"Valid types in mapping: {len(valid_types)}")
print(f"Includes Q105338594: {'Q105338594' in valid_types}")
print("\nExecuting query...")

# Pause briefly before issuing the request.
time.sleep(1.5)

# Send the query as a GET request asking for JSON; any HTTP error status
# aborts the script via raise_for_status().
response = requests.get(
    SPARQL_ENDPOINT,
    params={'query': query, 'format': 'json'},
    headers={'User-Agent': USER_AGENT},
    timeout=60,
)
response.raise_for_status()

# Result rows live under results -> bindings; fall back to an empty list
# when either key is absent.
payload = response.json()
bindings = payload.get("results", {}).get("bindings", [])

print(f"Found {len(bindings)} LIBRARY institutions in Tunisia\n")

# Look for the Diocesan Library (Q28149782) among the result rows and, for
# each row that carries it, report how its label scores against the name
# we are searching for.
search_name = "Diocesan Library of Tunis"
name_lower = search_name.lower()

print(f"Searching for: '{search_name}'")
print("-" * 60)

found_diocesan = False
for row in bindings:
    entity_uri = row.get("item", {}).get("value", "")
    # The QID is the last path segment of the entity URI.
    if entity_uri.rsplit("/", 1)[-1] != "Q28149782":
        continue

    found_diocesan = True
    label = row.get("itemLabel", {}).get("value", "")
    class_uri = row.get("type", {}).get("value", "")
    class_qid = class_uri.rsplit("/", 1)[-1]
    class_label = row.get("typeLabel", {}).get("value", "")
    place = row.get('locationLabel', {}).get('value', 'N/A')

    print("✅ FOUND Q28149782 in query results!")
    print(f"   Label: {label}")
    print(f"   Type: {class_qid} ({class_label})")
    print(f"   Location: {place}")

    # Test fuzzy matching: three rapidfuzz scorers on the lower-cased
    # strings; the highest of the three is compared with the threshold.
    candidate = label.lower()
    plain = fuzz.ratio(name_lower, candidate)
    partial = fuzz.partial_ratio(name_lower, candidate)
    token_set = fuzz.token_set_ratio(name_lower, candidate)
    best = max(plain, partial, token_set)

    print("\n   Fuzzy match scores:")
    print(f"     Ratio: {plain}%")
    print(f"     Partial: {partial}%")
    print(f"     Token set: {token_set}%")
    print(f"     BEST: {best}%")

    verdict = (
        "   ✅ Would match (>= 65% threshold)"
        if best >= 65
        else "   ❌ Below 65% threshold"
    )
    print(verdict)

# No row carried the target QID: list the candidate explanations.
if not found_diocesan:
    print("❌ Q28149782 NOT in query results")
    print("\nPossible reasons:")
    print("  1. Type filtering excluded it (but Q105338594 is in VALUES)")
    print("  2. Country filter issue (but we confirmed P17 = Q948 Tunisia)")
    print("  3. SPARQL service issue")

# Dump every result row as "<item QID>: <label> [<type QID>]", one line
# per row, in the order the endpoint returned them.
print("\n" + "=" * 60)
print("All libraries found:")
for row in bindings:
    entity_uri = row.get("item", {}).get("value", "")
    class_uri = row.get("type", {}).get("value", "")
    label = row.get("itemLabel", {}).get("value", "")
    # QIDs are the final path segment of the respective URIs.
    entity_qid = entity_uri.rsplit("/", 1)[-1]
    class_qid = class_uri.rsplit("/", 1)[-1]
    print(f"  {entity_qid}: {label} [{class_qid}]")