glam/scripts/debug_matching_logic.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

184 lines
6 KiB
Python

#!/usr/bin/env python3
"""
Debug version with verbose logging to see exactly what's happening.
"""
import requests
import time
from rapidfuzz import fuzz
# Public Wikidata SPARQL endpoint queried by this script.
SPARQL_ENDPOINT: str = "https://query.wikidata.org/sparql"
# Identifies this script in the HTTP User-Agent header of every request.
USER_AGENT: str = "GLAM-Tunisia-Debug/1.0"
# Valid types for UNIVERSITY
# Wikidata "instance of" (P31) QIDs that the matcher accepts as a university.
UNIVERSITY_TYPES: set[str] = {
    'Q3918', # University
    'Q875538', # Public university
    'Q2467461', # Private university
    'Q15936437', # Research university
    'Q38723', # Higher education institution
    'Q3354859', # Technical university
}
def _fetch_university_bindings():
    """Run the fixed SPARQL query for universities located in Tunisia.

    Returns:
        list[dict]: the raw ``results.bindings`` entries from the Wikidata
        JSON response (empty list when nothing matched).

    Raises:
        requests.HTTPError: when the endpoint answers with an error status.
    """
    # NOTE: the query restricts ?type to three university QIDs, a subset of
    # UNIVERSITY_TYPES — kept as-is because this script debugs that exact query.
    query = """
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
?viaf ?isil ?website ?coords ?inception ?itemAltLabel
?location ?locationLabel
WHERE {
# Must be in Tunisia
?item wdt:P17 wd:Q948 .
# Must have an instance-of type
?item wdt:P31 ?type .
# Limit to universities
VALUES ?type {
wd:Q3918 # University
wd:Q875538 # Public university
wd:Q38723 # Higher education institution
}
# Add location (P131: located in administrative territorial entity)
OPTIONAL { ?item wdt:P131 ?location . }
OPTIONAL { ?item wdt:P214 ?viaf . }
OPTIONAL { ?item wdt:P791 ?isil . }
OPTIONAL { ?item wdt:P856 ?website . }
OPTIONAL { ?item wdt:P625 ?coords . }
OPTIONAL { ?item wdt:P571 ?inception . }
OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }
SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
}
LIMIT 200
"""
    print("Executing SPARQL query...")
    time.sleep(1.5)  # throttle: be polite to the shared public endpoint
    response = requests.get(
        SPARQL_ENDPOINT,
        params={'query': query, 'format': 'json'},
        headers={'User-Agent': USER_AGENT},
        timeout=60,
    )
    response.raise_for_status()
    return response.json().get("results", {}).get("bindings", [])


def _qid_of(binding):
    """Extract the bare QID (e.g. 'Q3551673') from a binding's ?item URI."""
    return binding.get("item", {}).get("value", "").split("/")[-1]


def _find_binding(bindings, qid):
    """Return the first SPARQL binding whose ?item QID equals *qid*, else None."""
    for binding in bindings:
        if _qid_of(binding) == qid:
            return binding
    return None


def _simulate_matching(binding, name, inst_type, city):
    """Replay the matcher's three validation steps against one candidate binding.

    Steps, each logged verbosely: (1) entity type must be a known university
    QID, (2) the P131 location label must fuzzily match the expected city,
    (3) the search name must fuzzily match the Wikidata label.

    Args:
        binding: one SPARQL result binding (dict of variable -> value dict).
        name: institution name to match against the Wikidata label.
        inst_type: institution category label (printed only).
        city: expected city name; empty string skips the geographic check.

    Returns:
        bool: True when every step passed, False on the first rejection.
    """
    threshold = 70  # minimum rapidfuzz score (percent) for city and name checks
    print(f"\n{'='*60}")
    print("SIMULATING MATCHING LOGIC")
    print(f"{'='*60}")
    print("\nSearch parameters:")
    print(f" Name: '{name}'")
    print(f" Institution Type: {inst_type}")
    print(f" City: '{city}'")
    name_lower = name.lower()
    city_lower = city.lower()
    requires_city_match = True  # UNIVERSITY requires city match

    # --- Step 1: the candidate's P31 type must be a known university type.
    print("\n--- STEP 1: Entity Type Validation ---")
    entity_type_uri = binding.get("type", {}).get("value", "")
    entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None
    print(f" Entity type QID: {entity_type_qid}")
    print(f" Valid types: {UNIVERSITY_TYPES}")
    print(f" Type matches: {entity_type_qid in UNIVERSITY_TYPES}")
    if entity_type_qid not in UNIVERSITY_TYPES:
        print(" ❌ REJECTED: Entity type mismatch")
        return False
    print(" ✅ PASSED: Entity type validated")

    # --- Step 2: location label must fuzzily match the expected city.
    print("\n--- STEP 2: Geographic Validation ---")
    print(f" Requires city match: {requires_city_match}")
    if city_lower and requires_city_match:
        location_label = binding.get("locationLabel", {}).get("value", "").lower() if binding.get("locationLabel") else ""
        print(f" Location label from Wikidata: '{location_label}'")
        print(f" Expected city: '{city_lower}'")
        if not location_label:
            print(" ❌ REJECTED: No location data")
            return False
        print(" ✅ Has location data")
        location_match = fuzz.ratio(city_lower, location_label)
        print(f" Location fuzzy match: {location_match}%")
        print(f" Threshold: {threshold}%")
        if location_match < threshold:
            print(" ❌ REJECTED: Location match below threshold")
            return False
        print(" ✅ PASSED: Location validated")

    # --- Step 3: fuzzy-match the search name against the Wikidata label,
    # taking the best of three rapidfuzz scorers.
    print("\n--- STEP 3: Name Fuzzy Matching ---")
    item_label = binding.get("itemLabel", {}).get("value", "").lower()
    print(f" Search name: '{name_lower}'")
    print(f" Wikidata label: '{item_label}'")
    label_score = fuzz.ratio(name_lower, item_label)
    partial_score = fuzz.partial_ratio(name_lower, item_label)
    token_score = fuzz.token_set_ratio(name_lower, item_label)
    print("\n Fuzzy match scores:")
    print(f" Label score: {label_score}%")
    print(f" Partial score: {partial_score}%")
    print(f" Token set score: {token_score}%")
    best_score = max(label_score, partial_score, token_score)
    print(f" Best score: {best_score}%")
    print(f" Threshold: {threshold}%")
    if best_score < threshold:
        print(" ❌ REJECTED: Match score below threshold")
        return False
    print(" ✅ PASSED: Name match validated")
    return True


def debug_search(name="University of Sousse", inst_type="UNIVERSITY",
                 city="Sousse", target_qid="Q3551673"):
    """Search Wikidata for Tunisian universities and trace the matching of one QID.

    Fetches all candidates via SPARQL, checks whether *target_qid* is among
    them, then replays the matcher's validation pipeline (entity type ->
    geography -> fuzzy name) with verbose logging so each accept/reject
    decision is visible.

    Args:
        name: institution name to fuzzy-match against Wikidata labels.
        inst_type: institution category label (informational only; the
            SPARQL query itself is fixed to university types).
        city: expected city; universities require a fuzzy city match.
        target_qid: Wikidata QID whose matching trace to inspect.
    """
    bindings = _fetch_university_bindings()
    print(f"Total results: {len(bindings)}")

    binding = _find_binding(bindings, target_qid)
    if binding is None:
        print(f"\n❌ {target_qid} NOT in the results")
        return

    print(f"\n✅ {target_qid} IS in the results!")
    print(f"\nData for {target_qid}:")
    print(f" itemLabel: {binding.get('itemLabel', {}).get('value', 'N/A')}")
    print(f" typeLabel: {binding.get('typeLabel', {}).get('value', 'N/A')}")
    print(f" locationLabel: {binding.get('locationLabel', {}).get('value', 'N/A')}")

    # Fix vs. original flow: only announce success when EVERY step passed;
    # previously a Step-3 rejection still printed the match banner.
    if _simulate_matching(binding, name, inst_type, city):
        print(f"\n🎉 {target_qid} SHOULD BE MATCHED!")
# Script entry point: run the verbose matching debug trace.
if __name__ == '__main__':
    debug_search()