glam/scripts/test_production_function.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

226 lines
7.7 KiB
Python

#!/usr/bin/env python3
"""
Test the production script's search function directly.
"""
import yaml
from pathlib import Path
# Import the function from the production script
import sys
sys.path.insert(0, str(Path(__file__).parent))
# We'll copy the function here to avoid import issues
import requests
import time
from typing import Optional, Dict, Any, Set
from rapidfuzz import fuzz
# Public Wikidata Query Service SPARQL endpoint used for all lookups.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# Identifying User-Agent string, per Wikimedia API etiquette.
USER_AGENT = "GLAM-Tunisia-Test/1.0"
# Wikidata QIDs accepted as valid P31 ("instance of") values when the
# institution type is UNIVERSITY; used for server-side and client-side filtering.
UNIVERSITY_TYPES = {
    'Q3918',      # University
    'Q875538',    # Public university
    'Q2467461',   # Private university
    'Q15936437',  # Research university
    'Q38723',     # Higher education institution
    'Q3354859',   # Technical university
}
def get_valid_types_for_institution(inst_type: str) -> Set[str]:
    """Map an institution category to its accepted Wikidata P31 QIDs.

    Only 'UNIVERSITY' is currently recognised; any other category yields
    an empty set (callers treat that as "unknown type").
    """
    type_map = {'UNIVERSITY': UNIVERSITY_TYPES}
    return type_map.get(inst_type, set())
def search_wikidata_with_validation(
    name: str,
    inst_type: str,
    city: Optional[str] = None,
    timeout: int = 60
) -> Optional[Dict[str, Any]]:
    """Search Wikidata for a Tunisian institution and return the best validated match.

    Runs a SPARQL query restricted to entities located in Tunisia (wd:Q948)
    whose P31 ("instance of") matches the expected institution type, then
    applies three client-side checks to each result row:

      1. Entity-type validation: P31 must be in the allowed QID set.
      2. Geographic validation (only for location-specific institution
         types, when *city* is given): the P131 location label must
         fuzzy-match the expected city at >= 70%.
      3. Fuzzy name matching: best of ratio / partial_ratio /
         token_set_ratio against the item label; a minimum 70% score is
         required for a match to be returned.

    Args:
        name: Institution name to match against Wikidata labels.
        inst_type: Institution category key (e.g. 'UNIVERSITY').
        city: Expected city name; enables geographic validation for
            location-specific institution types.
        timeout: HTTP timeout in seconds for the SPARQL request.

    Returns:
        Dict with keys qid / name / description / entity_type / match_score,
        or None when the type is unknown, the query fails, or no result
        passes validation with a sufficient score.

    Note:
        Prints verbose diagnostics (including extra traces for Q3551673,
        the expected QID in this test) — this is a debugging copy of the
        production function.
    """
    # Resolve the allowed P31 QIDs for this institution category.
    valid_types = get_valid_types_for_institution(inst_type)
    if not valid_types:
        print(f" ⚠️ Unknown institution type: {inst_type}")
        return None

    # Server-side type filter: VALUES clause listing the allowed QIDs.
    type_values = " ".join(f"wd:{qid}" for qid in valid_types)

    # Build SPARQL query (whitespace inside the query is not significant).
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
                    ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
                    ?location ?locationLabel
    WHERE {{
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type matching our institution type
      ?item wdt:P31 ?type .

      # Filter to relevant types for this institution (server-side filtering)
      VALUES ?type {{ {type_values} }}

      # Add location (P131: located in administrative territorial entity)
      OPTIONAL {{ ?item wdt:P131 ?location . }}

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 200
    """

    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }

    try:
        time.sleep(1.5)  # Rate limiting: be polite to the public WDQS endpoint
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        results = response.json()

        bindings = results.get("results", {}).get("bindings", [])
        print(f"Got {len(bindings)} results from SPARQL")
        if not bindings:
            return None

        # Fuzzy match against results WITH entity type AND geographic validation
        best_match = None
        best_score = 0
        name_lower = name.lower()
        city_lower = city.lower() if city else None

        # Location-specific institution types require stricter geographic matching
        requires_city_match = inst_type in {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}
        print(f"Requires city match: {requires_city_match}")
        print(f"City filter: {city_lower}")

        passed_validation = 0
        for binding in bindings:
            # CRITICAL: Validate entity type FIRST
            entity_type_uri = binding.get("type", {}).get("value", "")
            entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None

            # Skip if entity type doesn't match our institution type
            if entity_type_qid not in valid_types:
                continue

            # GEOGRAPHIC VALIDATION: Check location match for location-specific institutions
            if city_lower and requires_city_match:
                # .get chain already yields "" when locationLabel is absent,
                # so no extra presence check is needed.
                location_label = binding.get("locationLabel", {}).get("value", "").lower()

                # Must have location data
                if not location_label:
                    continue

                # Location must match expected city (fuzzy match for spelling variations)
                location_match = fuzz.ratio(city_lower, location_label)
                if location_match < 70:  # Location mismatch - skip this result
                    continue

            # Passed validation!
            passed_validation += 1
            item_label = binding.get("itemLabel", {}).get("value", "")
            qid = binding.get("item", {}).get("value", "").split("/")[-1]

            # Now do fuzzy matching on validated entities only
            item_label_lower = item_label.lower()

            # Calculate match score using multiple strategies
            label_score = fuzz.ratio(name_lower, item_label_lower)
            partial_score = fuzz.partial_ratio(name_lower, item_label_lower)
            token_score = fuzz.token_set_ratio(name_lower, item_label_lower)

            # Best of the three fuzzy match strategies
            score = max(label_score, partial_score, token_score)

            # Debug trace for the QID this test expects to win
            if qid == "Q3551673":
                print(f"✅ Found Q3551673 in loop:")
                print(f" Label: {item_label}")
                print(f" Score: {score}%")
                print(f" Best score so far: {best_score}%")

            if score > best_score:
                best_score = score
                best_match = binding
                if qid == "Q3551673":
                    print(f" ✅ Updated best_match to Q3551673!")

        print(f"Passed validation: {passed_validation}")
        print(f"Best score: {best_score}%")

        # Require minimum 70% match
        if best_score < 70:
            print(f"❌ Best score {best_score}% < 70%, returning None")
            return None

        # Extract data from best match
        if not best_match:
            print(f"❌ No best_match found, returning None")
            return None

        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        if not qid or not qid.startswith("Q"):
            print(f"❌ Invalid QID: {qid}")
            return None

        print(f"✅ Returning match: {qid}")
        result = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "entity_type": best_match.get("typeLabel", {}).get("value", ""),
            "match_score": best_score
        }
        return result

    except (requests.RequestException, ValueError, KeyError) as e:
        # Narrowed from a bare `except Exception` so that programming errors
        # (AttributeError, TypeError, ...) surface during testing instead of
        # being silently reported as a failed search. RequestException covers
        # network/HTTP errors; ValueError covers JSON decoding failures.
        print(f"❌ Error: {e}")
        return None
def main():
    """Run the copied production search function against one known case."""
    print("Testing Production Function")
    print("=" * 60)

    # University of Sousse should resolve to a Wikidata QID.
    match = search_wikidata_with_validation(
        "University of Sousse",
        "UNIVERSITY",
        "Sousse",
        timeout=60
    )

    # Guard clause: bail out early on failure instead of an if/else pyramid.
    if not match:
        print("\n❌ FAILURE! Function returned None")
        return

    print("\n✅ SUCCESS!")
    print(f" QID: {match['qid']}")
    print(f" Name: {match['name']}")
    print(f" Score: {match['match_score']}%")


if __name__ == '__main__':
    main()