#!/usr/bin/env python3
"""
Test the production script's search function directly.
"""

import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

import requests
import yaml
from rapidfuzz import fuzz

# Make the script's own directory importable (the production script is
# expected to live next to this file). The search function is nevertheless
# copied below to avoid import issues.
sys.path.insert(0, str(Path(__file__).parent))

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Test/1.0"

UNIVERSITY_TYPES = {
    'Q3918',      # University
    'Q875538',    # Public university
    'Q2467461',   # Private university
    'Q15936437',  # Research university
    'Q38723',     # Higher education institution
    'Q3354859',   # Technical university
}

# Location-specific institution types require stricter geographic matching.
CITY_MATCH_TYPES = {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}

# Minimum fuzzy score for a location label to count as the expected city.
MIN_LOCATION_SCORE = 70

# Minimum fuzzy score for a label to be accepted as a name match.
MIN_NAME_SCORE = 70

# Entity traced verbosely while scoring.
# NOTE(review): presumably the entity main() is expected to find - confirm.
DEBUG_QID = "Q3551673"

# Pause before every SPARQL request (rate limiting), in seconds.
REQUEST_DELAY_SECONDS = 1.5


def get_valid_types_for_institution(inst_type: str) -> Set[str]:
    """Get set of valid Wikidata entity types for institution type.

    Returns ``UNIVERSITY_TYPES`` for ``'UNIVERSITY'`` and an empty set for
    every other value.
    """
    if inst_type == 'UNIVERSITY':
        return UNIVERSITY_TYPES
    return set()


def _binding_value(binding: Dict[str, Any], key: str) -> str:
    """Return ``binding[key]["value"]``, or ``""`` when absent or empty."""
    return (binding.get(key) or {}).get("value", "")


def _qid_from_uri(uri: str) -> str:
    """Return the last path segment of an entity URI (``""`` for ``""``)."""
    return uri.rsplit("/", 1)[-1]


def _build_query(valid_types: Set[str]) -> str:
    """Build the SPARQL query restricted to the given instance-of QIDs."""
    # Sorted so the generated query text is identical from run to run
    # (set iteration order is not stable across interpreter runs).
    type_values = " ".join(f"wd:{qid}" for qid in sorted(valid_types))

    return f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
           ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?location ?locationLabel
    WHERE {{
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type matching our institution type
      ?item wdt:P31 ?type .

      # Filter to relevant types for this institution (server-side filtering)
      VALUES ?type {{ {type_values} }}

      # Add location (P131: located in administrative territorial entity)
      OPTIONAL {{ ?item wdt:P131 ?location . }}

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 200
    """


def _fetch_bindings(query: str, timeout: int) -> Optional[List[Dict[str, Any]]]:
    """Run the query and return its result bindings, or None on failure.

    Only transport/HTTP errors and an undecodable response body are handled
    here; they are printed and reported as None.
    """
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json',
    }

    try:
        time.sleep(REQUEST_DELAY_SECONDS)  # Rate limiting
        response = requests.get(
            SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout
        )
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Error: {e}")
        return None

    return results.get("results", {}).get("bindings", [])


def _select_best_binding(
    bindings: List[Dict[str, Any]],
    name: str,
    inst_type: str,
    city: Optional[str],
    valid_types: Set[str],
) -> Tuple[Optional[Dict[str, Any]], float]:
    """Return the best-scoring binding that passes validation, and its score.

    A binding is scored only if its ``type`` QID is in ``valid_types`` and,
    when a city is given for a type in ``CITY_MATCH_TYPES``, its
    ``locationLabel`` matches the city with a fuzzy ratio of at least
    ``MIN_LOCATION_SCORE``. Returns ``(None, 0)`` when nothing qualifies.
    """
    name_lower = name.lower()
    city_lower = city.lower() if city else None

    requires_city_match = inst_type in CITY_MATCH_TYPES
    print(f"Requires city match: {requires_city_match}")
    print(f"City filter: {city_lower}")

    best_match = None
    best_score = 0
    passed_validation = 0

    for binding in bindings:
        # CRITICAL: validate entity type FIRST; skip anything that is not
        # one of the types requested for this institution.
        if _qid_from_uri(_binding_value(binding, "type")) not in valid_types:
            continue

        # GEOGRAPHIC VALIDATION for location-specific institutions.
        if city_lower and requires_city_match:
            location_label = _binding_value(binding, "locationLabel").lower()

            # Must have location data
            if not location_label:
                continue

            # Fuzzy comparison tolerates spelling variations of the city.
            if fuzz.ratio(city_lower, location_label) < MIN_LOCATION_SCORE:
                continue

        # Passed validation!
        passed_validation += 1

        item_label = _binding_value(binding, "itemLabel")
        qid = _qid_from_uri(_binding_value(binding, "item"))
        item_label_lower = item_label.lower()

        # Best of the three fuzzy match strategies
        score = max(
            fuzz.ratio(name_lower, item_label_lower),
            fuzz.partial_ratio(name_lower, item_label_lower),
            fuzz.token_set_ratio(name_lower, item_label_lower),
        )

        if qid == DEBUG_QID:
            print(f"✅ Found {DEBUG_QID} in loop:")
            print(f" Label: {item_label}")
            print(f" Score: {score}%")
            print(f" Best score so far: {best_score}%")

        # Strictly greater: on a tie the earlier binding is kept.
        if score > best_score:
            best_score = score
            best_match = binding
            if qid == DEBUG_QID:
                print(f" ✅ Updated best_match to {DEBUG_QID}!")

    print(f"Passed validation: {passed_validation}")
    print(f"Best score: {best_score}%")
    return best_match, best_score


def search_wikidata_with_validation(
    name: str,
    inst_type: str,
    city: Optional[str] = None,
    timeout: int = 60
) -> Optional[Dict[str, Any]]:
    """Search Wikidata for an institution, validating type and location.

    Local copy of the production search routine, split into helpers. The
    matching rules (type filter, city filter, scoring, thresholds) are the
    ones from the copied function.

    Args:
        name: Institution name to match against result labels.
        inst_type: Institution type; only 'UNIVERSITY' has known entity
            types in this file, anything else yields None.
        city: Optional expected city, compared to each result's location
            label for the types in ``CITY_MATCH_TYPES``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A dict with keys ``qid``, ``name``, ``description``, ``entity_type``
        and ``match_score``, or None when the type is unknown, the request
        fails, or no result reaches ``MIN_NAME_SCORE``.

    Request and JSON-decoding failures are printed and reported as None.
    Other exceptions raised while matching are not swallowed, so that bugs
    surface with a traceback.
    """
    # Get valid Wikidata entity types for this institution type
    valid_types = get_valid_types_for_institution(inst_type)
    if not valid_types:
        print(f" ⚠️ Unknown institution type: {inst_type}")
        return None

    bindings = _fetch_bindings(_build_query(valid_types), timeout)
    if bindings is None:
        return None

    print(f"Got {len(bindings)} results from SPARQL")
    if not bindings:
        return None

    # Fuzzy match against results WITH entity type AND geographic validation
    best_match, best_score = _select_best_binding(
        bindings, name, inst_type, city, valid_types
    )

    # Require minimum match score
    if best_score < MIN_NAME_SCORE:
        print(f"❌ Best score {best_score}% < {MIN_NAME_SCORE}%, returning None")
        return None

    # Defensive: a passing score implies a match was recorded.
    if not best_match:
        print("❌ No best_match found, returning None")
        return None

    qid = _qid_from_uri(_binding_value(best_match, "item"))
    if not qid or not qid.startswith("Q"):
        print(f"❌ Invalid QID: {qid or None}")
        return None

    print(f"✅ Returning match: {qid}")

    return {
        "qid": qid,
        "name": _binding_value(best_match, "itemLabel"),
        "description": _binding_value(best_match, "itemDescription"),
        "entity_type": _binding_value(best_match, "typeLabel"),
        "match_score": best_score,
    }


def main():
    """Run one live search for the University of Sousse and print the outcome."""
    print("Testing Production Function")
    print("=" * 60)

    # Test with University of Sousse
    result = search_wikidata_with_validation(
        "University of Sousse",
        "UNIVERSITY",
        "Sousse",
        timeout=60
    )

    if result:
        print("\n✅ SUCCESS!")
        print(f" QID: {result['qid']}")
        print(f" Name: {result['name']}")
        print(f" Score: {result['match_score']}%")
    else:
        print("\n❌ FAILURE! Function returned None")


if __name__ == '__main__':
    main()