#!/usr/bin/env python3
"""
Debug version of Tunisia enrichment script with extensive logging.
Focus on finding why University of Sousse (Q3551673) isn't being matched.
"""

import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import requests
import yaml
from rapidfuzz import fuzz

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Debug/1.0"

# Valid types for UNIVERSITY
UNIVERSITY_TYPES = {
    'Q3918',      # University
    'Q875538',    # Public university
    'Q2467461',   # Private university
    'Q15936437',  # Research university
    'Q38723',     # Higher education institution
    'Q3354859',   # Technical university
}

# Institution types for which a candidate must also match on city.
CITY_MATCH_REQUIRED_TYPES = {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}

# The institution / Wikidata item whose path through the pipeline is traced.
TARGET_NAME = 'University of Sousse'
TARGET_QID = "Q3551673"

# Minimum rapidfuzz scores (0-100) for a candidate to be accepted.
NAME_MATCH_THRESHOLD = 70
LOCATION_MATCH_THRESHOLD = 70

# Pause before each SPARQL request (seconds).
REQUEST_DELAY_SECONDS = 1.5


def _value(binding: dict, key: str) -> str:
    """Return the ``value`` of variable *key* in a SPARQL JSON binding, or ''."""
    return binding.get(key, {}).get("value", "")


def _qid_from_uri(uri: str) -> str:
    """Return the last ``/``-separated segment of an entity URI (the QID)."""
    return uri.split("/")[-1]


def _build_query() -> str:
    """Build the SPARQL query listing Tunisian items of the university types."""
    # Sorted so the query text is identical from run to run (set iteration
    # order of strings varies between interpreter processes).
    type_values = " ".join(f"wd:{qid}" for qid in sorted(UNIVERSITY_TYPES))
    return f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
           ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
           ?location ?locationLabel
    WHERE {{
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type matching our institution type
      ?item wdt:P31 ?type .

      # Filter to relevant types for this institution (server-side filtering)
      VALUES ?type {{ {type_values} }}

      # Add location (P131: located in administrative territorial entity)
      OPTIONAL {{ ?item wdt:P131 ?location . }}

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{
        ?item skos:altLabel ?itemAltLabel .
        FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en"))
      }}

      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "fr,ar,en" .
      }}
    }}
    LIMIT 200
    """


def _fetch_bindings() -> list:
    """Run the query against the SPARQL endpoint and return its result rows.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    headers = {'User-Agent': USER_AGENT}
    params = {'query': _build_query(), 'format': 'json'}

    print("Executing SPARQL query...")
    time.sleep(REQUEST_DELAY_SECONDS)

    response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
    response.raise_for_status()

    return response.json().get("results", {}).get("bindings", [])


def _report_target_presence(bindings: list) -> None:
    """Print whether TARGET_QID appears in *bindings* (first matching row only)."""
    for binding in bindings:
        if _qid_from_uri(_value(binding, "item")) == TARGET_QID:
            print(f"\n✅ {TARGET_QID} IS in SPARQL results")
            print(f" Label: {binding.get('itemLabel', {}).get('value', 'N/A')}")
            print(f" Location: {binding.get('locationLabel', {}).get('value', 'N/A')}")
            return
    print(f"\n❌ {TARGET_QID} NOT in SPARQL results")


def search_wikidata_debug(name: str, inst_type: str, city: Optional[str] = None) -> Optional[dict]:
    """Search with extensive logging.

    Queries the SPARQL endpoint for Tunisian items of the university types,
    then filters each result row by entity type, (optionally) by city, and
    by fuzzy name similarity, printing every step taken for TARGET_QID.

    Args:
        name: Institution name to match against each row's ``itemLabel``.
        inst_type: Institution type; city validation is applied only for
            types listed in CITY_MATCH_REQUIRED_TYPES.
        city: City of the institution. When empty or None, the geographic
            validation step is skipped.

    Returns:
        ``{"qid", "name", "match_score"}`` for the best-scoring row when its
        score reaches NAME_MATCH_THRESHOLD, otherwise None.

    Raises:
        requests.HTTPError: if the SPARQL endpoint answers with an error status.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"SEARCHING: {name}")
    print(f"Type: {inst_type}, City: {city}")
    print(banner)

    bindings = _fetch_bindings()
    total = len(bindings)
    print(f"Total results from Wikidata: {total}")

    # Check if the traced item is in the results at all
    _report_target_presence(bindings)

    # Now apply matching logic
    print("\n--- APPLYING MATCHING LOGIC ---")

    best_match = None
    best_score = 0
    name_lower = name.lower()
    city_lower = city.lower() if city else None
    requires_city_match = inst_type in CITY_MATCH_REQUIRED_TYPES

    print(f"Requires city match: {requires_city_match}")
    print(f"Processing {total} candidates...")

    rejected_count = {
        'entity_type': 0,
        'no_location': 0,
        'location_mismatch': 0,
        'low_score': 0,
        'passed': 0,
    }

    for index, binding in enumerate(bindings, 1):
        qid = _qid_from_uri(_value(binding, "item"))
        item_label = _value(binding, "itemLabel")

        # Only log details for the traced item
        is_target = qid == TARGET_QID
        if is_target:
            print(f"\n🎯 Processing {TARGET_QID} (result {index}/{total}):")

        # Step 1: Entity type validation
        entity_type_uri = _value(binding, "type")
        entity_type_qid = _qid_from_uri(entity_type_uri) if entity_type_uri else None
        if entity_type_qid not in UNIVERSITY_TYPES:
            rejected_count['entity_type'] += 1
            if is_target:
                print(f" ❌ STEP 1 FAILED: Entity type {entity_type_qid} not in valid types")
            continue
        if is_target:
            print(f" ✅ STEP 1 PASSED: Entity type validated ({entity_type_qid})")

        # Step 2: Geographic validation
        # NOTE(review): fuzz.ratio compares the whole strings, so a P131 label
        # that merely contains the city name may score below the threshold.
        # Left unchanged on purpose so this script mirrors the logic under
        # investigation - confirm against the production matcher.
        if city_lower and requires_city_match:
            location_label = _value(binding, "locationLabel").lower()
            if not location_label:
                rejected_count['no_location'] += 1
                if is_target:
                    print(" ❌ STEP 2 FAILED: No location data")
                continue
            if is_target:
                print(f" ✅ STEP 2a: Has location data ('{location_label}')")

            location_match = fuzz.ratio(city_lower, location_label)
            if is_target:
                print(f" Location match score: {location_match}% "
                      f"(threshold: {LOCATION_MATCH_THRESHOLD}%)")
            if location_match < LOCATION_MATCH_THRESHOLD:
                rejected_count['location_mismatch'] += 1
                if is_target:
                    print(f" ❌ STEP 2b FAILED: Location match {location_match}% "
                          f"< {LOCATION_MATCH_THRESHOLD}%")
                continue
            if is_target:
                print(" ✅ STEP 2b PASSED: Location validated")

        # Step 3: Name fuzzy matching
        # NOTE(review): only ?itemLabel is scored; ?itemAltLabel is fetched by
        # the query but not used here. The label language preference is
        # "fr,ar,en", so an English search name is presumably compared with a
        # French label when one exists - confirm.
        item_label_lower = item_label.lower()
        label_score = fuzz.ratio(name_lower, item_label_lower)
        partial_score = fuzz.partial_ratio(name_lower, item_label_lower)
        token_score = fuzz.token_set_ratio(name_lower, item_label_lower)
        score = max(label_score, partial_score, token_score)

        if is_target:
            print(" STEP 3: Name matching:")
            print(f" Search: '{name_lower}'")
            print(f" Label: '{item_label_lower}'")
            print(f" Scores: label={label_score}%, partial={partial_score}%, token={token_score}%")
            print(f" Best: {score}%")

        if score > best_score:
            best_score = score
            best_match = binding
            if is_target:
                print(" ✅ STEP 3 PASSED: New best match!")
        elif is_target:
            # Previously silent: the traced item was scored but did not
            # displace the current best candidate.
            print(f" STEP 3: Not a new best ({score}% <= current best {best_score}%)")

        # Tally every scored candidate so the summary below adds up.
        if score >= NAME_MATCH_THRESHOLD:
            rejected_count['passed'] += 1
        else:
            rejected_count['low_score'] += 1

    print("\n--- FILTERING RESULTS ---")
    print(f" Entity type rejected: {rejected_count['entity_type']}")
    print(f" No location data: {rejected_count['no_location']}")
    print(f" Location mismatch: {rejected_count['location_mismatch']}")
    print(f" Low name score: {rejected_count['low_score']}")
    print(f" Passed all checks: {rejected_count['passed']}")

    print("\n--- FINAL RESULT ---")
    print(f"Best score: {best_score}%")
    print(f"Threshold: {NAME_MATCH_THRESHOLD}%")

    # Checked first: with no surviving candidate best_score is still 0, and
    # reporting that as a "rejected score" would be misleading.
    if best_match is None:
        print("❌ No match found")
        return None

    if best_score < NAME_MATCH_THRESHOLD:
        print(f"❌ REJECTED: Best score {best_score}% < {NAME_MATCH_THRESHOLD}%")
        return None

    qid = _qid_from_uri(_value(best_match, "item"))
    matched_label = _value(best_match, "itemLabel")
    print(f"✅ MATCH: {qid} - {matched_label}")
    print(f" Score: {best_score}%")

    return {
        "qid": qid,
        "name": matched_label,
        "match_score": best_score,
    }


def main() -> None:
    """Load the institutions YAML and run the debug search for TARGET_NAME."""
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    banner = "=" * 60

    print("Tunisia Wikidata Enrichment - DEBUG MODE")
    print(banner)
    print(f"Testing with {TARGET_NAME}")
    print(banner)

    # Load data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    institutions = data['institutions']

    # Find the institution under test
    target_inst = next(
        (inst for inst in institutions if inst.get('name') == TARGET_NAME),
        None,
    )
    if target_inst is None:
        print(f"❌ {TARGET_NAME} not found in data")
        return

    # `locations` may be missing, null or an empty list; fall back to one
    # empty record so the city lookup cannot raise.
    locations = target_inst.get('locations') or [{}]
    city = locations[0].get('city', '')

    print("\nFound institution:")
    print(f" Name: {target_inst['name']}")
    print(f" Type: {target_inst.get('institution_type')}")
    print(f" City: {city}")

    # Test search
    result = search_wikidata_debug(
        target_inst['name'],
        target_inst.get('institution_type', 'UNIVERSITY'),
        city,
    )

    print(f"\n{banner}")
    if result:
        print(f"SUCCESS! Would enrich with {result['qid']}")
    else:
        print("FAILURE! No match found (this is the bug)")
    print(banner)


if __name__ == '__main__':
    main()