# glam/scripts/debug_enrich_tunisia.py

#!/usr/bin/env python3
"""
Debug version of Tunisia enrichment script with extensive logging.
Focus on finding why University of Sousse (Q3551673) isn't being matched.
"""

import yaml
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from rapidfuzz import fuzz

# Wikidata Query Service endpoint and the User-Agent header sent with each request.
USER_AGENT = "GLAM-Tunisia-Debug/1.0"
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# QIDs accepted as instance-of (P31) values when validating a UNIVERSITY record.
# NOTE(review): the labels below are carried over from the original comments;
# confirm each QID against Wikidata before relying on them.
UNIVERSITY_TYPES = {
    "Q3918",      # university
    "Q38723",     # higher education institution
    "Q875538",    # public university
    "Q2467461",   # private university
    "Q3354859",   # technical university
    "Q15936437",  # research university
}

def _qid_from_uri(uri: str) -> str:
    """Return the last ``/``-separated segment of an entity URI ('' for '')."""
    return uri.split("/")[-1]


def search_wikidata_debug(name: str, inst_type: str, city: str = None, *,
                          target_qid: str = "Q3551673"):
    """Query Wikidata for Tunisian universities and trace the matching steps.

    Runs one SPARQL query against ``SPARQL_ENDPOINT`` for items with
    ``P17 = Q948`` whose ``P31`` value is in ``UNIVERSITY_TYPES``, then pushes
    every result row through three steps: entity-type validation, optional
    city validation, and fuzzy name matching. Detailed per-step output is
    printed only for rows whose QID equals ``target_qid``; summary counters
    are printed for all rows.

    Args:
        name: Institution name compared with each row's ``itemLabel``.
        inst_type: Institution type string. It only decides whether the city
            check is enforced; the SPARQL type filter always uses
            ``UNIVERSITY_TYPES`` regardless of this value.
        city: City compared with each row's ``locationLabel``. When falsy,
            the geographic step is skipped.
        target_qid: QID whose processing is logged in detail.

    Returns:
        A dict with keys ``qid``, ``name`` and ``match_score`` for the
        best-scoring row when that score is at least 70, otherwise ``None``.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    name_threshold = 70
    location_threshold = 70

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"SEARCHING: {name}")
    print(f"Type: {inst_type}, City: {city}")
    print(banner)

    # Sorted so the generated query text is identical from run to run
    # (set iteration order is not stable across interpreter processes).
    type_values = " ".join(f"wd:{qid}" for qid in sorted(UNIVERSITY_TYPES))

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
                    ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
                    ?location ?locationLabel
    WHERE {{
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type matching our institution type
      ?item wdt:P31 ?type .

      # Filter to relevant types for this institution (server-side filtering)
      VALUES ?type {{ {type_values} }}

      # Add location (P131: located in administrative territorial entity)
      OPTIONAL {{ ?item wdt:P131 ?location . }}

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 200
    """

    headers = {'User-Agent': USER_AGENT}
    params = {'query': query, 'format': 'json'}

    print("Executing SPARQL query...")
    time.sleep(1.5)  # pause before hitting the public endpoint
    response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
    response.raise_for_status()

    bindings = response.json().get("results", {}).get("bindings", [])

    print(f"Total results from Wikidata: {len(bindings)}")

    # Report whether the target item made it into the raw result set at all.
    target_row = next(
        (row for row in bindings
         if _qid_from_uri(row.get("item", {}).get("value", "")) == target_qid),
        None,
    )
    if target_row is not None:
        print(f"\n✅ {target_qid} IS in SPARQL results")
        print(f"   Label: {target_row.get('itemLabel', {}).get('value', 'N/A')}")
        print(f"   Location: {target_row.get('locationLabel', {}).get('value', 'N/A')}")
    else:
        print(f"\n❌ {target_qid} NOT in SPARQL results")

    print("\n--- APPLYING MATCHING LOGIC ---")

    best_match = None
    best_score = 0

    name_lower = name.lower()
    city_lower = city.lower() if city else None
    requires_city_match = inst_type in {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}

    print(f"Requires city match: {requires_city_match}")
    print(f"Processing {len(bindings)} candidates...")

    # Counters are per result row, not per distinct item: the OPTIONAL
    # clauses can yield several rows for the same QID.
    rejected_count = {
        'entity_type': 0,
        'no_location': 0,
        'location_mismatch': 0,
        'low_score': 0,
        'passed': 0
    }

    for i, binding in enumerate(bindings, 1):
        qid = _qid_from_uri(binding.get("item", {}).get("value", ""))
        item_label = binding.get("itemLabel", {}).get("value", "")

        # Only log details for the target item
        is_target = (qid == target_qid)

        if is_target:
            print(f"\n🎯 Processing {target_qid} (result {i}/{len(bindings)}):")

        # Step 1: Entity type validation
        entity_type_uri = binding.get("type", {}).get("value", "")
        entity_type_qid = _qid_from_uri(entity_type_uri) if entity_type_uri else None

        if entity_type_qid not in UNIVERSITY_TYPES:
            rejected_count['entity_type'] += 1
            if is_target:
                print(f"   ❌ STEP 1 FAILED: Entity type {entity_type_qid} not in valid types")
            continue

        if is_target:
            print(f"   ✅ STEP 1 PASSED: Entity type validated ({entity_type_qid})")

        # Step 2: Geographic validation
        if city_lower and requires_city_match:
            location_label = (binding.get("locationLabel") or {}).get("value", "").lower()

            if not location_label:
                rejected_count['no_location'] += 1
                if is_target:
                    print("   ❌ STEP 2 FAILED: No location data")
                continue

            if is_target:
                print(f"   ✅ STEP 2a: Has location data ('{location_label}')")

            location_match = fuzz.ratio(city_lower, location_label)

            if is_target:
                print(f"   Location match score: {location_match}% (threshold: {location_threshold}%)")

            if location_match < location_threshold:
                rejected_count['location_mismatch'] += 1
                if is_target:
                    print(f"   ❌ STEP 2b FAILED: Location match {location_match}% < {location_threshold}%")
                continue

            if is_target:
                print("   ✅ STEP 2b PASSED: Location validated")

        # Step 3: Name fuzzy matching
        # NOTE(review): only ?itemLabel is scored. The label service is asked
        # for "fr,ar,en" in that order, and ?itemAltLabel is fetched by the
        # query but never compared here — confirm whether that is intended.
        item_label_lower = item_label.lower()

        label_score = fuzz.ratio(name_lower, item_label_lower)
        partial_score = fuzz.partial_ratio(name_lower, item_label_lower)
        token_score = fuzz.token_set_ratio(name_lower, item_label_lower)

        score = max(label_score, partial_score, token_score)

        if is_target:
            print("   STEP 3: Name matching:")
            print(f"     Search: '{name_lower}'")
            print(f"     Label: '{item_label_lower}'")
            print(f"     Scores: label={label_score}%, partial={partial_score}%, token={token_score}%")
            print(f"     Best: {score}%")

        if score > best_score:
            best_score = score
            best_match = binding
            if is_target:
                print("   ✅ STEP 3 PASSED: New best match!")

        # Every row that reaches this point lands in exactly one bucket, so
        # the "Low name score" line in the summary reflects real rejections.
        if score >= name_threshold:
            rejected_count['passed'] += 1
        else:
            rejected_count['low_score'] += 1

    print("\n--- FILTERING RESULTS ---")
    print(f"  Entity type rejected: {rejected_count['entity_type']}")
    print(f"  No location data: {rejected_count['no_location']}")
    print(f"  Location mismatch: {rejected_count['location_mismatch']}")
    print(f"  Low name score: {rejected_count['low_score']}")
    print(f"  Passed all checks: {rejected_count['passed']}")

    print("\n--- FINAL RESULT ---")
    print(f"Best score: {best_score}%")
    print(f"Threshold: {name_threshold}%")

    # Checked before the score threshold: with no surviving candidate the
    # score is always 0, so testing the threshold first would hide this case.
    if best_match is None:
        print("❌ No match found")
        return None

    if best_score < name_threshold:
        print(f"❌ REJECTED: Best score {best_score}% < {name_threshold}%")
        return None

    qid = _qid_from_uri(best_match.get("item", {}).get("value", ""))
    matched_label = best_match.get("itemLabel", {}).get("value", "")

    print(f"✅ MATCH: {qid} - {matched_label}")
    print(f"   Score: {best_score}%")

    return {
        "qid": qid,
        "name": matched_label,
        "match_score": best_score
    }

def main():
    """Run the debug search for the 'University of Sousse' record.

    Loads the Tunisian institutions YAML file (path is relative to the
    current working directory), looks up the entry whose ``name`` is exactly
    'University of Sousse', prints its name/type/city, and calls
    ``search_wikidata_debug`` with those values. Prints a SUCCESS or FAILURE
    banner depending on whether a match was returned.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')

    banner = "=" * 60
    print("Tunisia Wikidata Enrichment - DEBUG MODE")
    print(banner)
    print("Testing with University of Sousse")
    print(banner)

    # Load data
    with input_file.open('r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    institutions = data['institutions']

    # Find University of Sousse (first entry with an exact name match)
    target_inst = next(
        (inst for inst in institutions if inst.get('name') == 'University of Sousse'),
        None,
    )

    if not target_inst:
        print("❌ University of Sousse not found in data")
        return

    # `locations` may be missing, None, or an empty list; fall back to a
    # single empty record so the city lookup yields '' instead of raising.
    locations = target_inst.get('locations') or [{}]
    city = (locations[0] or {}).get('city', '')

    print("\nFound institution:")
    print(f"  Name: {target_inst['name']}")
    print(f"  Type: {target_inst.get('institution_type')}")
    print(f"  City: {city}")

    # Test search
    result = search_wikidata_debug(
        target_inst['name'],
        target_inst.get('institution_type', 'UNIVERSITY'),
        city
    )

    print(f"\n{banner}")
    if result:
        print(f"SUCCESS! Would enrich with {result['qid']}")
    else:
        print("FAILURE! No match found (this is the bug)")
    print(banner)

# Run the debug session only when executed as a script, not on import.
if __name__ == "__main__":
    main()