glam/scripts/debug_matching_logic.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

184 lines
6 KiB
Python

#!/usr/bin/env python3
"""
Debug version with verbose logging to see exactly what's happening.
"""
import requests
import time
from rapidfuzz import fuzz
# Public Wikidata SPARQL endpoint queried by this script.
SPARQL_ENDPOINT: str = "https://query.wikidata.org/sparql"
# Identifies this script in the HTTP User-Agent header of every request.
USER_AGENT: str = "GLAM-Tunisia-Debug/1.0"
# Valid types for UNIVERSITY
# Wikidata "instance of" (P31) QIDs that the matcher accepts as a university.
UNIVERSITY_TYPES: set[str] = {
    'Q3918', # University
    'Q875538', # Public university
    'Q2467461', # Private university
    'Q15936437', # Research university
    'Q38723', # Higher education institution
    'Q3354859', # Technical university
}
def _fetch_university_bindings():
    """Run the fixed SPARQL query for universities located in Tunisia.

    Returns:
        list[dict]: the raw ``results.bindings`` entries from the Wikidata
        JSON response (empty list when nothing matched).

    Raises:
        requests.HTTPError: when the endpoint answers with an error status.
    """
    # NOTE: the query restricts ?type to three university QIDs, a subset of
    # UNIVERSITY_TYPES — kept as-is because this script debugs that exact query.
    query = """
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
?viaf ?isil ?website ?coords ?inception ?itemAltLabel
?location ?locationLabel
WHERE {
# Must be in Tunisia
?item wdt:P17 wd:Q948 .
# Must have an instance-of type
?item wdt:P31 ?type .
# Limit to universities
VALUES ?type {
wd:Q3918 # University
wd:Q875538 # Public university
wd:Q38723 # Higher education institution
}
# Add location (P131: located in administrative territorial entity)
OPTIONAL { ?item wdt:P131 ?location . }
OPTIONAL { ?item wdt:P214 ?viaf . }
OPTIONAL { ?item wdt:P791 ?isil . }
OPTIONAL { ?item wdt:P856 ?website . }
OPTIONAL { ?item wdt:P625 ?coords . }
OPTIONAL { ?item wdt:P571 ?inception . }
OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }
SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
}
LIMIT 200
"""
    print("Executing SPARQL query...")
    time.sleep(1.5)  # throttle: be polite to the shared public endpoint
    response = requests.get(
        SPARQL_ENDPOINT,
        params={'query': query, 'format': 'json'},
        headers={'User-Agent': USER_AGENT},
        timeout=60,
    )
    response.raise_for_status()
    return response.json().get("results", {}).get("bindings", [])


def _qid_of(binding):
    """Extract the bare QID (e.g. 'Q3551673') from a binding's ?item URI."""
    return binding.get("item", {}).get("value", "").split("/")[-1]


def _find_binding(bindings, qid):
    """Return the first SPARQL binding whose ?item QID equals *qid*, else None."""
    for binding in bindings:
        if _qid_of(binding) == qid:
            return binding
    return None


def _simulate_matching(binding, name, inst_type, city):
    """Replay the matcher's three validation steps against one candidate binding.

    Steps, each logged verbosely: (1) entity type must be a known university
    QID, (2) the P131 location label must fuzzily match the expected city,
    (3) the search name must fuzzily match the Wikidata label.

    Args:
        binding: one SPARQL result binding (dict of variable -> value dict).
        name: institution name to match against the Wikidata label.
        inst_type: institution category label (printed only).
        city: expected city name; empty string skips the geographic check.

    Returns:
        bool: True when every step passed, False on the first rejection.
    """
    threshold = 70  # minimum rapidfuzz score (percent) for city and name checks
    print(f"\n{'='*60}")
    print("SIMULATING MATCHING LOGIC")
    print(f"{'='*60}")
    print("\nSearch parameters:")
    print(f" Name: '{name}'")
    print(f" Institution Type: {inst_type}")
    print(f" City: '{city}'")
    name_lower = name.lower()
    city_lower = city.lower()
    requires_city_match = True  # UNIVERSITY requires city match

    # --- Step 1: the candidate's P31 type must be a known university type.
    print("\n--- STEP 1: Entity Type Validation ---")
    entity_type_uri = binding.get("type", {}).get("value", "")
    entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None
    print(f" Entity type QID: {entity_type_qid}")
    print(f" Valid types: {UNIVERSITY_TYPES}")
    print(f" Type matches: {entity_type_qid in UNIVERSITY_TYPES}")
    if entity_type_qid not in UNIVERSITY_TYPES:
        print(" ❌ REJECTED: Entity type mismatch")
        return False
    print(" ✅ PASSED: Entity type validated")

    # --- Step 2: location label must fuzzily match the expected city.
    print("\n--- STEP 2: Geographic Validation ---")
    print(f" Requires city match: {requires_city_match}")
    if city_lower and requires_city_match:
        location_label = binding.get("locationLabel", {}).get("value", "").lower() if binding.get("locationLabel") else ""
        print(f" Location label from Wikidata: '{location_label}'")
        print(f" Expected city: '{city_lower}'")
        if not location_label:
            print(" ❌ REJECTED: No location data")
            return False
        print(" ✅ Has location data")
        location_match = fuzz.ratio(city_lower, location_label)
        print(f" Location fuzzy match: {location_match}%")
        print(f" Threshold: {threshold}%")
        if location_match < threshold:
            print(" ❌ REJECTED: Location match below threshold")
            return False
        print(" ✅ PASSED: Location validated")

    # --- Step 3: fuzzy-match the search name against the Wikidata label,
    # taking the best of three rapidfuzz scorers.
    print("\n--- STEP 3: Name Fuzzy Matching ---")
    item_label = binding.get("itemLabel", {}).get("value", "").lower()
    print(f" Search name: '{name_lower}'")
    print(f" Wikidata label: '{item_label}'")
    label_score = fuzz.ratio(name_lower, item_label)
    partial_score = fuzz.partial_ratio(name_lower, item_label)
    token_score = fuzz.token_set_ratio(name_lower, item_label)
    print("\n Fuzzy match scores:")
    print(f" Label score: {label_score}%")
    print(f" Partial score: {partial_score}%")
    print(f" Token set score: {token_score}%")
    best_score = max(label_score, partial_score, token_score)
    print(f" Best score: {best_score}%")
    print(f" Threshold: {threshold}%")
    if best_score < threshold:
        print(" ❌ REJECTED: Match score below threshold")
        return False
    print(" ✅ PASSED: Name match validated")
    return True


def debug_search(name="University of Sousse", inst_type="UNIVERSITY",
                 city="Sousse", target_qid="Q3551673"):
    """Search Wikidata for Tunisian universities and trace the matching of one QID.

    Fetches all candidates via SPARQL, checks whether *target_qid* is among
    them, then replays the matcher's validation pipeline (entity type ->
    geography -> fuzzy name) with verbose logging so each accept/reject
    decision is visible.

    Args:
        name: institution name to fuzzy-match against Wikidata labels.
        inst_type: institution category label (informational only; the
            SPARQL query itself is fixed to university types).
        city: expected city; universities require a fuzzy city match.
        target_qid: Wikidata QID whose matching trace to inspect.
    """
    bindings = _fetch_university_bindings()
    print(f"Total results: {len(bindings)}")

    binding = _find_binding(bindings, target_qid)
    if binding is None:
        print(f"\n❌ {target_qid} NOT in the results")
        return

    print(f"\n✅ {target_qid} IS in the results!")
    print(f"\nData for {target_qid}:")
    print(f" itemLabel: {binding.get('itemLabel', {}).get('value', 'N/A')}")
    print(f" typeLabel: {binding.get('typeLabel', {}).get('value', 'N/A')}")
    print(f" locationLabel: {binding.get('locationLabel', {}).get('value', 'N/A')}")

    # Fix vs. original flow: only announce success when EVERY step passed;
    # previously a Step-3 rejection still printed the match banner.
    if _simulate_matching(binding, name, inst_type, city):
        print(f"\n🎉 {target_qid} SHOULD BE MATCHED!")
# Script entry point: run the verbose matching debug trace.
if __name__ == '__main__':
    debug_search()