- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
184 lines
6 KiB
Python
184 lines
6 KiB
Python
#!/usr/bin/env python3
"""
Debug version with verbose logging to see exactly what's happening.
"""
import requests
|
|
import time
|
|
from rapidfuzz import fuzz
|
|
|
|
# Public Wikidata Query Service endpoint; WDQS policy requires a
# descriptive User-Agent on every request.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Debug/1.0"

# Wikidata QIDs accepted as "university-like" institution types when
# validating an entity's P31 (instance of) value.
UNIVERSITY_TYPES = {
    'Q3918',      # University
    'Q875538',    # Public university
    'Q2467461',   # Private university
    'Q15936437',  # Research university
    'Q38723',     # Higher education institution
    'Q3354859',   # Technical university
}
|
# Fuzzy-match acceptance threshold (percent), shared by the location and
# name checks below (was duplicated as a magic 70 in both places).
MATCH_THRESHOLD = 70

# The entity whose absence from the matcher's output we are debugging.
TARGET_QID = "Q3551673"


def _fetch_university_bindings():
    """Run the Tunisia-universities SPARQL query and return its bindings.

    Sleeps briefly first as basic rate limiting for the shared public
    endpoint.  Raises requests.HTTPError on a non-2xx response.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
                    ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
                    ?location ?locationLabel
    WHERE {
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type
      ?item wdt:P31 ?type .

      # Limit to universities.
      # NOTE(review): this VALUES list covers only three of the six QIDs in
      # UNIVERSITY_TYPES, so entities typed solely as e.g. Q2467461
      # (private university) are never retrieved -- confirm intentional.
      VALUES ?type {
        wd:Q3918    # University
        wd:Q875538  # Public university
        wd:Q38723   # Higher education institution
      }

      # Add location (P131: located in administrative territorial entity)
      OPTIONAL { ?item wdt:P131 ?location . }

      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 200
    """
    print("Executing SPARQL query...")
    time.sleep(1.5)  # stay polite toward the public WDQS endpoint
    response = requests.get(
        SPARQL_ENDPOINT,
        params={'query': query, 'format': 'json'},
        headers={'User-Agent': USER_AGENT},
        timeout=60,
    )
    response.raise_for_status()
    return response.json().get("results", {}).get("bindings", [])


def _find_binding(bindings, qid):
    """Return the first binding whose ?item URI ends in *qid*, else None."""
    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        if item_uri.split("/")[-1] == qid:
            return binding
    return None


def _validate_type(binding):
    """Step 1: check the binding's ?type QID against UNIVERSITY_TYPES.

    Returns True when the type is accepted, False (after printing the
    rejection) otherwise.
    """
    print("\n--- STEP 1: Entity Type Validation ---")
    type_uri = binding.get("type", {}).get("value", "")
    type_qid = type_uri.split("/")[-1] if type_uri else None
    print(f"  Entity type QID: {type_qid}")
    print(f"  Valid types: {UNIVERSITY_TYPES}")
    print(f"  Type matches: {type_qid in UNIVERSITY_TYPES}")
    if type_qid not in UNIVERSITY_TYPES:
        print("  ❌ REJECTED: Entity type mismatch")
        return False
    print("  ✅ PASSED: Entity type validated")
    return True


def _validate_location(binding, city_lower, requires_city_match=True):
    """Step 2: fuzzy-compare the binding's P131 label against *city_lower*.

    Returns True when the check passes or is not required; False (after
    printing the reason) when the entity has no location data or the
    fuzzy score falls below MATCH_THRESHOLD.
    """
    print("\n--- STEP 2: Geographic Validation ---")
    print(f"  Requires city match: {requires_city_match}")
    if not (city_lower and requires_city_match):
        return True

    loc = binding.get("locationLabel")
    location_label = loc.get("value", "").lower() if loc else ""
    print(f"  Location label from Wikidata: '{location_label}'")
    print(f"  Expected city: '{city_lower}'")
    if not location_label:
        print("  ❌ REJECTED: No location data")
        return False
    print("  ✅ Has location data")

    score = fuzz.ratio(city_lower, location_label)
    print(f"  Location fuzzy match: {score}%")
    print(f"  Threshold: {MATCH_THRESHOLD}%")
    if score < MATCH_THRESHOLD:
        print("  ❌ REJECTED: Location match below threshold")
        return False
    print("  ✅ PASSED: Location validated")
    return True


def _validate_name(binding, name_lower):
    """Step 3: fuzzy-match the search name against the Wikidata label.

    Takes the best of ratio / partial_ratio / token_set_ratio and compares
    it to MATCH_THRESHOLD.  Returns True on a pass, False otherwise.
    """
    print("\n--- STEP 3: Name Fuzzy Matching ---")
    item_label = binding.get("itemLabel", {}).get("value", "").lower()
    print(f"  Search name: '{name_lower}'")
    print(f"  Wikidata label: '{item_label}'")

    scores = {
        'Label score': fuzz.ratio(name_lower, item_label),
        'Partial score': fuzz.partial_ratio(name_lower, item_label),
        'Token set score': fuzz.token_set_ratio(name_lower, item_label),
    }
    print("\n  Fuzzy match scores:")
    for label, value in scores.items():
        print(f"    {label}: {value}%")

    best_score = max(scores.values())
    print(f"  Best score: {best_score}%")
    print(f"  Threshold: {MATCH_THRESHOLD}%")
    if best_score < MATCH_THRESHOLD:
        print("  ❌ REJECTED: Match score below threshold")
        return False
    print("  ✅ PASSED: Name match validated")
    return True


def debug_search():
    """Search Wikidata for the target entity and trace the matching logic.

    Runs the universities query, looks for TARGET_QID in the results, then
    replays the matcher's three validation steps (type, location, name)
    with verbose output so a failed match can be attributed to a specific
    rule.  Network I/O only; returns None.
    """
    name = "University of Sousse"
    inst_type = "UNIVERSITY"
    city = "Sousse"

    bindings = _fetch_university_bindings()
    print(f"Total results: {len(bindings)}")

    binding = _find_binding(bindings, TARGET_QID)
    if binding is None:
        print(f"\n❌ {TARGET_QID} NOT in the results")
        return

    print(f"\n✅ {TARGET_QID} IS in the results!")
    print(f"\nData for {TARGET_QID}:")
    for field in ("itemLabel", "typeLabel", "locationLabel"):
        print(f"  {field}: {binding.get(field, {}).get('value', 'N/A')}")

    print(f"\n{'='*60}")
    print("SIMULATING MATCHING LOGIC")
    print(f"{'='*60}")

    print("\nSearch parameters:")
    print(f"  Name: '{name}'")
    print(f"  Institution Type: {inst_type}")
    print(f"  City: '{city}'")

    if not _validate_type(binding):
        return
    # UNIVERSITY requires a city match in the real matcher.
    if not _validate_location(binding, city.lower(), requires_city_match=True):
        return
    if _validate_name(binding, name.lower()):
        print(f"\n🎉 {TARGET_QID} SHOULD BE MATCHED!")
|
if __name__ == "__main__":
    # Script entry point: run the verbose diagnostic search.
    debug_search()