- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.

270 lines · 9.1 KiB · Python · Executable file
#!/usr/bin/env python3
"""
Debug version of the Tunisia enrichment script with extensive logging.

Focus on finding why University of Sousse (Q3551673) isn't being matched.
"""
|
|
|
|
import yaml
|
|
import time
|
|
import requests
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from rapidfuzz import fuzz
|
|
|
|
# Public Wikidata SPARQL endpoint; WDQS policy requires a descriptive User-Agent.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Debug/1.0"

# Wikidata QIDs accepted as university-like institution types (valid P31 values).
UNIVERSITY_TYPES = {
    'Q3918',      # university
    'Q875538',    # public university
    'Q2467461',   # private university
    'Q15936437',  # research university
    'Q38723',     # higher education institution
    'Q3354859',   # technical university
}
|
|
|
|
def search_wikidata_debug(name: str, inst_type: str, city: str = None):
    """Search Wikidata for a Tunisian institution with step-by-step debug logging.

    Queries the Wikidata SPARQL endpoint for items located in Tunisia
    (P17 = Q948) whose instance-of type (P31) is in UNIVERSITY_TYPES, then
    applies three filters to each candidate:
      1. entity-type validation,
      2. geographic validation (P131 location label vs. *city*, fuzzy >= 70%),
      3. fuzzy name matching (best of ratio/partial/token-set, threshold 70%).
    Extra diagnostics are printed for Q3551673 (University of Sousse), the
    item this debug script exists to trace.

    Args:
        name: Institution name to match against Wikidata labels.
        inst_type: Institution type code; 'UNIVERSITY', 'RESEARCH_CENTER' and
            'EDUCATION_PROVIDER' additionally require a city match.
        city: City name used for geographic validation (step skipped if falsy).

    Returns:
        dict with keys 'qid', 'name' and 'match_score' for the best candidate
        scoring >= 70, otherwise None.

    Raises:
        requests.HTTPError: on a non-2xx response from the SPARQL endpoint.
    """
    print(f"\n{'='*60}")
    print(f"SEARCHING: {name}")
    print(f"Type: {inst_type}, City: {city}")
    print(f"{'='*60}")

    # Build query: inline the accepted P31 QIDs so filtering happens server-side.
    type_values = " ".join([f"wd:{qid}" for qid in UNIVERSITY_TYPES])

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
                    ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
                    ?location ?locationLabel
    WHERE {{
        # Must be in Tunisia
        ?item wdt:P17 wd:Q948 .

        # Must have an instance-of type matching our institution type
        ?item wdt:P31 ?type .

        # Filter to relevant types for this institution (server-side filtering)
        VALUES ?type {{ {type_values} }}

        # Add location (P131: located in administrative territorial entity)
        OPTIONAL {{ ?item wdt:P131 ?location . }}

        OPTIONAL {{ ?item wdt:P214 ?viaf . }}
        OPTIONAL {{ ?item wdt:P791 ?isil . }}
        OPTIONAL {{ ?item wdt:P856 ?website . }}
        OPTIONAL {{ ?item wdt:P625 ?coords . }}
        OPTIONAL {{ ?item wdt:P571 ?inception . }}
        OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}

        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 200
    """

    headers = {'User-Agent': USER_AGENT}
    params = {'query': query, 'format': 'json'}

    print("Executing SPARQL query...")
    time.sleep(1.5)  # throttle: be polite to the public WDQS endpoint
    response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
    response.raise_for_status()

    results = response.json()
    bindings = results.get("results", {}).get("bindings", [])

    print(f"Total results from Wikidata: {len(bindings)}")

    # Sanity check: is the known-good target item present in the raw results
    # at all, before any client-side filtering?
    found_target = False
    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1]
        if qid == "Q3551673":
            found_target = True
            print(f"\n✅ Q3551673 IS in SPARQL results")
            print(f" Label: {binding.get('itemLabel', {}).get('value', 'N/A')}")
            print(f" Location: {binding.get('locationLabel', {}).get('value', 'N/A')}")
            break

    if not found_target:
        print(f"\n❌ Q3551673 NOT in SPARQL results")

    # Now apply matching logic
    print(f"\n--- APPLYING MATCHING LOGIC ---")

    best_match = None
    best_score = 0

    name_lower = name.lower()
    city_lower = city.lower() if city else None
    requires_city_match = inst_type in {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}

    print(f"Requires city match: {requires_city_match}")
    print(f"Processing {len(bindings)} candidates...")

    # Tally of how each candidate fared, printed in the summary below.
    rejected_count = {
        'entity_type': 0,
        'no_location': 0,
        'location_mismatch': 0,
        'low_score': 0,
        'passed': 0
    }

    for i, binding in enumerate(bindings, 1):
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1]
        item_label = binding.get("itemLabel", {}).get("value", "")

        # Only log details for Q3551673
        is_target = (qid == "Q3551673")

        if is_target:
            print(f"\n🎯 Processing Q3551673 (result {i}/{len(bindings)}):")

        # Step 1: Entity type validation
        entity_type_uri = binding.get("type", {}).get("value", "")
        entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None

        if entity_type_qid not in UNIVERSITY_TYPES:
            rejected_count['entity_type'] += 1
            if is_target:
                print(f" ❌ STEP 1 FAILED: Entity type {entity_type_qid} not in valid types")
            continue

        if is_target:
            print(f" ✅ STEP 1 PASSED: Entity type validated ({entity_type_qid})")

        # Step 2: Geographic validation
        if city_lower and requires_city_match:
            location_label = binding.get("locationLabel", {}).get("value", "").lower() if binding.get("locationLabel") else ""

            if not location_label:
                rejected_count['no_location'] += 1
                if is_target:
                    print(f" ❌ STEP 2 FAILED: No location data")
                continue

            if is_target:
                print(f" ✅ STEP 2a: Has location data ('{location_label}')")

            location_match = fuzz.ratio(city_lower, location_label)

            if is_target:
                print(f" Location match score: {location_match}% (threshold: 70%)")

            if location_match < 70:
                rejected_count['location_mismatch'] += 1
                if is_target:
                    print(f" ❌ STEP 2b FAILED: Location match {location_match}% < 70%")
                continue

            if is_target:
                print(f" ✅ STEP 2b PASSED: Location validated")

        # Step 3: Name fuzzy matching — take the best of three rapidfuzz scorers
        # so word reordering or partial labels don't sink a good candidate.
        item_label_lower = item_label.lower()

        label_score = fuzz.ratio(name_lower, item_label_lower)
        partial_score = fuzz.partial_ratio(name_lower, item_label_lower)
        token_score = fuzz.token_set_ratio(name_lower, item_label_lower)

        score = max(label_score, partial_score, token_score)

        if is_target:
            print(f" STEP 3: Name matching:")
            print(f" Search: '{name_lower}'")
            print(f" Label: '{item_label_lower}'")
            print(f" Scores: label={label_score}%, partial={partial_score}%, token={token_score}%")
            print(f" Best: {score}%")

        if score > best_score:
            best_score = score
            best_match = binding
            if is_target:
                print(f" ✅ STEP 3 PASSED: New best match!")

        # BUG FIX: 'low_score' was initialized and printed in the summary but
        # never incremented, so the "Low name score" diagnostic always showed 0.
        # Tally both outcomes so the rejection counts add up.
        if score >= 70:
            rejected_count['passed'] += 1
        else:
            rejected_count['low_score'] += 1

    print(f"\n--- FILTERING RESULTS ---")
    print(f" Entity type rejected: {rejected_count['entity_type']}")
    print(f" No location data: {rejected_count['no_location']}")
    print(f" Location mismatch: {rejected_count['location_mismatch']}")
    print(f" Low name score: {rejected_count['low_score']}")
    print(f" Passed all checks: {rejected_count['passed']}")

    print(f"\n--- FINAL RESULT ---")
    print(f"Best score: {best_score}%")
    print(f"Threshold: 70%")

    if best_score < 70:
        print(f"❌ REJECTED: Best score {best_score}% < 70%")
        return None

    if not best_match:
        print(f"❌ No match found")
        return None

    item_uri = best_match.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1]

    print(f"✅ MATCH: {qid} - {best_match.get('itemLabel', {}).get('value', '')}")
    print(f" Score: {best_score}%")

    return {
        "qid": qid,
        "name": best_match.get("itemLabel", {}).get("value", ""),
        "match_score": best_score
    }
|
|
|
|
def main():
    """Load the Tunisian institutions YAML and debug-match University of Sousse.

    Reads the enhanced institutions file, locates the 'University of Sousse'
    record, and runs search_wikidata_debug() on it, printing SUCCESS/FAILURE.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')

    print("Tunisia Wikidata Enrichment - DEBUG MODE")
    print("="*60)
    print("Testing with University of Sousse")
    print("="*60)

    # Load data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    institutions = data['institutions']

    # Find University of Sousse
    target_inst = next(
        (inst for inst in institutions if inst.get('name') == 'University of Sousse'),
        None,
    )

    if not target_inst:
        print("❌ University of Sousse not found in data")
        return

    # BUG FIX: the original used `target_inst.get('locations', [{}])[0]`, which
    # raises IndexError when the record carries an explicit empty `locations: []`
    # (the .get default only applies when the key is absent entirely).
    locations = target_inst.get('locations') or [{}]
    city = locations[0].get('city', '')

    print(f"\nFound institution:")
    print(f" Name: {target_inst['name']}")
    print(f" Type: {target_inst.get('institution_type')}")
    print(f" City: {city}")

    # Test search
    result = search_wikidata_debug(
        target_inst['name'],
        target_inst.get('institution_type', 'UNIVERSITY'),
        city,
    )

    if result:
        print(f"\n{'='*60}")
        print(f"SUCCESS! Would enrich with {result['qid']}")
        print(f"{'='*60}")
    else:
        print(f"\n{'='*60}")
        print(f"FAILURE! No match found (this is the bug)")
        print(f"{'='*60}")
|
|
|
|
# Script entry point: run the debug search only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|