glam/scripts/debug_enrich_tunisia.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

270 lines
9.1 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Debug version of Tunisia enrichment script with extensive logging.
Focus on finding why University of Sousse (Q3551673) isn't being matched.
"""
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import requests
import yaml
from rapidfuzz import fuzz
# Wikidata Query Service endpoint and the User-Agent string for requests to it.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Debug/1.0"

# Wikidata QIDs accepted as "instance of" (P31) values for UNIVERSITY records.
# NOTE(review): the labels below are carried over from the original comments;
# confirm each QID against Wikidata before relying on them.
UNIVERSITY_TYPES = {
    "Q3918",      # University
    "Q875538",    # Public university
    "Q2467461",   # Private university
    "Q15936437",  # Research university
    "Q38723",     # Higher education institution
    "Q3354859",   # Technical university
}
def _binding_value(binding: dict, key: str) -> str:
    """Return the ``value`` string stored under *key* in a SPARQL JSON binding, or ''."""
    return (binding.get(key) or {}).get("value", "")


def _binding_qid(binding: dict) -> str:
    """Return the last path segment of the binding's ``item`` URI (the QID)."""
    return _binding_value(binding, "item").split("/")[-1]


def search_wikidata_debug(
    name: str,
    inst_type: str,
    city: Optional[str] = None,
    target_qid: str = "Q3551673",
) -> Optional[dict]:
    """Search Wikidata for *name* and log every filtering step for one target QID.

    The function queries the Wikidata SPARQL endpoint for items in Tunisia
    (``P17 = Q948``) whose ``P31`` type is in ``UNIVERSITY_TYPES``, then runs
    each result row through three steps: entity-type validation, optional
    city validation, and fuzzy name matching. Detailed per-step output is
    printed only for rows whose QID equals *target_qid*; aggregate rejection
    counts are printed for all rows.

    Args:
        name: Institution name to match against each item's label.
        inst_type: Institution type; city validation is enforced only for
            ``UNIVERSITY``, ``RESEARCH_CENTER`` and ``EDUCATION_PROVIDER``.
        city: City used for location validation. ``None`` or an empty string
            disables the location step.
        target_qid: QID whose progress through the steps is logged in detail.
            Defaults to ``Q3551673``, the item this debug script investigates.

    Returns:
        A dict with ``qid``, ``name`` and ``match_score`` for the best-scoring
        candidate, or ``None`` when the best score is below the threshold.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    name_threshold = 70
    location_threshold = 70

    print(f"\n{'='*60}")
    print(f"SEARCHING: {name}")
    print(f"Type: {inst_type}, City: {city}")
    print(f"{'='*60}")

    # Build query. Sorting keeps the query text identical between runs
    # (set iteration order is not stable across interpreter sessions).
    type_values = " ".join(f"wd:{qid}" for qid in sorted(UNIVERSITY_TYPES))
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
           ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
           ?location ?locationLabel
    WHERE {{
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .
      # Must have an instance-of type matching our institution type
      ?item wdt:P31 ?type .
      # Filter to relevant types for this institution (server-side filtering)
      VALUES ?type {{ {type_values} }}
      # Add location (P131: located in administrative territorial entity)
      OPTIONAL {{ ?item wdt:P131 ?location . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 200
    """

    headers = {'User-Agent': USER_AGENT}
    params = {'query': query, 'format': 'json'}

    print("Executing SPARQL query...")
    time.sleep(1.5)  # pause before hitting the public endpoint
    response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
    response.raise_for_status()

    results = response.json()
    bindings = results.get("results", {}).get("bindings", [])
    print(f"Total results from Wikidata: {len(bindings)}")

    # Check if the target QID is in the raw results at all (first row wins).
    target_row = next((row for row in bindings if _binding_qid(row) == target_qid), None)
    if target_row is not None:
        target_label = target_row.get('itemLabel', {}).get('value', 'N/A')
        target_location = target_row.get('locationLabel', {}).get('value', 'N/A')
        print(f"\n✅ {target_qid} IS in SPARQL results")
        print(f" Label: {target_label}")
        print(f" Location: {target_location}")
    else:
        print(f"\n❌ {target_qid} NOT in SPARQL results")

    # Now apply matching logic
    print("\n--- APPLYING MATCHING LOGIC ---")
    best_match = None
    best_score = 0
    name_lower = name.lower()
    city_lower = city.lower() if city else None
    requires_city_match = inst_type in {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}
    print(f"Requires city match: {requires_city_match}")
    print(f"Processing {len(bindings)} candidates...")

    rejected_count = {
        'entity_type': 0,
        'no_location': 0,
        'location_mismatch': 0,
        'low_score': 0,
        'passed': 0,
    }

    for i, binding in enumerate(bindings, 1):
        qid = _binding_qid(binding)
        item_label = _binding_value(binding, "itemLabel")

        # Only log details for the target QID
        is_target = (qid == target_qid)
        if is_target:
            print(f"\n🎯 Processing {target_qid} (result {i}/{len(bindings)}):")

        # Step 1: Entity type validation
        entity_type_uri = _binding_value(binding, "type")
        entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None
        if entity_type_qid not in UNIVERSITY_TYPES:
            rejected_count['entity_type'] += 1
            if is_target:
                print(f" ❌ STEP 1 FAILED: Entity type {entity_type_qid} not in valid types")
            continue
        if is_target:
            print(f" ✅ STEP 1 PASSED: Entity type validated ({entity_type_qid})")

        # Step 2: Geographic validation
        if city_lower and requires_city_match:
            location_label = _binding_value(binding, "locationLabel").lower()
            if not location_label:
                rejected_count['no_location'] += 1
                if is_target:
                    print(" ❌ STEP 2 FAILED: No location data")
                continue
            if is_target:
                print(f" ✅ STEP 2a: Has location data ('{location_label}')")

            location_match = fuzz.ratio(city_lower, location_label)
            if is_target:
                print(f" Location match score: {location_match}% (threshold: {location_threshold}%)")
            if location_match < location_threshold:
                rejected_count['location_mismatch'] += 1
                if is_target:
                    print(f" ❌ STEP 2b FAILED: Location match {location_match}% < {location_threshold}%")
                continue
            if is_target:
                print(" ✅ STEP 2b PASSED: Location validated")

        # Step 3: Name fuzzy matching (best of three rapidfuzz scorers)
        item_label_lower = item_label.lower()
        label_score = fuzz.ratio(name_lower, item_label_lower)
        partial_score = fuzz.partial_ratio(name_lower, item_label_lower)
        token_score = fuzz.token_set_ratio(name_lower, item_label_lower)
        score = max(label_score, partial_score, token_score)

        if is_target:
            print(" STEP 3: Name matching:")
            print(f" Search: '{name_lower}'")
            print(f" Label: '{item_label_lower}'")
            print(f" Scores: label={label_score}%, partial={partial_score}%, token={token_score}%")
            print(f" Best: {score}%")

        if score > best_score:
            best_score = score
            best_match = binding
            if is_target:
                print(" ✅ STEP 3 PASSED: New best match!")

        # Every row that reaches this point is tallied exactly once, so the
        # "Low name score" line in the summary reflects real rejections.
        if score >= name_threshold:
            rejected_count['passed'] += 1
        else:
            rejected_count['low_score'] += 1

    print("\n--- FILTERING RESULTS ---")
    print(f" Entity type rejected: {rejected_count['entity_type']}")
    print(f" No location data: {rejected_count['no_location']}")
    print(f" Location mismatch: {rejected_count['location_mismatch']}")
    print(f" Low name score: {rejected_count['low_score']}")
    print(f" Passed all checks: {rejected_count['passed']}")

    print("\n--- FINAL RESULT ---")
    print(f"Best score: {best_score}%")
    print(f"Threshold: {name_threshold}%")

    # best_match is always set once best_score has risen above 0, so the
    # score check alone covers the "no candidate" case as well.
    if best_match is None or best_score < name_threshold:
        print(f"❌ REJECTED: Best score {best_score}% < {name_threshold}%")
        return None

    qid = _binding_qid(best_match)
    matched_label = _binding_value(best_match, "itemLabel")
    print(f"✅ MATCH: {qid} - {matched_label}")
    print(f" Score: {best_score}%")

    return {
        "qid": qid,
        "name": matched_label,
        "match_score": best_score,
    }
def main():
    """Run the debug search for the 'University of Sousse' record.

    Loads the Tunisian institutions YAML file, looks up the entry whose
    ``name`` is exactly ``'University of Sousse'``, prints its name, type and
    city, and passes them to ``search_wikidata_debug``. A success or failure
    banner is printed depending on whether a match is returned.

    Raises:
        FileNotFoundError: If the input YAML file does not exist (raised by
            ``open``).
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')

    print("Tunisia Wikidata Enrichment - DEBUG MODE")
    print("="*60)
    print("Testing with University of Sousse")
    print("="*60)

    # Load data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    institutions = data['institutions']

    # Find University of Sousse
    target_inst = next(
        (inst for inst in institutions if inst.get('name') == 'University of Sousse'),
        None,
    )
    if not target_inst:
        print("❌ University of Sousse not found in data")
        return

    # Resolve the city once. A missing key, a None value or an empty list all
    # fall back to "no city" instead of raising IndexError/TypeError.
    locations = target_inst.get('locations') or [{}]
    city = (locations[0] or {}).get('city', '')

    print("\nFound institution:")
    print(f" Name: {target_inst['name']}")
    print(f" Type: {target_inst.get('institution_type')}")
    print(f" City: {city}")

    # Test search
    result = search_wikidata_debug(
        target_inst['name'],
        target_inst.get('institution_type', 'UNIVERSITY'),
        city,
    )

    banner = '=' * 60
    if result:
        print(f"\n{banner}")
        print(f"SUCCESS! Would enrich with {result['qid']}")
        print(f"{banner}")
    else:
        print(f"\n{banner}")
        print("FAILURE! No match found (this is the bug)")
        print(f"{banner}")
# Run the debug session only when executed as a script, not when imported.
if __name__ == "__main__":
    main()