- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
226 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test the production script's search function directly.
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
|
|
# Import the function from the production script
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
# We'll copy the function here to avoid import issues
|
|
import requests
|
|
import time
|
|
from typing import Optional, Dict, Any, Set
|
|
from rapidfuzz import fuzz
|
|
|
|
# Wikidata's public SPARQL endpoint and an identifying User-Agent
# (sent with every request so the service can attribute this client).
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Test/1.0"

# Wikidata QIDs accepted as `instance of` (P31) values for universities.
# Used to filter SPARQL results server-side and validate them client-side.
UNIVERSITY_TYPES = {
    'Q3918',      # University
    'Q875538',    # Public university
    'Q2467461',   # Private university
    'Q15936437',  # Research university
    'Q38723',     # Higher education institution
    'Q3354859',   # Technical university
}
|
def get_valid_types_for_institution(inst_type: str) -> Set[str]:
    """Map an institution category to the Wikidata QIDs it may be an instance of.

    Only the 'UNIVERSITY' category is currently recognised; every other
    category yields an empty set, which callers treat as "unknown type".
    """
    if inst_type != 'UNIVERSITY':
        return set()
    return UNIVERSITY_TYPES
|
def search_wikidata_with_validation(
    name: str,
    inst_type: str,
    city: Optional[str] = None,
    timeout: int = 60
) -> Optional[Dict[str, Any]]:
    """Production script function - copied exactly.

    Query the Wikidata SPARQL endpoint for institutions located in Tunisia
    (P17 = Q948) whose `instance of` (P31) type is accepted for *inst_type*,
    then fuzzy-match *name* (and, for location-specific types, *city*)
    against the returned candidates.

    Args:
        name: Institution name to match against Wikidata labels.
        inst_type: Institution category; must map to a non-empty type set
            via get_valid_types_for_institution, otherwise None is returned.
        city: Expected city label; when set and the type requires it,
            candidates whose P131 location label doesn't fuzzily match
            (ratio >= 70) are skipped.
        timeout: HTTP timeout in seconds for the SPARQL request.

    Returns:
        A dict with keys qid/name/description/entity_type/match_score for
        the best candidate scoring >= 70%, or None when there is no
        confident match. Any request/parse error is printed and swallowed,
        also yielding None.
    """

    # Get valid Wikidata entity types for this institution type
    valid_types = get_valid_types_for_institution(inst_type)

    if not valid_types:
        print(f" ⚠️ Unknown institution type: {inst_type}")
        return None

    # Build VALUES clause for SPARQL query
    type_values = " ".join([f"wd:{qid}" for qid in valid_types])

    # Build SPARQL query
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
           ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
           ?location ?locationLabel
    WHERE {{
      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must have an instance-of type matching our institution type
      ?item wdt:P31 ?type .

      # Filter to relevant types for this institution (server-side filtering)
      VALUES ?type {{ {type_values} }}

      # Add location (P131: located in administrative territorial entity)
      OPTIONAL {{ ?item wdt:P131 ?location . }}

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 200
    """

    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }

    try:
        time.sleep(1.5)  # Rate limiting
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()

        results = response.json()
        bindings = results.get("results", {}).get("bindings", [])

        print(f"Got {len(bindings)} results from SPARQL")

        if not bindings:
            return None

        # Fuzzy match against results WITH entity type AND geographic validation
        best_match = None
        best_score = 0

        name_lower = name.lower()
        city_lower = city.lower() if city else None

        # Location-specific institution types require stricter geographic matching
        requires_city_match = inst_type in {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}

        print(f"Requires city match: {requires_city_match}")
        print(f"City filter: {city_lower}")

        passed_validation = 0

        for binding in bindings:
            # CRITICAL: Validate entity type FIRST
            entity_type_uri = binding.get("type", {}).get("value", "")
            entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None

            # Skip if entity type doesn't match our institution type
            if entity_type_qid not in valid_types:
                continue

            # GEOGRAPHIC VALIDATION: Check location match for location-specific institutions
            if city_lower and requires_city_match:
                location_label = binding.get("locationLabel", {}).get("value", "").lower() if binding.get("locationLabel") else ""

                # Must have location data
                if not location_label:
                    continue

                # Location must match expected city (fuzzy match for spelling variations)
                location_match = fuzz.ratio(city_lower, location_label)
                if location_match < 70:  # Location mismatch - skip this result
                    continue

            # Passed validation!
            passed_validation += 1
            item_label = binding.get("itemLabel", {}).get("value", "")
            qid = binding.get("item", {}).get("value", "").split("/")[-1]

            # Now do fuzzy matching on validated entities only
            item_label_lower = item_label.lower()

            # Calculate match score using multiple strategies
            label_score = fuzz.ratio(name_lower, item_label_lower)
            partial_score = fuzz.partial_ratio(name_lower, item_label_lower)
            token_score = fuzz.token_set_ratio(name_lower, item_label_lower)

            # Best of the three fuzzy match strategies
            score = max(label_score, partial_score, token_score)

            # Debug tracing for one specific candidate QID — presumably the
            # expected University of Sousse entity; verify against main().
            if qid == "Q3551673":
                print(f"✅ Found Q3551673 in loop:")
                print(f"   Label: {item_label}")
                print(f"   Score: {score}%")
                print(f"   Best score so far: {best_score}%")

            # Keep the single highest-scoring validated candidate
            if score > best_score:
                best_score = score
                best_match = binding
                if qid == "Q3551673":
                    print(f"   ✅ Updated best_match to Q3551673!")

        print(f"Passed validation: {passed_validation}")
        print(f"Best score: {best_score}%")

        # Require minimum 70% match
        if best_score < 70:
            print(f"❌ Best score {best_score}% < 70%, returning None")
            return None

        # Extract data from best match
        if not best_match:
            print(f"❌ No best_match found, returning None")
            return None

        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None

        # Sanity-check the extracted identifier looks like a Wikidata QID
        if not qid or not qid.startswith("Q"):
            print(f"❌ Invalid QID: {qid}")
            return None

        print(f"✅ Returning match: {qid}")

        result = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "entity_type": best_match.get("typeLabel", {}).get("value", ""),
            "match_score": best_score
        }

        return result

    except Exception as e:
        # Broad catch is deliberate in this test script: any network,
        # HTTP, or JSON failure is reported and treated as "no match".
        print(f"❌ Error: {e}")
        return None
|
def main():
    """Run one live end-to-end probe of the production search function."""
    print("Testing Production Function")
    print("=" * 60)

    # Single smoke test: University of Sousse with city-level validation.
    match = search_wikidata_with_validation(
        "University of Sousse",
        "UNIVERSITY",
        "Sousse",
        timeout=60
    )

    if not match:
        print("\n❌ FAILURE! Function returned None")
        return

    print("\n✅ SUCCESS!")
    print(f" QID: {match['qid']}")
    print(f" Name: {match['name']}")
    print(f" Score: {match['match_score']}%")
|
# Script entry point: run the single live-endpoint smoke test.
if __name__ == '__main__':
    main()