- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
256 lines
8.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Query Wikidata for Chilean libraries to reach 70% coverage target - Batch 12.
|
|
|
|
Target: 7 libraries without Wikidata identifiers
|
|
Strategy: Bulk SPARQL query + fuzzy matching (threshold 75+)
|
|
Goal: Find 3+ matches to reach 63/90 (70.0%)
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import yaml
|
|
from typing import List, Dict, Any
|
|
from pathlib import Path
|
|
from SPARQLWrapper import SPARQLWrapper, JSON
|
|
from rapidfuzz import fuzz
|
|
|
|
# Public Wikidata Query Service (WDQS) SPARQL endpoint.
SPARQL_ENDPOINT: str = "https://query.wikidata.org/sparql"

# Input dataset: batch-11 enriched Chilean institutions (YAML list of dicts).
INPUT_FILE: Path = Path("data/instances/chile/chilean_institutions_batch11_enriched.yaml")
|
|
|
|
def load_institutions() -> List[Dict[str, Any]]:
    """Read the Chilean institutions dataset from the batch YAML file."""
    with INPUT_FILE.open('r', encoding='utf-8') as handle:
        data = yaml.safe_load(handle)
    return data
|
|
|
|
def get_libraries_without_wikidata(institutions: List[Dict]) -> List[Dict]:
    """Extract libraries that don't have Wikidata identifiers.

    Args:
        institutions: Institution records loaded from the batch YAML file.

    Returns:
        One ``{'name': ..., 'city': ...}`` dict per record whose
        ``institution_type`` is ``'LIBRARY'`` and whose ``identifiers``
        list contains no entry with scheme ``'Wikidata'``. Missing names
        and cities fall back to ``'Unknown'``.
    """
    libraries = []

    for inst in institutions:
        if inst.get('institution_type') != 'LIBRARY':
            continue

        # Skip libraries that already carry a Wikidata identifier.
        has_wikidata = any(
            i.get('identifier_scheme') == 'Wikidata'
            for i in inst.get('identifiers', [])
        )
        if has_wikidata:
            continue

        name = inst.get('name', 'Unknown')
        # Guard against a present-but-empty 'locations' list, which would
        # raise IndexError with a bare [0] access on the .get() default.
        locations = inst.get('locations') or [{}]
        city = locations[0].get('city', 'Unknown')
        libraries.append({'name': name, 'city': city})

    return libraries
|
|
|
|
def query_chilean_libraries() -> List[Dict[str, Any]]:
    """Query all Chilean libraries from Wikidata.

    Runs one bulk SPARQL query for every instance (or subclass instance)
    of 'library' (Q7075) located in Chile (Q298), with optional location,
    coordinates, VIAF, ISIL and founding-date values.

    Returns:
        One dict per result row with keys 'q_number', 'name', 'location',
        'coords', 'viaf', 'isil' and 'founded' (4-char year string or
        None). Returns an empty list if the query fails.
    """
    query = """
    SELECT DISTINCT ?library ?libraryLabel ?location ?locationLabel ?coords ?viaf ?isil ?founded WHERE {
      # Chilean libraries
      ?library wdt:P31/wdt:P279* wd:Q7075 .  # Instance of library (or subclass)
      ?library wdt:P17 wd:Q298 .             # Country: Chile

      # Optional: location
      OPTIONAL { ?library wdt:P131 ?location . }

      # Optional: coordinates
      OPTIONAL { ?library wdt:P625 ?coords . }

      # Optional: VIAF identifier
      OPTIONAL { ?library wdt:P214 ?viaf . }

      # Optional: ISIL code
      OPTIONAL { ?library wdt:P791 ?isil . }

      # Optional: founding date
      OPTIONAL { ?library wdt:P571 ?founded . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" }
    }
    ORDER BY ?libraryLabel
    """

    print("Querying Wikidata for Chilean libraries...")
    print(f"SPARQL endpoint: {SPARQL_ENDPOINT}")
    print()

    # Wikimedia's User-Agent policy requires a descriptive agent string;
    # requests with SPARQLWrapper's default agent are throttled or blocked.
    sparql = SPARQLWrapper(
        SPARQL_ENDPOINT,
        agent="ChileanInstitutionsBatch12/1.0 (library enrichment script)",
    )
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.setTimeout(120)  # 2 minute timeout

    try:
        start_time = time.time()
        results = sparql.query().convert()
        elapsed = time.time() - start_time

        print(f"✅ Query completed in {elapsed:.1f} seconds")
        print()

        # Extract results
        bindings = results['results']['bindings']  # type: ignore
        libraries = []

        for result in bindings:
            library = {
                # The Q-number is the last path segment of the entity URI.
                'q_number': result['library']['value'].split('/')[-1],  # type: ignore
                'name': result.get('libraryLabel', {}).get('value', 'Unknown'),  # type: ignore
                'location': result.get('locationLabel', {}).get('value'),  # type: ignore
                'coords': result.get('coords', {}).get('value'),  # type: ignore
                'viaf': result.get('viaf', {}).get('value'),  # type: ignore
                'isil': result.get('isil', {}).get('value'),  # type: ignore
                # Keep only the year (first 4 chars) of the founding timestamp.
                'founded': result.get('founded', {}).get('value', '')[:4] if 'founded' in result else None  # type: ignore
            }
            libraries.append(library)

        print(f"Found {len(libraries)} Chilean libraries in Wikidata")
        return libraries

    except Exception as e:
        # Best-effort: report the failure and let main() bail out cleanly.
        print(f"❌ Query failed: {e}")
        return []
|
|
|
|
def fuzzy_match_libraries(search_libraries: List[Dict], wikidata_libraries: List[Dict]) -> List[Dict]:
    """Fuzzy match search libraries against Wikidata results.

    For each target library the single best-scoring Wikidata candidate is
    tracked; it is reported as a match only when its combined score (name
    ratio plus a flat +10 location bonus) reaches the 75-point threshold.
    """
    matches = []

    print()
    print("=" * 80)
    print("FUZZY MATCHING RESULTS")
    print("=" * 80)
    print()

    for target in search_libraries:
        target_name = target['name']
        target_city = target['city']

        print(f"Searching for: {target_name} ({target_city})")

        best_candidate = None
        top_score = 0

        for candidate in wikidata_libraries:
            candidate_name = candidate['name']

            # Name similarity is the base score.
            name_score = fuzz.ratio(target_name.lower(), candidate_name.lower())

            # A plausible administrative-location match earns a flat bonus.
            bonus = 0
            if candidate.get('location'):
                if fuzz.partial_ratio(target_city.lower(), candidate['location'].lower()) > 70:
                    bonus = 10

            combined = name_score + bonus

            if combined > top_score:
                top_score = combined
                best_candidate = {
                    'search_name': target_name,
                    'search_city': target_city,
                    'q_number': candidate['q_number'],
                    'wikidata_name': candidate_name,
                    'wikidata_location': candidate.get('location'),
                    'match_score': name_score,
                    'total_score': combined,
                    'viaf': candidate.get('viaf'),
                    'isil': candidate.get('isil'),
                    'founded': candidate.get('founded')
                }

        if best_candidate and best_candidate['total_score'] >= 75:  # Threshold: 75+
            print(f" ✅ MATCH: {best_candidate['wikidata_name']} ({best_candidate['q_number']})")
            print(f" Score: {best_candidate['match_score']:.0f} (name) + {best_candidate['total_score'] - best_candidate['match_score']:.0f} (location) = {best_candidate['total_score']:.0f}")
            if best_candidate.get('wikidata_location'):
                print(f" Location: {best_candidate['wikidata_location']}")
            if best_candidate.get('founded'):
                print(f" Founded: {best_candidate['founded']}")
            matches.append(best_candidate)
        else:
            print(f" ❌ No match found (best score: {top_score:.0f})")

        print()

    return matches
|
|
|
|
def main():
    """Main execution: load data, query Wikidata, fuzzy match, save results."""
    print("=" * 80)
    print("CHILEAN LIBRARIES WIKIDATA QUERY - BATCH 12")
    print("=" * 80)
    print()

    # Load institutions
    print(f"Loading dataset: {INPUT_FILE}")
    institutions = load_institutions()
    print(f" Loaded {len(institutions)} institutions")
    print()

    # Get libraries without Wikidata
    search_libraries = get_libraries_without_wikidata(institutions)
    print(f"Target: {len(search_libraries)} libraries without Wikidata")
    print(f"Goal: Find 3+ matches to reach 70% coverage (63/90)")
    print()

    if not search_libraries:
        # Nothing left to enrich; bail out before the match-rate division
        # below (which would otherwise raise ZeroDivisionError).
        print("✅ All libraries already have Wikidata identifiers - nothing to do")
        return

    # Query Wikidata
    wikidata_libraries = query_chilean_libraries()

    if not wikidata_libraries:
        print("❌ No results from Wikidata")
        return

    # Fuzzy match
    matches = fuzzy_match_libraries(search_libraries, wikidata_libraries)

    # Save results
    output = {
        "batch": 12,
        "query_date": time.strftime("%Y-%m-%d"),
        "institution_type": "LIBRARY",
        "total_searched": len(search_libraries),
        "matches_found": len(matches),
        "matches": matches
    }

    output_file = "scripts/batch12_library_query_results.json"
    # Ensure the output directory exists when run from a fresh checkout.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Wikidata libraries found: {len(wikidata_libraries)}")
    print(f"Matches found: {len(matches)}")
    print(f"Match rate: {len(matches)/len(search_libraries)*100:.1f}%")
    print()
    print(f"✅ Results saved to: {output_file}")
    print()

    if len(matches) >= 3:
        print(f"🎯 SUCCESS! Found {len(matches)} matches - enough to reach 70% target")
    else:
        print(f"⚠️ Only found {len(matches)} matches - need {3 - len(matches)} more for 70% target")

    print()
    print("Next step: Manual validation with scripts/finalize_batch12.py")


if __name__ == "__main__":
    main()
|