glam/scripts/query_wikidata_libraries_batch12.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

256 lines
8.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Query Wikidata for Chilean libraries to reach 70% coverage target - Batch 12.
Target: 7 libraries without Wikidata identifiers
Strategy: Bulk SPARQL query + fuzzy matching (threshold 75+)
Goal: Find 3+ matches to reach 63/90 (70.0%)
"""
import json
import time
import yaml
from typing import List, Dict, Any
from pathlib import Path
from SPARQLWrapper import SPARQLWrapper, JSON
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# Input dataset
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch11_enriched.yaml")
def load_institutions() -> List[Dict[str, Any]]:
    """Read and parse the Chilean institutions dataset from INPUT_FILE (YAML)."""
    raw_text = INPUT_FILE.read_text(encoding='utf-8')
    return yaml.safe_load(raw_text)
def get_libraries_without_wikidata(institutions: List[Dict]) -> List[Dict]:
    """Return name/city records for LIBRARY institutions lacking a Wikidata id.

    Args:
        institutions: Institution dicts as loaded from the batch YAML dataset.
            Each may carry 'institution_type', 'name', 'identifiers'
            (list of dicts with 'identifier_scheme') and 'locations'
            (list of dicts with 'city').

    Returns:
        A list of {'name': str, 'city': str} dicts, one per library that has
        no identifier with scheme 'Wikidata'. Missing names and cities fall
        back to 'Unknown'.
    """
    libraries = []
    for inst in institutions:
        if inst.get('institution_type') != 'LIBRARY':
            continue
        # Skip libraries that already carry a Wikidata identifier.
        has_wikidata = any(
            i.get('identifier_scheme') == 'Wikidata'
            for i in inst.get('identifiers', [])
        )
        if has_wikidata:
            continue
        name = inst.get('name', 'Unknown')
        # Bug fix: the original used inst.get('locations', [{}])[0], which
        # raised IndexError when 'locations' was present but an empty list.
        locations = inst.get('locations') or [{}]
        city = locations[0].get('city', 'Unknown')
        libraries.append({'name': name, 'city': city})
    return libraries
def query_chilean_libraries() -> List[Dict[str, Any]]:
    """Fetch every Chilean library from Wikidata in one bulk SPARQL query.

    Returns:
        A list of dicts with keys q_number, name, location, coords, viaf,
        isil and founded ('founded' is the 4-character year prefix when
        present, otherwise None). An empty list on any query failure.
    """
    query = """
    SELECT DISTINCT ?library ?libraryLabel ?location ?locationLabel ?coords ?viaf ?isil ?founded WHERE {
      # Chilean libraries
      ?library wdt:P31/wdt:P279* wd:Q7075 .  # Instance of library (or subclass)
      ?library wdt:P17 wd:Q298 .             # Country: Chile
      # Optional: location
      OPTIONAL {
        ?library wdt:P131 ?location .
      }
      # Optional: coordinates
      OPTIONAL {
        ?library wdt:P625 ?coords .
      }
      # Optional: VIAF identifier
      OPTIONAL {
        ?library wdt:P214 ?viaf .
      }
      # Optional: ISIL code
      OPTIONAL {
        ?library wdt:P791 ?isil .
      }
      # Optional: founding date
      OPTIONAL {
        ?library wdt:P571 ?founded .
      }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" }
    }
    ORDER BY ?libraryLabel
    """

    def extract(row: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten one SPARQL result binding into a plain library dict."""
        record = {
            'q_number': row['library']['value'].split('/')[-1],  # type: ignore
            'name': row.get('libraryLabel', {}).get('value', 'Unknown'),  # type: ignore
            'location': row.get('locationLabel', {}).get('value'),  # type: ignore
            'coords': row.get('coords', {}).get('value'),  # type: ignore
            'viaf': row.get('viaf', {}).get('value'),  # type: ignore
            'isil': row.get('isil', {}).get('value'),  # type: ignore
        }
        # Keep only the year: first four characters of the xsd:dateTime value.
        record['founded'] = row.get('founded', {}).get('value', '')[:4] if 'founded' in row else None  # type: ignore
        return record

    print("Querying Wikidata for Chilean libraries...")
    print(f"SPARQL endpoint: {SPARQL_ENDPOINT}")
    print()
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    client.setTimeout(120)  # 2 minute timeout
    try:
        started = time.time()
        response = client.query().convert()
        elapsed = time.time() - started
        print(f"✅ Query completed in {elapsed:.1f} seconds")
        print()
        # Flatten each result binding into a plain dict.
        found = [extract(row) for row in response['results']['bindings']]  # type: ignore
        print(f"Found {len(found)} Chilean libraries in Wikidata")
        return found
    except Exception as e:
        # Best-effort: report the failure and let the caller bail out on [].
        print(f"❌ Query failed: {e}")
        return []
def fuzzy_match_libraries(search_libraries: List[Dict], wikidata_libraries: List[Dict]) -> List[Dict]:
    """Match each target library against the Wikidata result set by name.

    Name similarity is fuzz.ratio on lowercased names; a +10 bonus is added
    when the Wikidata administrative location resembles the target city
    (fuzz.partial_ratio > 70). The best-scoring candidate is accepted only
    when its combined score is at least 75.

    Args:
        search_libraries: Dicts with 'name' and 'city' of libraries to find.
        wikidata_libraries: Flattened Wikidata query results.

    Returns:
        One match dict per accepted candidate, carrying both the search
        terms and the matched Wikidata fields plus the scores.
    """
    matches = []
    print()
    print("=" * 80)
    print("FUZZY MATCHING RESULTS")
    print("=" * 80)
    print()
    for target in search_libraries:
        target_name = target['name']
        target_city = target['city']
        print(f"Searching for: {target_name} ({target_city})")
        top = None
        top_score = 0
        for candidate in wikidata_libraries:
            candidate_name = candidate['name']
            # Base similarity on the (lowercased) institution names.
            name_score = fuzz.ratio(target_name.lower(), candidate_name.lower())
            # Location agreement earns a flat bonus on top of the name score.
            bonus = 0
            candidate_loc = candidate.get('location')
            if candidate_loc and fuzz.partial_ratio(target_city.lower(), candidate_loc.lower()) > 70:
                bonus = 10
            combined = name_score + bonus
            if combined > top_score:
                top_score = combined
                top = {
                    'search_name': target_name,
                    'search_city': target_city,
                    'q_number': candidate['q_number'],
                    'wikidata_name': candidate_name,
                    'wikidata_location': candidate_loc,
                    'match_score': name_score,
                    'total_score': combined,
                    'viaf': candidate.get('viaf'),
                    'isil': candidate.get('isil'),
                    'founded': candidate.get('founded')
                }
        if top and top['total_score'] >= 75:  # Threshold: 75+
            print(f" ✅ MATCH: {top['wikidata_name']} ({top['q_number']})")
            print(f" Score: {top['match_score']:.0f} (name) + {top['total_score'] - top['match_score']:.0f} (location) = {top['total_score']:.0f}")
            if top.get('wikidata_location'):
                print(f" Location: {top['wikidata_location']}")
            if top.get('founded'):
                print(f" Founded: {top['founded']}")
            matches.append(top)
        else:
            print(f" ❌ No match found (best score: {top_score:.0f})")
        print()
    return matches
def main():
    """Main execution: load dataset, query Wikidata, fuzzy match, save JSON.

    Side effects: prints a progress report to stdout and writes the match
    results to scripts/batch12_library_query_results.json.
    """
    print("=" * 80)
    print("CHILEAN LIBRARIES WIKIDATA QUERY - BATCH 12")
    print("=" * 80)
    print()
    # Load institutions
    print(f"Loading dataset: {INPUT_FILE}")
    institutions = load_institutions()
    print(f" Loaded {len(institutions)} institutions")
    print()
    # Get libraries without Wikidata
    search_libraries = get_libraries_without_wikidata(institutions)
    print(f"Target: {len(search_libraries)} libraries without Wikidata")
    # Note: plain string (the original used an f-string with no placeholders).
    print("Goal: Find 3+ matches to reach 70% coverage (63/90)")
    print()
    # Query Wikidata; bail out early when the bulk query returned nothing.
    wikidata_libraries = query_chilean_libraries()
    if not wikidata_libraries:
        print("❌ No results from Wikidata")
        return
    # Fuzzy match
    matches = fuzzy_match_libraries(search_libraries, wikidata_libraries)
    # Save results
    output = {
        "batch": 12,
        "query_date": time.strftime("%Y-%m-%d"),
        "institution_type": "LIBRARY",
        "total_searched": len(search_libraries),
        "matches_found": len(matches),
        "matches": matches
    }
    output_file = "scripts/batch12_library_query_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Wikidata libraries found: {len(wikidata_libraries)}")
    print(f"Matches found: {len(matches)}")
    # Bug fix: the original divided by len(search_libraries) unconditionally,
    # raising ZeroDivisionError when every library already had a Wikidata id.
    if search_libraries:
        print(f"Match rate: {len(matches)/len(search_libraries)*100:.1f}%")
    else:
        print("Match rate: n/a (no libraries to search)")
    print()
    print(f"✅ Results saved to: {output_file}")
    print()
    if len(matches) >= 3:
        print(f"🎯 SUCCESS! Found {len(matches)} matches - enough to reach 70% target")
    else:
        print(f"⚠️ Only found {len(matches)} matches - need {3 - len(matches)} more for 70% target")
    print()
    print("Next step: Manual validation with scripts/finalize_batch12.py")


if __name__ == "__main__":
    main()