#!/usr/bin/env python3
"""
Query Wikidata for Chilean libraries to reach 70% coverage target - Batch 12.

Target: 7 libraries without Wikidata identifiers
Strategy: Bulk SPARQL query + fuzzy matching (threshold 75+)
Goal: Find 3+ matches to reach 63/90 (70.0%)
"""

import json
import time
from pathlib import Path
from typing import Any, Dict, List

import yaml
from rapidfuzz import fuzz
from SPARQLWrapper import JSON, SPARQLWrapper

# Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Input dataset
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch11_enriched.yaml")


def load_institutions() -> List[Dict[str, Any]]:
    """Load Chilean institutions from YAML.

    Returns:
        The parsed YAML document: a list of institution dicts.
    """
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def get_libraries_without_wikidata(institutions: List[Dict]) -> List[Dict]:
    """Extract libraries that don't have Wikidata identifiers.

    Args:
        institutions: Institution dicts as loaded from the YAML dataset.

    Returns:
        A list of ``{'name': ..., 'city': ...}`` dicts for every
        LIBRARY-type institution lacking a 'Wikidata' identifier scheme.
    """
    libraries = []
    for inst in institutions:
        if inst.get('institution_type') != 'LIBRARY':
            continue
        # Check if already has Wikidata
        has_wikidata = any(
            i.get('identifier_scheme') == 'Wikidata'
            for i in inst.get('identifiers', [])
        )
        if not has_wikidata:
            name = inst.get('name', 'Unknown')
            # BUGFIX: `inst.get('locations', [{}])[0]` raised IndexError
            # when the key existed but held an empty list; `or [{}]`
            # covers both missing and empty cases.
            locations = inst.get('locations') or [{}]
            city = locations[0].get('city', 'Unknown')
            libraries.append({'name': name, 'city': city})
    return libraries


def query_chilean_libraries() -> List[Dict[str, Any]]:
    """Query all Chilean libraries from Wikidata.

    Runs one bulk SPARQL query (instance of library or subclass, country
    Chile) with optional location/coords/VIAF/ISIL/founding-date fields.

    Returns:
        A list of flat dicts per library; empty list on query failure.
    """
    query = """
    SELECT DISTINCT ?library ?libraryLabel ?location ?locationLabel ?coords ?viaf ?isil ?founded
    WHERE {
      # Chilean libraries
      ?library wdt:P31/wdt:P279* wd:Q7075 .  # Instance of library (or subclass)
      ?library wdt:P17 wd:Q298 .             # Country: Chile

      # Optional: location
      OPTIONAL { ?library wdt:P131 ?location . }

      # Optional: coordinates
      OPTIONAL { ?library wdt:P625 ?coords . }

      # Optional: VIAF identifier
      OPTIONAL { ?library wdt:P214 ?viaf . }

      # Optional: ISIL code
      OPTIONAL { ?library wdt:P791 ?isil . }

      # Optional: founding date
      OPTIONAL { ?library wdt:P571 ?founded . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" }
    }
    ORDER BY ?libraryLabel
    """

    print("Querying Wikidata for Chilean libraries...")
    print(f"SPARQL endpoint: {SPARQL_ENDPOINT}")
    print()

    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.setTimeout(120)  # 2 minute timeout

    try:
        start_time = time.time()
        results = sparql.query().convert()
        elapsed = time.time() - start_time
        print(f"✅ Query completed in {elapsed:.1f} seconds")
        print()

        # Extract results
        bindings = results['results']['bindings']  # type: ignore
        libraries = []
        for result in bindings:
            library = {
                # Q-number is the last path segment of the entity URI
                'q_number': result['library']['value'].split('/')[-1],  # type: ignore
                'name': result.get('libraryLabel', {}).get('value', 'Unknown'),  # type: ignore
                'location': result.get('locationLabel', {}).get('value'),  # type: ignore
                'coords': result.get('coords', {}).get('value'),  # type: ignore
                'viaf': result.get('viaf', {}).get('value'),  # type: ignore
                'isil': result.get('isil', {}).get('value'),  # type: ignore
                # Keep only the year (first 4 chars of the ISO timestamp)
                'founded': result.get('founded', {}).get('value', '')[:4] if 'founded' in result else None  # type: ignore
            }
            libraries.append(library)

        print(f"Found {len(libraries)} Chilean libraries in Wikidata")
        return libraries

    except Exception as e:
        # Best-effort script: report and return empty so main() can bail out.
        print(f"❌ Query failed: {e}")
        return []


def fuzzy_match_libraries(search_libraries: List[Dict],
                          wikidata_libraries: List[Dict]) -> List[Dict]:
    """Fuzzy match search libraries against Wikidata results.

    Scoring: RapidFuzz name ratio (0-100) plus a +10 bonus when the
    search city partially matches the Wikidata location (>70). Matches
    with total score >= 75 are accepted.

    Args:
        search_libraries: ``{'name', 'city'}`` dicts to resolve.
        wikidata_libraries: Candidate dicts from query_chilean_libraries().

    Returns:
        A list of accepted match dicts (one per resolved library).
    """
    matches = []

    print()
    print("=" * 80)
    print("FUZZY MATCHING RESULTS")
    print("=" * 80)
    print()

    for search_lib in search_libraries:
        search_name = search_lib['name']
        search_city = search_lib['city']

        print(f"Searching for: {search_name} ({search_city})")

        best_match = None
        best_score = 0

        for wd_lib in wikidata_libraries:
            wd_name = wd_lib['name']

            # Fuzzy match on name
            name_score = fuzz.ratio(search_name.lower(), wd_name.lower())

            # Bonus for location match
            location_bonus = 0
            if wd_lib.get('location'):
                location_score = fuzz.partial_ratio(search_city.lower(),
                                                    wd_lib['location'].lower())
                if location_score > 70:
                    location_bonus = 10

            total_score = name_score + location_bonus

            if total_score > best_score:
                best_score = total_score
                best_match = {
                    'search_name': search_name,
                    'search_city': search_city,
                    'q_number': wd_lib['q_number'],
                    'wikidata_name': wd_name,
                    'wikidata_location': wd_lib.get('location'),
                    'match_score': name_score,
                    'total_score': total_score,
                    'viaf': wd_lib.get('viaf'),
                    'isil': wd_lib.get('isil'),
                    'founded': wd_lib.get('founded')
                }

        if best_match and best_match['total_score'] >= 75:  # Threshold: 75+
            print(f"  ✅ MATCH: {best_match['wikidata_name']} ({best_match['q_number']})")
            print(f"     Score: {best_match['match_score']:.0f} (name) + {best_match['total_score'] - best_match['match_score']:.0f} (location) = {best_match['total_score']:.0f}")
            if best_match.get('wikidata_location'):
                print(f"     Location: {best_match['wikidata_location']}")
            if best_match.get('founded'):
                print(f"     Founded: {best_match['founded']}")
            matches.append(best_match)
        else:
            print(f"  ❌ No match found (best score: {best_score:.0f})")

        print()

    return matches


def main():
    """Main execution: load dataset, query Wikidata, match, save results."""
    print("=" * 80)
    print("CHILEAN LIBRARIES WIKIDATA QUERY - BATCH 12")
    print("=" * 80)
    print()

    # Load institutions
    print(f"Loading dataset: {INPUT_FILE}")
    institutions = load_institutions()
    print(f"  Loaded {len(institutions)} institutions")
    print()

    # Get libraries without Wikidata
    search_libraries = get_libraries_without_wikidata(institutions)
    print(f"Target: {len(search_libraries)} libraries without Wikidata")
    print("Goal: Find 3+ matches to reach 70% coverage (63/90)")
    print()

    # Query Wikidata
    wikidata_libraries = query_chilean_libraries()

    if not wikidata_libraries:
        print("❌ No results from Wikidata")
        return

    # Fuzzy match
    matches = fuzzy_match_libraries(search_libraries, wikidata_libraries)

    # Save results
    output = {
        "batch": 12,
        "query_date": time.strftime("%Y-%m-%d"),
        "institution_type": "LIBRARY",
        "total_searched": len(search_libraries),
        "matches_found": len(matches),
        "matches": matches
    }

    output_file = "scripts/batch12_library_query_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Wikidata libraries found: {len(wikidata_libraries)}")
    print(f"Matches found: {len(matches)}")
    # BUGFIX: guard against ZeroDivisionError when nothing was searched
    if search_libraries:
        print(f"Match rate: {len(matches)/len(search_libraries)*100:.1f}%")
    print()
    print(f"✅ Results saved to: {output_file}")
    print()

    if len(matches) >= 3:
        print(f"🎯 SUCCESS! Found {len(matches)} matches - enough to reach 70% target")
    else:
        print(f"⚠️ Only found {len(matches)} matches - need {3 - len(matches)} more for 70% target")

    print()
    print("Next step: Manual validation with scripts/finalize_batch12.py")


if __name__ == "__main__":
    main()