#!/usr/bin/env python3
"""
Chilean Batch 13: Manual Wikidata Search for High-Priority Institutions

Target: 3 specific institutions to reach 70% coverage (63/90)

Focus institutions:
1. Museo de las Iglesias (Castro, Chiloé) - UNESCO connection
2. Museo del Libro del Mar (San Antonio) - Unique maritime museum
3. Archivo General de Asuntos Indígenas (CONADI, Temuco) - Government archive

Strategy: Exact name matching, no fuzzy matching, manual verification required.
"""

import json
from pathlib import Path
from typing import Dict, List, Optional, Any

from SPARQLWrapper import SPARQLWrapper, JSON

WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
OUTPUT_FILE = 'scripts/batch13_manual_search_results.json'

# Variables projected by every query strategy; main() reads these names.
_SELECT_VARS = (
    "?item ?itemLabel ?itemDescription ?location ?locationLabel "
    "?instanceOf ?instanceOfLabel"
)
_LABEL_SERVICE = (
    'SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" }'
)


def _escape_sparql_literal(text: str) -> str:
    """Escape *text* for use inside a double-quoted SPARQL string literal."""
    # Backslash must be handled first so later escapes are not doubled.
    return (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )


def _location_clause(location_city: str, region: str) -> str:
    """Build the P131 pattern for the type-based strategy.

    When a city and/or region is given, the item must be located in an
    administrative entity whose label contains one of them. With neither,
    the location stays optional (no restriction).
    """
    # dict.fromkeys drops duplicates (city == region) while keeping order.
    places = [p for p in dict.fromkeys((location_city, region)) if p]
    if not places:
        return "OPTIONAL { ?item wdt:P131 ?location }"
    conditions = " || ".join(
        f'CONTAINS(LCASE(STR(?placeName)), LCASE("{_escape_sparql_literal(p)}"))'
        for p in places
    )
    return (
        "?item wdt:P131 ?location .\n"
        "          ?location rdfs:label ?placeName .\n"
        f"          FILTER({conditions})"
    )


def _build_queries(institution_name: str, location_city: str, region: str) -> List[str]:
    """Return the SPARQL queries to try, in priority order."""
    name = _escape_sparql_literal(institution_name)

    queries = [
        # Strategy 1: Exact Spanish name
        f"""
        SELECT DISTINCT {_SELECT_VARS}
        WHERE {{
          ?item rdfs:label "{name}"@es .
          ?item wdt:P31 ?instanceOf .
          OPTIONAL {{ ?item wdt:P131 ?location }}
          {_LABEL_SERVICE}
        }}
        LIMIT 10
        """,
        # Strategy 2: Contains name search, restricted to Chile
        f"""
        SELECT DISTINCT {_SELECT_VARS}
        WHERE {{
          ?item rdfs:label ?label .
          FILTER(CONTAINS(LCASE(?label), LCASE("{name}")))
          ?item wdt:P31 ?instanceOf .
          ?item wdt:P17 wd:Q298 .  # Country: Chile
          OPTIONAL {{ ?item wdt:P131 ?location }}
          {_LABEL_SERVICE}
        }}
        LIMIT 10
        """,
    ]

    # Strategy 3: Search by location and type
    if "Museo" in institution_name:
        instance_filter = "?item wdt:P31/wdt:P279* wd:Q33506 ."  # Museum
    elif "Archivo" in institution_name:
        instance_filter = "?item wdt:P31/wdt:P279* wd:Q166118 ."  # Archive
    else:
        instance_filter = ""

    if instance_filter:
        # The location clause keeps this strategy from returning arbitrary
        # Chilean museums/archives unrelated to the institution sought.
        queries.append(f"""
        SELECT DISTINCT {_SELECT_VARS}
        WHERE {{
          {instance_filter}
          ?item wdt:P17 wd:Q298 .  # Country: Chile
          {_location_clause(location_city, region)}
          ?item wdt:P31 ?instanceOf .
          {_LABEL_SERVICE}
        }}
        LIMIT 50
        """)

    return queries


def query_wikidata_exact(institution_name: str, location_city: str, region: str) -> Optional[List[Dict]]:
    """
    Query Wikidata for an institution and collect candidate matches.

    Up to three strategies are run in order: exact Spanish label, label
    containing the name (Chile only), and - for names containing "Museo" or
    "Archivo" - museums/archives in Chile located in the given city/region.

    Args:
        institution_name: Name (or search term) to look for.
        location_city: City used to restrict the type-based strategy.
        region: Region used to restrict the type-based strategy.

    Returns:
        A list of SPARQL result bindings (dicts keyed by variable name),
        with exact duplicates removed. The list is empty when nothing
        matched or every query failed; None is never returned by this
        implementation, so callers should test truthiness.
    """
    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)
    sparql.setReturnFormat(JSON)

    all_results: List[Dict] = []
    seen = set()

    queries = _build_queries(institution_name, location_city, region)
    for i, query in enumerate(queries, 1):
        print(f" Trying query strategy {i}...")
        sparql.setQuery(query)
        try:
            results: Any = sparql.query().convert()
        except Exception as e:
            # Best-effort: a failing strategy (timeout, HTTP error, bad
            # response) must not prevent the remaining ones from running.
            print(f" Query failed: {e}")
            continue

        if not isinstance(results, dict):
            continue
        bindings = results.get('results', {}).get('bindings', [])
        if not bindings:
            continue

        print(f" Found {len(bindings)} results")
        for binding in bindings:
            # Several strategies can return the very same row; keep one.
            key = tuple(sorted(
                (var, str(cell.get('value', ''))) for var, cell in binding.items()
            ))
            if key not in seen:
                seen.add(key)
                all_results.append(binding)

    return all_results


def _summarize_binding(binding: Dict, search_term: str) -> Dict[str, str]:
    """Flatten one SPARQL binding into the record stored in the JSON output."""

    def value_of(variable: str, default: str) -> str:
        return binding.get(variable, {}).get('value', default)

    return {
        # The entity URI ends in the Q-number.
        'q_number': binding['item']['value'].split('/')[-1],
        'label': value_of('itemLabel', 'No label'),
        'description': value_of('itemDescription', 'No description'),
        'location': value_of('locationLabel', 'No location'),
        'instance_of': value_of('instanceOfLabel', 'No type'),
        'search_term_used': search_term,
    }


def _search_institution(institution: Dict[str, Any]) -> Dict[str, Any]:
    """Run every search term of one institution and collect the top matches."""
    institution_results: Dict[str, Any] = {
        "metadata": institution,
        "wikidata_results": []
    }

    # Try each search term
    for search_term in institution['search_terms']:
        print(f"\nSearching for: '{search_term}'")
        results = query_wikidata_exact(search_term, institution['city'], institution['region'])

        if not results:
            print(" No results found")
            continue

        print(f"Found {len(results)} potential matches")
        for result in results[:5]:  # Show top 5
            record = _summarize_binding(result, search_term)
            print(f" {record['q_number']}: {record['label']}")
            print(f" Description: {record['description']}")
            print(f" Location: {record['location']}")
            print(f" Type: {record['instance_of']}")
            print()
            institution_results['wikidata_results'].append(record)

    return institution_results


def main():
    """Search for 3 high-priority institutions and save the candidates as JSON."""
    target_institutions = [
        {
            "name": "Museo de las Iglesias",
            "full_name": "Museo de las Iglesias de Chiloé",
            "city": "Castro",
            "region": "Chiloé",
            "rationale": "Connected to UNESCO World Heritage Site (Churches of Chiloé)",
            "search_terms": ["Museo de las Iglesias", "Museo Iglesias Chiloé", "Chiloé Churches Museum"]
        },
        {
            "name": "Museo del Libro del Mar",
            "full_name": "Museo del Libro del Mar",
            "city": "San Antonio",
            "region": "San Antonio",
            "rationale": "Unique maritime book museum, specific subject focus",
            "search_terms": ["Museo del Libro del Mar", "Museo Libro Mar San Antonio"]
        },
        {
            "name": "Archivo General de Asuntos Indígenas (CONADI)",
            "full_name": "Archivo General de Asuntos Indígenas",
            "city": "Temuco",
            "region": "Cautín",
            "rationale": "National government archive for indigenous affairs",
            "search_terms": ["Archivo General de Asuntos Indígenas", "CONADI", "Corporación Nacional de Desarrollo Indígena"]
        }
    ]

    all_search_results = {}

    print("=" * 80)
    print("Chilean Batch 13: Manual Wikidata Search")
    print("Target: 3 institutions to reach 70% coverage (63/90)")
    print("=" * 80)
    print()

    for institution in target_institutions:
        print(f"\n{'=' * 80}")
        print(f"Institution: {institution['name']}")
        print(f"Location: {institution['city']}, {institution['region']}")
        print(f"Rationale: {institution['rationale']}")
        print(f"{'=' * 80}")

        all_search_results[institution['name']] = _search_institution(institution)

    # Save results to JSON
    output_file = OUTPUT_FILE
    # Create the target directory so a run from another working directory
    # does not lose the search results to a FileNotFoundError.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_search_results, f, indent=2, ensure_ascii=False)

    print(f"\n{'=' * 80}")
    print(f"Results saved to: {output_file}")
    print("=" * 80)
    print()
    print("NEXT STEPS:")
    print("1. Review search results manually")
    print("2. Verify Q-numbers correspond to correct institutions")
    print("3. Create batch13 enrichment script for validated matches")
    print("4. Apply enrichment to reach 70% coverage target")
    print()


if __name__ == '__main__':
    main()