#!/usr/bin/env python3
"""
Query Wikidata for Chilean Museums - Batch 11 (Final Museum Push)
=================================================================
Target: 13 remaining museums needing Wikidata enrichment
Strategy: Direct SPARQL queries with fuzzy matching
Goal: Add 5-8 museums → reach 60-63/90 (67-70% overall coverage)
"""

import json
from datetime import date
from pathlib import Path
from time import sleep

import requests
from rapidfuzz import fuzz

# Minimum (possibly location-boosted) similarity for a candidate to be accepted.
MATCH_THRESHOLD = 0.80
# Bonus added to the name similarity when the Wikidata location label contains
# the target's city or region.
LOCATION_BOOST = 0.1
# Coverage bookkeeping used for the projected-coverage figures in the report.
BASELINE_MATCHED = 55
TOTAL_INSTITUTIONS = 90
# Number of new matches at which the summary reports success / good progress.
SUCCESS_MATCHES = 8
GOOD_PROGRESS_MATCHES = 5
# Where the JSON report is written (relative to the current working directory).
OUTPUT_PATH = Path('scripts/batch11_query_results.json')

# Museums without Wikidata from analysis
TARGET_MUSEUMS = [
    {
        "name": "Museo de Tocopilla",
        "city": "María Elena",
        "region": "Tocopilla",
        "search_terms": ["Museo de Tocopilla", "Museo Histórico Tocopilla"]
    },
    {
        "name": "Museo Rodulfo Philippi",
        "city": "Chañaral",
        "region": "Chañaral",
        "search_terms": ["Museo Rodulfo Philippi", "Museo Rudolf Philippi"]
    },
    {
        "name": "Museo del Libro del Mar",
        "city": "San Antonio",
        "region": "San Antonio",
        "search_terms": ["Museo del Libro del Mar", "Museo Libro Mar"]
    },
    {
        "name": "Museo de Historia Local Los Perales",
        "city": "Quilpué",
        "region": "Marga Marga",
        "search_terms": ["Museo Historia Local Los Perales", "Museo Quilpué"]
    },
    {
        "name": "Museo Histórico-Arqueológico",
        "city": "Quillota",
        "region": "Quillota",
        "search_terms": ["Museo Histórico Arqueológico Quillota", "Museo Quillota"]
    },
    {
        "name": "Museo Histórico y Cultural",
        "city": "Cauquenes",
        "region": "Cauquenes",
        "search_terms": ["Museo Histórico Cultural Cauquenes", "Museo Cauquenes"]
    },
    {
        "name": "Museo Mapuche de Purén",
        "city": "Capitán Pastene",
        "region": "Malleco",
        "search_terms": ["Museo Mapuche Purén", "Museo Capitán Pastene"]
    },
    {
        "name": "Museo Rudolph Philippi",
        "city": "Valdivia",
        "region": "Valdivia",
        "search_terms": ["Museo Rudolph Philippi", "Museo Rudolf Philippi Valdivia"]
    },
    {
        "name": "Museo de las Iglesias",
        "city": "Castro",
        "region": "Chiloé",
        "search_terms": ["Museo Iglesias Chiloé", "Museo Castro"]
    },
    {
        "name": "Museo Pleistocénico",
        "city": "Osorno",
        "region": "Osorno",
        "search_terms": ["Museo Pleistocénico", "Museo Pleistocene Osorno"]
    },
    {
        "name": "Red de Museos Aysén",
        "city": "Coyhaique",
        "region": "Aisén",
        "search_terms": ["Red Museos Aysén", "Museo Regional Aysén"]
    },
    {
        "name": "Museo Territorial Yagan Usi",
        "city": "Cabo de Hornos",
        "region": "Antártica Chilena",
        "search_terms": ["Museo Yagan", "Museo Territorial Yagan"]
    },
    {
        "name": "Museo Histórico Municipal",
        "city": "Provincia de Última Esperanza",
        "region": "Última Esperanza",
        "search_terms": ["Museo Histórico Puerto Natales", "Museo Última Esperanza"]
    }
]


def query_chilean_museums():
    """
    Query all Chilean museums from Wikidata in one efficient SPARQL query.

    Returns:
        The list of SPARQL result bindings (one dict per result row), or an
        empty list when the request fails, returns a non-200 status, or the
        response body is not the expected SPARQL JSON structure.
    """
    endpoint = "https://query.wikidata.org/sparql"

    query = """
    SELECT DISTINCT ?museum ?museumLabel ?museumDescription ?location ?locationLabel ?founded ?coordinates
    WHERE {
      # Museum types
      VALUES ?museumType {
        wd:Q33506
        wd:Q207694
        wd:Q4737021
        wd:Q812979
        wd:Q10283556
        wd:Q641152
        wd:Q1124131
        wd:Q17431399
        wd:Q2772772
        wd:Q2001305
      }

      ?museum wdt:P31/wdt:P279* ?museumType .
      ?museum wdt:P17 wd:Q298 .  # Country: Chile

      OPTIONAL { ?museum wdt:P131 ?location . }
      OPTIONAL { ?museum wdt:P571 ?founded . }
      OPTIONAL { ?museum wdt:P625 ?coordinates . }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "es,en" .
      }
    }
    ORDER BY ?founded
    """

    headers = {
        "User-Agent": "GLAM-Extractor/1.0 (Chilean Heritage Enrichment)",
        "Accept": "application/sparql-results+json"
    }

    print("🔍 Querying Wikidata for all Chilean museums...")
    try:
        response = requests.get(
            endpoint,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60
        )
    except requests.RequestException as e:
        # Connection problems, timeouts, invalid URLs, etc.
        print(f"❌ Query error: {e}")
        return []

    if response.status_code != 200:
        print(f"❌ Query failed with status {response.status_code}")
        return []

    try:
        bindings = response.json()['results']['bindings']
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or not shaped like a SPARQL result document.
        print(f"❌ Query error: {e}")
        return []

    print(f"✅ Retrieved {len(bindings)} museums from Wikidata")
    return bindings


def _name_similarity(search_term, wd_name):
    """Return the best of three rapidfuzz ratios (0.0-1.0) for two lowercased names."""
    return max(
        fuzz.ratio(search_term, wd_name),
        fuzz.partial_ratio(search_term, wd_name),
        fuzz.token_sort_ratio(search_term, wd_name),
    ) / 100


def fuzzy_match_museum(target_museum, wikidata_museums):
    """
    Find best match for target museum in Wikidata results.

    Args:
        target_museum: dict with 'city', 'region' and 'search_terms' keys
            (see TARGET_MUSEUMS).
        wikidata_museums: SPARQL result bindings as returned by
            query_chilean_museums().

    Returns:
        A dict describing the highest-scoring candidate whose score reaches
        MATCH_THRESHOLD (keys: q_number, wikidata_name, location, description,
        match_score, founded, coordinates), or None when no candidate
        qualifies. Because of the location boost, match_score can exceed 1.0.
    """
    best_match = None
    best_score = 0.0

    # Lowercase the loop-invariant target strings once.
    city = target_museum['city'].lower()
    region = target_museum['region'].lower()
    search_terms = [term.lower() for term in target_museum['search_terms']]

    for wd_museum in wikidata_museums:
        wd_name = wd_museum.get('museumLabel', {}).get('value', '')
        wd_location = wd_museum.get('locationLabel', {}).get('value', '')
        wd_name_lower = wd_name.lower()
        wd_location_lower = wd_location.lower()

        # An empty city/region must not count: "" is a substring of anything.
        location_matches = (
            (bool(city) and city in wd_location_lower)
            or (bool(region) and region in wd_location_lower)
        )

        # Try matching against all search terms
        for search_term in search_terms:
            score = _name_similarity(search_term, wd_name_lower)

            # Boost score if location matches
            if location_matches:
                score += LOCATION_BOOST

            if score > best_score and score >= MATCH_THRESHOLD:
                best_score = score
                # The entity URI ends with the Q-number.
                q_number = wd_museum['museum']['value'].split('/')[-1]
                best_match = {
                    'q_number': q_number,
                    'wikidata_name': wd_name,
                    'location': wd_location,
                    'description': wd_museum.get('museumDescription', {}).get('value', ''),
                    'match_score': score,
                    'founded': wd_museum.get('founded', {}).get('value', 'Unknown'),
                    'coordinates': wd_museum.get('coordinates', {}).get('value', 'Unknown')
                }

    return best_match


def main():
    """
    Run the batch: query Wikidata, match every target museum, print a report
    and write the results to OUTPUT_PATH as JSON.
    """
    print("="*80)
    print("CHILEAN MUSEUMS WIKIDATA QUERY - BATCH 11")
    print("="*80)
    print(f"Target: {len(TARGET_MUSEUMS)} museums")
    print(f"Strategy: Single SPARQL query + fuzzy matching (threshold: {MATCH_THRESHOLD:.2f})")
    print("="*80)

    # Get all Chilean museums from Wikidata
    wikidata_museums = query_chilean_museums()

    if not wikidata_museums:
        print("❌ Failed to retrieve museums from Wikidata")
        return

    print("\n" + "="*80)
    print("MATCHING MUSEUMS")
    print("="*80)

    matches = []
    no_matches = []

    for target in TARGET_MUSEUMS:
        print(f"\n🔍 Searching: {target['name']} ({target['city']}, {target['region']})")

        match = fuzzy_match_museum(target, wikidata_museums)

        if match:
            print(f" ✅ MATCH: {match['wikidata_name']} ({match['q_number']})")
            print(f" 📍 Location: {match['location']}")
            print(f" 📊 Score: {match['match_score']:.2f}")
            matches.append({
                'museum': target,
                'match': match
            })
        else:
            print(f" ❌ No match found (threshold: {MATCH_THRESHOLD:.2f})")
            no_matches.append(target)

    # Summary
    print("\n" + "="*80)
    print("BATCH 11 QUERY SUMMARY")
    print("="*80)
    print(f"✅ Matches found: {len(matches)}/{len(TARGET_MUSEUMS)}")
    print(f"❌ No matches: {len(no_matches)}/{len(TARGET_MUSEUMS)}")

    if matches:
        print("\n" + "-"*80)
        print("MATCHED MUSEUMS:")
        print("-"*80)
        for item in matches:
            print(f"{item['museum']['name']}")
            print(f" → {item['match']['wikidata_name']} ({item['match']['q_number']})")
            print(f" Score: {item['match']['match_score']:.2f}")

    if no_matches:
        print("\n" + "-"*80)
        print("MUSEUMS WITHOUT WIKIDATA MATCHES:")
        print("-"*80)
        for museum in no_matches:
            print(f" • {museum['name']} ({museum['city']}, {museum['region']})")

    # Projected coverage if every match found here is accepted.
    projected = BASELINE_MATCHED + len(matches)
    projected_percent = f"{(projected / TOTAL_INSTITUTIONS * 100):.1f}%"

    # Save results
    output = {
        'batch': 11,
        # Record the day the query actually ran rather than a fixed date.
        'query_date': date.today().isoformat(),
        'matches': matches,
        'no_matches': no_matches,
        'match_rate': f"{len(matches)}/{len(TARGET_MUSEUMS)}",
        'expected_coverage': f"{projected}/{TOTAL_INSTITUTIONS}",
        'expected_coverage_percent': projected_percent
    }

    # Make sure the target directory exists so the script works from any cwd.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Results saved to: {OUTPUT_PATH.as_posix()}")
    print(f"\n📊 Projected coverage: {projected}/{TOTAL_INSTITUTIONS} = {projected_percent}")

    if len(matches) >= SUCCESS_MATCHES:
        print("🎯 SUCCESS! Reached 70% coverage target!")
    elif len(matches) >= GOOD_PROGRESS_MATCHES:
        print("✅ Good progress toward 70% coverage goal")
    else:
        print("⚠️ May need additional enrichment strategies")


if __name__ == "__main__":
    main()