#!/usr/bin/env python3
"""
Query Wikidata for Chilean GLAM Institutions - Batch 10 (Priority Targets)
Focus: Official institutions, research centers, and mixed/cultural centers

Target institutions:
- Servicio Nacional del Patrimonio Cultural (official)
- Fundación Buen Pastor (research)
- Fundación Iglesias Patrimoniales (research)
- Instituto Alemán Puerto Montt (mixed)
- Centro Cultural Sofia Hott (mixed)
- Centro de Interpretación Histórica (mixed)
"""

import json
import time
from pathlib import Path

import requests
from rapidfuzz import fuzz

# Wikidata SPARQL endpoint
ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAMDataExtractor/1.0 (https://github.com/yourusername/glam; your@email.com)"

# Seconds to wait for the endpoint before giving up on a request.
REQUEST_TIMEOUT = 60
# Minimum fuzzy score (0-100) for a candidate to be reported as a match.
MATCH_THRESHOLD = 70
# Pause between consecutive SPARQL requests.
RATE_LIMIT_SECONDS = 2
# Where the collected matches are written as JSON.
OUTPUT_FILE = Path('data/instances/chile/wikidata_matches_batch10_priority.json')


def query_wikidata(sparql_query: str, timeout: float = REQUEST_TIMEOUT) -> list:
    """Execute a SPARQL query against Wikidata and return its result bindings.

    Args:
        sparql_query: The SPARQL query text.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the script indefinitely.

    Returns:
        The list stored under ``results.bindings`` in the JSON response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
        KeyError: If the response JSON lacks ``results`` / ``bindings``.
    """
    headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'application/sparql-results+json',
    }
    params = {
        'query': sparql_query,
        'format': 'json',
    }
    response = requests.get(
        ENDPOINT, params=params, headers=headers, timeout=timeout
    )
    response.raise_for_status()
    data = response.json()
    return data['results']['bindings']


def extract_qid(uri: str) -> str:
    """Return the last ``/``-separated segment of a Wikidata URI (the Q-number)."""
    return uri.rsplit('/', 1)[-1]


def query_chilean_official_institutions():
    """Query for Chilean government cultural/heritage agencies."""
    # NOTE(review): wdt:P2578 looks like an item-valued property; if so,
    # LCASE(?mission) on an IRI errors and that UNION branch matches nothing.
    # Confirm the intended property before relying on it.
    query = """
    SELECT DISTINCT ?org ?orgLabel ?typeLabel ?websiteLabel ?viafID WHERE {
      # Chilean government organizations related to culture/heritage
      ?org wdt:P31 ?type .
      ?org wdt:P17 wd:Q298 .  # Country: Chile

      # Types: government agency, ministry, public service
      VALUES ?type {
        wd:Q327333    # government agency
        wd:Q192350    # government organization
        wd:Q2659904   # government institution
        wd:Q294414    # public service
      }

      # Related to culture/heritage/museums/archives
      {
        ?org wdt:P2578 ?mission .
        FILTER(CONTAINS(LCASE(?mission), "cultura") ||
               CONTAINS(LCASE(?mission), "patrimonio") ||
               CONTAINS(LCASE(?mission), "museo") ||
               CONTAINS(LCASE(?mission), "archivo"))
      } UNION {
        ?org rdfs:label ?label .
        FILTER(LANG(?label) = "es")
        FILTER(CONTAINS(LCASE(?label), "cultura") ||
               CONTAINS(LCASE(?label), "patrimonio") ||
               CONTAINS(LCASE(?label), "museo") ||
               CONTAINS(LCASE(?label), "archivo"))
      }

      OPTIONAL { ?org wdt:P856 ?website }
      OPTIONAL { ?org wdt:P214 ?viafID }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en". }
    }
    LIMIT 20
    """
    return query_wikidata(query)


def query_chilean_foundations():
    """Query for Chilean cultural foundations."""
    query = """
    SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
      # Chilean foundations
      ?org wdt:P31/wdt:P279* wd:Q157031 .  # foundation
      ?org wdt:P17 wd:Q298 .  # Country: Chile

      # Related to culture/heritage
      {
        ?org rdfs:label ?label .
        FILTER(LANG(?label) = "es")
        FILTER(CONTAINS(LCASE(?label), "pastor") ||
               CONTAINS(LCASE(?label), "iglesia") ||
               CONTAINS(LCASE(?label), "patrimonial") ||
               CONTAINS(LCASE(?label), "cultura"))
      } UNION {
        ?org wdt:P2578 ?mission .
        FILTER(CONTAINS(LCASE(?mission), "cultura") ||
               CONTAINS(LCASE(?mission), "patrimonio"))
      }

      OPTIONAL { ?org wdt:P131 ?location }
      OPTIONAL { ?org wdt:P856 ?website }
      OPTIONAL { ?org wdt:P214 ?viafID }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en". }
    }
    LIMIT 20
    """
    return query_wikidata(query)


def query_chilean_cultural_centers():
    """Query for Chilean cultural centers and interpretation centers."""
    query = """
    SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
      ?org wdt:P31 ?type .
      ?org wdt:P17 wd:Q298 .  # Country: Chile

      # Types: cultural center, interpretation center
      VALUES ?type {
        wd:Q2334061   # cultural center
        wd:Q2095      # educational institution (covers Instituto Alemán)
      }

      OPTIONAL { ?org wdt:P131 ?location }
      OPTIONAL { ?org wdt:P856 ?website }
      OPTIONAL { ?org wdt:P214 ?viafID }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,de". }
    }
    LIMIT 30
    """
    return query_wikidata(query)


def query_german_institutes_chile():
    """Query specifically for German institutes in Chile."""
    query = """
    SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
      ?org wdt:P31 ?type .
      ?org wdt:P17 wd:Q298 .  # Country: Chile

      # Educational or cultural institution
      VALUES ?type {
        wd:Q2095      # educational institution
        wd:Q2385804   # educational organization
        wd:Q31855     # research institute
      }

      # German connection
      {
        ?org rdfs:label ?label .
        FILTER(CONTAINS(LCASE(?label), "alemán") ||
               CONTAINS(LCASE(?label), "aleman") ||
               CONTAINS(LCASE(?label), "german") ||
               CONTAINS(LCASE(?label), "deutsch"))
      }

      OPTIONAL { ?org wdt:P131 ?location }
      OPTIONAL { ?org wdt:P856 ?website }
      OPTIONAL { ?org wdt:P214 ?viafID }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,de". }
    }
    LIMIT 20
    """
    return query_wikidata(query)


def fuzzy_match_institutions(
    wikidata_results: list,
    target_names: list,
    threshold: float = MATCH_THRESHOLD,
) -> list:
    """Fuzzy match Wikidata results against target institution names.

    Each result/target pair is scored with rapidfuzz ``ratio``,
    ``partial_ratio`` and ``token_sort_ratio`` on lower-cased names; the pair
    is kept when the best of the three reaches ``threshold``.

    Args:
        wikidata_results: SPARQL bindings; each must contain ``org`` and may
            contain ``orgLabel``, ``locationLabel``, ``websiteLabel`` and
            ``viafID`` (each a dict with a ``value`` key).
        target_names: Institution names to look for.
        threshold: Minimum best score for a pair to be reported.

    Returns:
        Match dicts sorted by ``match_score``, highest first.
    """
    # Lower-case the targets once instead of once per result and scorer.
    lowered_targets = [(name, name.lower()) for name in target_names]

    matches = []
    for result in wikidata_results:
        wd_name = result.get('orgLabel', {}).get('value', '')
        wd_qid = extract_qid(result['org']['value'])
        wd_location = result.get('locationLabel', {}).get('value', 'Unknown')
        wd_website = result.get('websiteLabel', {}).get('value', None)
        wd_viaf = result.get('viafID', {}).get('value', None)
        wd_name_lower = wd_name.lower()

        # Match against targets
        for target, target_lower in lowered_targets:
            score = fuzz.ratio(target_lower, wd_name_lower)
            partial_score = fuzz.partial_ratio(target_lower, wd_name_lower)
            token_score = fuzz.token_sort_ratio(target_lower, wd_name_lower)
            max_score = max(score, partial_score, token_score)

            # Deliberately low default threshold: this pass is for discovery,
            # and the matches are reviewed by hand afterwards.
            if max_score >= threshold:
                matches.append({
                    'target_name': target,
                    'wikidata_name': wd_name,
                    'q_number': wd_qid,
                    'location': wd_location,
                    'website': wd_website,
                    'viaf': wd_viaf,
                    'match_score': max_score,
                    'match_type': 'fuzzy',
                    'scores': {
                        'ratio': score,
                        'partial': partial_score,
                        'token': token_score,
                    },
                })

    # Sort by score
    matches.sort(key=lambda x: x['match_score'], reverse=True)
    return matches


def _collect_matches(label, query_fn, target_names, query_type) -> list:
    """Run one query stage and return its matches tagged with ``query_type``."""
    print(f"🔍 Querying {label}...")
    try:
        results = query_fn()
        print(f" Found {len(results)} {label}")
        matches = fuzzy_match_institutions(results, target_names)
    except Exception as e:
        # Best-effort boundary: one failing stage must not abort the others.
        print(f" ❌ Error: {e}")
        return []
    return [{**m, 'query_type': query_type} for m in matches]


def _print_summary(all_results: list) -> None:
    """Print the match count and the ten best matches across all stages."""
    print("=" * 80)
    print("QUERY SUMMARY")
    print("=" * 80)
    print()
    print(f"Total potential matches: {len(all_results)}")

    if not all_results:
        print(f"⚠️ No matches found above threshold ({MATCH_THRESHOLD}%)")
        return

    # Per-stage lists are sorted individually; rank them together so the
    # "top matches" really are the highest-scoring ones overall.
    ranked = sorted(all_results, key=lambda m: m['match_score'], reverse=True)

    print()
    print("Top matches by score:")
    for i, match in enumerate(ranked[:10], 1):
        print(f"{i:2d}. {match['target_name']}")
        print(f" → {match['wikidata_name']} ({match['q_number']})")
        print(f" Score: {match['match_score']:.1f}% | Type: {match['query_type']}")
        print(f" Location: {match['location']}")
        if match.get('website'):
            print(f" Website: {match['website']}")
        print()


def main():
    """Run every query stage, save the matches as JSON, and print a summary."""
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 10 WIKIDATA QUERY")
    print("Target: Official institutions, research centers, mixed institutions")
    print("=" * 80)
    print()

    # Target institutions from our dataset
    targets = {
        'official': ['Servicio Nacional del Patrimonio Cultural'],
        'research': ['Fundación Buen Pastor', 'Fundación Iglesias Patrimoniales'],
        'mixed': [
            'Instituto Alemán Puerto Montt',
            'Centro Cultural Sofia Hott',
            'Centro de Interpretación Histórica',
        ],
    }

    # (label used in progress messages, query function, targets, query_type)
    stages = [
        ('official institutions', query_chilean_official_institutions,
         targets['official'], 'official'),
        ('foundations', query_chilean_foundations,
         targets['research'], 'foundation'),
        ('cultural centers', query_chilean_cultural_centers,
         targets['mixed'], 'cultural_center'),
        ('German institutes', query_german_institutes_chile,
         ['Instituto Alemán Puerto Montt'], 'german_institute'),
    ]

    all_results = []
    for index, (label, query_fn, names, query_type) in enumerate(stages):
        if index:
            # Rate limiting between requests, applied even when the previous
            # stage failed (a failure may itself be a throttling response).
            time.sleep(RATE_LIMIT_SECONDS)
        all_results.extend(_collect_matches(label, query_fn, names, query_type))
        print()

    # Save results
    output_file = OUTPUT_FILE
    print(f"💾 Saving results to: {output_file}")
    # Create the directory first so a missing path cannot discard the results
    # after all queries have already run.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print()

    # Summary
    _print_summary(all_results)

    print("🎯 Next step: Review matches and create enrich_chilean_batch10.py")


if __name__ == '__main__':
    main()