glam/scripts/query_wikidata_chilean_batch10.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

315 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Query Wikidata for Chilean GLAM Institutions - Batch 10 (Priority Targets)
Focus: Official institutions, research centers, and mixed/cultural centers
Target institutions:
- Servicio Nacional del Patrimonio Cultural (official)
- Fundación Buen Pastor (research)
- Fundación Iglesias Patrimoniales (research)
- Instituto Alemán Puerto Montt (mixed)
- Centro Cultural Sofia Hott (mixed)
- Centro de Interpretación Histórica (mixed)
"""
import json
import requests
import time
from pathlib import Path
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAMDataExtractor/1.0 (https://github.com/yourusername/glam; your@email.com)"
def query_wikidata(sparql_query: str) -> list:
    """Execute a SPARQL query against the Wikidata Query Service.

    Args:
        sparql_query: Full SPARQL query text.

    Returns:
        The ``results.bindings`` list from the SPARQL JSON response
        (one dict per result row); empty if the query matched nothing.

    Raises:
        requests.HTTPError: On a non-2xx response (via raise_for_status).
        requests.Timeout: If the endpoint does not answer within 60 s.
    """
    headers = {
        'User-Agent': USER_AGENT,  # WDQS policy requires an identifying UA
        'Accept': 'application/sparql-results+json'
    }
    params = {
        'query': sparql_query,
        'format': 'json'
    }
    # Fix: pass a timeout so a stalled/unreachable endpoint cannot hang
    # the whole batch script indefinitely (requests has no default timeout).
    response = requests.get(ENDPOINT, params=params, headers=headers, timeout=60)
    response.raise_for_status()
    data = response.json()
    return data['results']['bindings']
def extract_qid(uri: str) -> str:
    """Return the Q-number at the end of a Wikidata entity URI.

    E.g. ``http://www.wikidata.org/entity/Q298`` -> ``Q298``. A string
    with no slash is returned unchanged.
    """
    # Everything after the last '/' is the entity identifier; rpartition
    # yields the whole input as the tail when no separator is present.
    _, _, qid = uri.rpartition('/')
    return qid
def query_chilean_official_institutions():
    """Query Wikidata for Chilean government cultural/heritage agencies.

    Selects organizations whose country (P17) is Chile (Q298) and whose
    instance-of (P31) is one of four government-body classes, then keeps
    those whose mission statement (P2578) or Spanish label contains any
    of the keywords "cultura", "patrimonio", "museo", "archivo".
    Website (P856) and VIAF id (P214) are attached when present.

    Returns:
        Up to 20 raw SPARQL result bindings (see query_wikidata).
    """
    query = """
SELECT DISTINCT ?org ?orgLabel ?typeLabel ?websiteLabel ?viafID WHERE {
# Chilean government organizations related to culture/heritage
?org wdt:P31 ?type .
?org wdt:P17 wd:Q298 . # Country: Chile
# Types: government agency, ministry, public service
VALUES ?type {
wd:Q327333 # government agency
wd:Q192350 # government organization
wd:Q2659904 # government institution
wd:Q294414 # public service
}
# Related to culture/heritage/museums/archives
{
?org wdt:P2578 ?mission .
FILTER(CONTAINS(LCASE(?mission), "cultura") ||
CONTAINS(LCASE(?mission), "patrimonio") ||
CONTAINS(LCASE(?mission), "museo") ||
CONTAINS(LCASE(?mission), "archivo"))
} UNION {
?org rdfs:label ?label .
FILTER(LANG(?label) = "es")
FILTER(CONTAINS(LCASE(?label), "cultura") ||
CONTAINS(LCASE(?label), "patrimonio") ||
CONTAINS(LCASE(?label), "museo") ||
CONTAINS(LCASE(?label), "archivo"))
}
OPTIONAL { ?org wdt:P856 ?website }
OPTIONAL { ?org wdt:P214 ?viafID }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en". }
}
LIMIT 20
"""
    return query_wikidata(query)
def query_chilean_foundations():
    """Query Wikidata for Chilean cultural foundations.

    Selects organizations that are a foundation (P31 / subclass-of*
    Q157031) in Chile (P17 = Q298), keeping those whose Spanish label
    contains "pastor" / "iglesia" / "patrimonial" / "cultura" (targeting
    Fundación Buen Pastor and Fundación Iglesias Patrimoniales) or whose
    mission (P2578) mentions "cultura" / "patrimonio".  Location (P131),
    website (P856) and VIAF id (P214) are attached when present.

    Returns:
        Up to 20 raw SPARQL result bindings (see query_wikidata).
    """
    query = """
SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
# Chilean foundations
?org wdt:P31/wdt:P279* wd:Q157031 . # foundation
?org wdt:P17 wd:Q298 . # Country: Chile
# Related to culture/heritage
{
?org rdfs:label ?label .
FILTER(LANG(?label) = "es")
FILTER(CONTAINS(LCASE(?label), "pastor") ||
CONTAINS(LCASE(?label), "iglesia") ||
CONTAINS(LCASE(?label), "patrimonial") ||
CONTAINS(LCASE(?label), "cultura"))
} UNION {
?org wdt:P2578 ?mission .
FILTER(CONTAINS(LCASE(?mission), "cultura") ||
CONTAINS(LCASE(?mission), "patrimonio"))
}
OPTIONAL { ?org wdt:P131 ?location }
OPTIONAL { ?org wdt:P856 ?website }
OPTIONAL { ?org wdt:P214 ?viafID }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en". }
}
LIMIT 20
"""
    return query_wikidata(query)
def query_chilean_cultural_centers():
    """Query Wikidata for Chilean cultural centers and similar institutions.

    Selects organizations in Chile (P17 = Q298) whose instance-of (P31)
    is cultural center (Q2334061) or educational institution (Q2095, to
    cover Instituto Alemán).  No keyword filter is applied here — the
    fuzzy matcher narrows results afterwards.  Location (P131), website
    (P856) and VIAF id (P214) are attached when present.

    Returns:
        Up to 30 raw SPARQL result bindings (see query_wikidata).
    """
    query = """
SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
?org wdt:P31 ?type .
?org wdt:P17 wd:Q298 . # Country: Chile
# Types: cultural center, interpretation center
VALUES ?type {
wd:Q2334061 # cultural center
wd:Q2095 # educational institution (covers Instituto Alemán)
}
OPTIONAL { ?org wdt:P131 ?location }
OPTIONAL { ?org wdt:P856 ?website }
OPTIONAL { ?org wdt:P214 ?viafID }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,de". }
}
LIMIT 30
"""
    return query_wikidata(query)
def query_german_institutes_chile():
    """Query Wikidata specifically for German institutes in Chile.

    Selects educational/research organizations (P31 in Q2095, Q2385804,
    Q31855) in Chile (P17 = Q298) whose label contains a German marker
    ("alemán", "aleman", "german", "deutsch").  Location (P131), website
    (P856) and VIAF id (P214) are attached when present.

    NOTE(review): the label FILTER has no LANG restriction, so labels in
    any language are matched — presumably intentional to catch German
    names; confirm.

    Returns:
        Up to 20 raw SPARQL result bindings (see query_wikidata).
    """
    query = """
SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
?org wdt:P31 ?type .
?org wdt:P17 wd:Q298 . # Country: Chile
# Educational or cultural institution
VALUES ?type {
wd:Q2095 # educational institution
wd:Q2385804 # educational organization
wd:Q31855 # research institute
}
# German connection
{
?org rdfs:label ?label .
FILTER(CONTAINS(LCASE(?label), "alemán") ||
CONTAINS(LCASE(?label), "aleman") ||
CONTAINS(LCASE(?label), "german") ||
CONTAINS(LCASE(?label), "deutsch"))
}
OPTIONAL { ?org wdt:P131 ?location }
OPTIONAL { ?org wdt:P856 ?website }
OPTIONAL { ?org wdt:P214 ?viafID }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,de". }
}
LIMIT 20
"""
    return query_wikidata(query)
def fuzzy_match_institutions(wikidata_results: list, target_names: list,
                             threshold: int = 70) -> list:
    """Fuzzy match Wikidata results against target institution names.

    Each binding's label is compared to every target name with three
    rapidfuzz scorers (ratio, partial_ratio, token_sort_ratio); the best
    of the three becomes the match score.  Pairs scoring at or above
    ``threshold`` are kept.

    Args:
        wikidata_results: SPARQL result bindings as returned by
            query_wikidata (dicts with 'org', 'orgLabel', optional
            'locationLabel', 'websiteLabel', 'viafID').
        target_names: Institution names to look for.
        threshold: Minimum best score (0-100) to keep a pair.  Defaults
            to 70 — deliberately low, for discovery (previously a
            hard-coded constant; parameterized for reuse).

    Returns:
        Match dicts sorted by 'match_score' descending.
    """
    matches = []
    for result in wikidata_results:
        wd_name = result.get('orgLabel', {}).get('value', '')
        wd_qid = extract_qid(result['org']['value'])
        wd_location = result.get('locationLabel', {}).get('value', 'Unknown')
        wd_website = result.get('websiteLabel', {}).get('value', None)
        wd_viaf = result.get('viafID', {}).get('value', None)
        # Hoist the loop-invariant lowercasing out of the inner loop.
        wd_name_lower = wd_name.lower()
        # Match against targets
        for target in target_names:
            target_lower = target.lower()
            score = fuzz.ratio(target_lower, wd_name_lower)
            partial_score = fuzz.partial_ratio(target_lower, wd_name_lower)
            token_score = fuzz.token_sort_ratio(target_lower, wd_name_lower)
            max_score = max(score, partial_score, token_score)
            if max_score >= threshold:
                matches.append({
                    'target_name': target,
                    'wikidata_name': wd_name,
                    'q_number': wd_qid,
                    'location': wd_location,
                    'website': wd_website,
                    'viaf': wd_viaf,
                    'match_score': max_score,
                    'match_type': 'fuzzy',
                    'scores': {
                        'ratio': score,
                        'partial': partial_score,
                        'token': token_score
                    }
                })
    # Sort by score, best match first
    matches.sort(key=lambda x: x['match_score'], reverse=True)
    return matches
def main():
    """Run the four batch-10 Wikidata queries, match, save, and report.

    Queries official institutions, foundations, cultural centers and
    German institutes in Chile; fuzzy-matches the bindings against the
    batch-10 target names; writes combined matches to
    ``data/instances/chile/wikidata_matches_batch10_priority.json``;
    prints a summary of the top matches.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 10 WIKIDATA QUERY")
    print("Target: Official institutions, research centers, mixed institutions")
    print("=" * 80)
    print()
    # Target institutions from our dataset
    targets = {
        'official': ['Servicio Nacional del Patrimonio Cultural'],
        'research': ['Fundación Buen Pastor', 'Fundación Iglesias Patrimoniales'],
        'mixed': [
            'Instituto Alemán Puerto Montt',
            'Centro Cultural Sofia Hott',
            'Centro de Interpretación Histórica'
        ]
    }
    all_results = []
    # (progress label, query function, target names, query_type tag).
    # Running the four queries through one loop replaces four copy-pasted
    # try/except stanzas; output strings are unchanged.
    query_plan = [
        ('official institutions', query_chilean_official_institutions,
         targets['official'], 'official'),
        ('foundations', query_chilean_foundations,
         targets['research'], 'foundation'),
        ('cultural centers', query_chilean_cultural_centers,
         targets['mixed'], 'cultural_center'),
        ('German institutes', query_german_institutes_chile,
         ['Instituto Alemán Puerto Montt'], 'german_institute'),
    ]
    for label, query_fn, target_list, tag in query_plan:
        print(f"🔍 Querying {label}...")
        try:
            results = query_fn()
            print(f" Found {len(results)} {label}")
            matches = fuzzy_match_institutions(results, target_list)
            all_results.extend([{**m, 'query_type': tag} for m in matches])
            time.sleep(2)  # Rate limiting: be polite to the WDQS endpoint
        except Exception as e:
            # Best-effort: report the failure and continue with the
            # remaining queries rather than aborting the whole batch.
            print(f" ❌ Error: {e}")
        print()
    # Save results
    output_file = Path('data/instances/chile/wikidata_matches_batch10_priority.json')
    # Fix: create the output directory if missing, so the script does not
    # crash with FileNotFoundError on a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"💾 Saving results to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print()
    # Summary
    print("=" * 80)
    print("QUERY SUMMARY")
    print("=" * 80)
    print()
    print(f"Total potential matches: {len(all_results)}")
    if all_results:
        print()
        print("Top matches by score:")
        for i, match in enumerate(all_results[:10], 1):
            print(f"{i:2d}. {match['target_name']}")
            print(f"{match['wikidata_name']} ({match['q_number']})")
            print(f" Score: {match['match_score']:.1f}% | Type: {match['query_type']}")
            print(f" Location: {match['location']}")
            if match.get('website'):
                print(f" Website: {match['website']}")
            print()
    else:
        print("⚠️ No matches found above threshold (70%)")
    print("🎯 Next step: Review matches and create enrich_chilean_batch10.py")
# Run the batch query pipeline only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()