glam/scripts/query_wikidata_libraries_batch12.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

256 lines
8.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Query Wikidata for Chilean libraries to reach 70% coverage target - Batch 12.
Target: 7 libraries without Wikidata identifiers
Strategy: Bulk SPARQL query + fuzzy matching (threshold 75+)
Goal: Find 3+ matches to reach 63/90 (70.0%)
"""
import json
import time
import yaml
from typing import List, Dict, Any
from pathlib import Path
from SPARQLWrapper import SPARQLWrapper, JSON
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# Input dataset
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch11_enriched.yaml")
def load_institutions() -> List[Dict[str, Any]]:
    """Read and parse the Chilean institutions dataset from INPUT_FILE (YAML)."""
    raw_text = INPUT_FILE.read_text(encoding='utf-8')
    return yaml.safe_load(raw_text)
def get_libraries_without_wikidata(institutions: List[Dict]) -> List[Dict]:
    """Return name/city records for LIBRARY institutions lacking a Wikidata id.

    Args:
        institutions: Institution dicts as loaded from the batch YAML dataset.
            Each may carry 'institution_type', 'name', 'identifiers'
            (list of dicts with 'identifier_scheme') and 'locations'
            (list of dicts with 'city').

    Returns:
        A list of {'name': str, 'city': str} dicts, one per library that has
        no identifier with scheme 'Wikidata'. Missing names and cities fall
        back to 'Unknown'.
    """
    libraries = []
    for inst in institutions:
        if inst.get('institution_type') != 'LIBRARY':
            continue
        # Skip libraries that already carry a Wikidata identifier.
        has_wikidata = any(
            i.get('identifier_scheme') == 'Wikidata'
            for i in inst.get('identifiers', [])
        )
        if has_wikidata:
            continue
        name = inst.get('name', 'Unknown')
        # Bug fix: the original used inst.get('locations', [{}])[0], which
        # raised IndexError when 'locations' was present but an empty list.
        locations = inst.get('locations') or [{}]
        city = locations[0].get('city', 'Unknown')
        libraries.append({'name': name, 'city': city})
    return libraries
def query_chilean_libraries() -> List[Dict[str, Any]]:
    """Fetch every Chilean library from Wikidata in one bulk SPARQL query.

    Returns:
        A list of dicts with keys q_number, name, location, coords, viaf,
        isil and founded ('founded' is the 4-character year prefix when
        present, otherwise None). An empty list on any query failure.
    """
    query = """
    SELECT DISTINCT ?library ?libraryLabel ?location ?locationLabel ?coords ?viaf ?isil ?founded WHERE {
      # Chilean libraries
      ?library wdt:P31/wdt:P279* wd:Q7075 .  # Instance of library (or subclass)
      ?library wdt:P17 wd:Q298 .             # Country: Chile
      # Optional: location
      OPTIONAL {
        ?library wdt:P131 ?location .
      }
      # Optional: coordinates
      OPTIONAL {
        ?library wdt:P625 ?coords .
      }
      # Optional: VIAF identifier
      OPTIONAL {
        ?library wdt:P214 ?viaf .
      }
      # Optional: ISIL code
      OPTIONAL {
        ?library wdt:P791 ?isil .
      }
      # Optional: founding date
      OPTIONAL {
        ?library wdt:P571 ?founded .
      }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" }
    }
    ORDER BY ?libraryLabel
    """

    def extract(row: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten one SPARQL result binding into a plain library dict."""
        record = {
            'q_number': row['library']['value'].split('/')[-1],  # type: ignore
            'name': row.get('libraryLabel', {}).get('value', 'Unknown'),  # type: ignore
            'location': row.get('locationLabel', {}).get('value'),  # type: ignore
            'coords': row.get('coords', {}).get('value'),  # type: ignore
            'viaf': row.get('viaf', {}).get('value'),  # type: ignore
            'isil': row.get('isil', {}).get('value'),  # type: ignore
        }
        # Keep only the year: first four characters of the xsd:dateTime value.
        record['founded'] = row.get('founded', {}).get('value', '')[:4] if 'founded' in row else None  # type: ignore
        return record

    print("Querying Wikidata for Chilean libraries...")
    print(f"SPARQL endpoint: {SPARQL_ENDPOINT}")
    print()
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    client.setTimeout(120)  # 2 minute timeout
    try:
        started = time.time()
        response = client.query().convert()
        elapsed = time.time() - started
        print(f"✅ Query completed in {elapsed:.1f} seconds")
        print()
        # Flatten each result binding into a plain dict.
        found = [extract(row) for row in response['results']['bindings']]  # type: ignore
        print(f"Found {len(found)} Chilean libraries in Wikidata")
        return found
    except Exception as e:
        # Best-effort: report the failure and let the caller bail out on [].
        print(f"❌ Query failed: {e}")
        return []
def fuzzy_match_libraries(search_libraries: List[Dict], wikidata_libraries: List[Dict]) -> List[Dict]:
    """Match each target library against the Wikidata result set by name.

    Name similarity is fuzz.ratio on lowercased names; a +10 bonus is added
    when the Wikidata administrative location resembles the target city
    (fuzz.partial_ratio > 70). The best-scoring candidate is accepted only
    when its combined score is at least 75.

    Args:
        search_libraries: Dicts with 'name' and 'city' of libraries to find.
        wikidata_libraries: Flattened Wikidata query results.

    Returns:
        One match dict per accepted candidate, carrying both the search
        terms and the matched Wikidata fields plus the scores.
    """
    matches = []
    print()
    print("=" * 80)
    print("FUZZY MATCHING RESULTS")
    print("=" * 80)
    print()
    for target in search_libraries:
        target_name = target['name']
        target_city = target['city']
        print(f"Searching for: {target_name} ({target_city})")
        top = None
        top_score = 0
        for candidate in wikidata_libraries:
            candidate_name = candidate['name']
            # Base similarity on the (lowercased) institution names.
            name_score = fuzz.ratio(target_name.lower(), candidate_name.lower())
            # Location agreement earns a flat bonus on top of the name score.
            bonus = 0
            candidate_loc = candidate.get('location')
            if candidate_loc and fuzz.partial_ratio(target_city.lower(), candidate_loc.lower()) > 70:
                bonus = 10
            combined = name_score + bonus
            if combined > top_score:
                top_score = combined
                top = {
                    'search_name': target_name,
                    'search_city': target_city,
                    'q_number': candidate['q_number'],
                    'wikidata_name': candidate_name,
                    'wikidata_location': candidate_loc,
                    'match_score': name_score,
                    'total_score': combined,
                    'viaf': candidate.get('viaf'),
                    'isil': candidate.get('isil'),
                    'founded': candidate.get('founded')
                }
        if top and top['total_score'] >= 75:  # Threshold: 75+
            print(f" ✅ MATCH: {top['wikidata_name']} ({top['q_number']})")
            print(f" Score: {top['match_score']:.0f} (name) + {top['total_score'] - top['match_score']:.0f} (location) = {top['total_score']:.0f}")
            if top.get('wikidata_location'):
                print(f" Location: {top['wikidata_location']}")
            if top.get('founded'):
                print(f" Founded: {top['founded']}")
            matches.append(top)
        else:
            print(f" ❌ No match found (best score: {top_score:.0f})")
        print()
    return matches
def main():
    """Main execution: load dataset, query Wikidata, fuzzy match, save JSON.

    Side effects: prints a progress report to stdout and writes the match
    results to scripts/batch12_library_query_results.json.
    """
    print("=" * 80)
    print("CHILEAN LIBRARIES WIKIDATA QUERY - BATCH 12")
    print("=" * 80)
    print()
    # Load institutions
    print(f"Loading dataset: {INPUT_FILE}")
    institutions = load_institutions()
    print(f" Loaded {len(institutions)} institutions")
    print()
    # Get libraries without Wikidata
    search_libraries = get_libraries_without_wikidata(institutions)
    print(f"Target: {len(search_libraries)} libraries without Wikidata")
    # Note: plain string (the original used an f-string with no placeholders).
    print("Goal: Find 3+ matches to reach 70% coverage (63/90)")
    print()
    # Query Wikidata; bail out early when the bulk query returned nothing.
    wikidata_libraries = query_chilean_libraries()
    if not wikidata_libraries:
        print("❌ No results from Wikidata")
        return
    # Fuzzy match
    matches = fuzzy_match_libraries(search_libraries, wikidata_libraries)
    # Save results
    output = {
        "batch": 12,
        "query_date": time.strftime("%Y-%m-%d"),
        "institution_type": "LIBRARY",
        "total_searched": len(search_libraries),
        "matches_found": len(matches),
        "matches": matches
    }
    output_file = "scripts/batch12_library_query_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Wikidata libraries found: {len(wikidata_libraries)}")
    print(f"Matches found: {len(matches)}")
    # Bug fix: the original divided by len(search_libraries) unconditionally,
    # raising ZeroDivisionError when every library already had a Wikidata id.
    if search_libraries:
        print(f"Match rate: {len(matches)/len(search_libraries)*100:.1f}%")
    else:
        print("Match rate: n/a (no libraries to search)")
    print()
    print(f"✅ Results saved to: {output_file}")
    print()
    if len(matches) >= 3:
        print(f"🎯 SUCCESS! Found {len(matches)} matches - enough to reach 70% target")
    else:
        print(f"⚠️ Only found {len(matches)} matches - need {3 - len(matches)} more for 70% target")
    print()
    print("Next step: Manual validation with scripts/finalize_batch12.py")


if __name__ == "__main__":
    main()