# glam/scripts/query_wikidata_chilean_archives.py

#!/usr/bin/env python3
"""
Query Wikidata for Chilean Archives using SPARQL
Uses Wikidata Query Service to find archives in Chile with their Q-numbers
"""

import yaml
from SPARQLWrapper import SPARQLWrapper, JSON
from typing import List, Dict
from pathlib import Path

def query_chilean_archives() -> List[Dict]:
    """Query Wikidata for all archives in Chile.

    Sends one SPARQL query to the public Wikidata Query Service asking for
    every item that is an instance of "archive" (Q166118) or of one of its
    subclasses and whose country (P17) is Chile (Q298).

    Returns:
        One dict per result row with the keys ``q_number``, ``name``,
        ``city``, ``founded`` (a four-digit year, or ``""`` when absent or
        not a plain date) and ``wikidata_url``.  City and founding date are
        OPTIONAL joins, so an item with several values for either property
        produces several rows.  If anything goes wrong while querying or
        parsing, the error is printed and an empty list is returned.
    """
    endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(endpoint)

    # SPARQL query for archives in Chile
    # P31 = instance of, P17 = country, Q298 = Chile
    # Q166118 = archive institution
    query = """
    SELECT DISTINCT ?archive ?archiveLabel ?cityLabel ?coords ?founded WHERE {
      # Archive types (including subclasses)
      ?archive wdt:P31/wdt:P279* wd:Q166118 .

      # Located in Chile
      ?archive wdt:P17 wd:Q298 .

      # Get city/location
      OPTIONAL { ?archive wdt:P131 ?city . }

      # Get coordinates
      OPTIONAL { ?archive wdt:P625 ?coords . }

      # Get founding date
      OPTIONAL { ?archive wdt:P571 ?founded . }

      # Get labels in Spanish and English
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "es,en" .
      }
    }
    ORDER BY ?archiveLabel
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    print("🔍 Querying Wikidata for Chilean archives...")
    print(f"   Endpoint: {endpoint}")
    print()

    try:
        results = sparql.query().convert()  # type: ignore

        archives = []
        for row in results["results"]["bindings"]:  # type: ignore
            # The entity URI ends in the Q-number, e.g. .../entity/Q123.
            q_number = row["archive"]["value"].split("/")[-1]  # type: ignore

            # Keep only a real four-digit year.  A "founded" binding is not
            # always a date literal (Wikidata "unknown value" statements come
            # back as a blank node / IRI, and BCE dates start with "-"), and
            # slicing those blindly would store junk such as "http".
            year = row.get("founded", {}).get("value", "")[:4]  # type: ignore
            founded = year if len(year) == 4 and year.isdigit() else ""

            archives.append({
                "q_number": q_number,
                "name": row.get("archiveLabel", {}).get("value", ""),  # type: ignore
                "city": row.get("cityLabel", {}).get("value", ""),  # type: ignore
                "founded": founded,
                "wikidata_url": f"https://www.wikidata.org/wiki/{q_number}"
            })

        return archives

    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller handle
        # the empty result instead of crashing the script.
        print(f"❌ Error querying Wikidata: {e}")
        return []

def load_chilean_institutions(file_path: Path) -> List[Dict]:
    """Load Chilean institutions from YAML file.

    Args:
        file_path: Path of a UTF-8 YAML file whose top-level node is a list
            of institution records.

    Returns:
        The parsed list of records; an empty list when the file contains no
        YAML document at all.

    Raises:
        ValueError: If the top-level YAML node is not a list.
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load() yields None for an empty (or comment-only) file; callers
    # expect something they can len() and iterate.
    if data is None:
        return []

    # Fail here with a clear message rather than later with an obscure
    # AttributeError when a record turns out to be a dict key or a scalar.
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a YAML list of institutions in {file_path}, "
            f"got {type(data).__name__}"
        )

    return data

def normalize_name(name: str) -> str:
    """Normalize institution name for matching.

    Lower-cases the name, removes ASCII apostrophes, trims the ends and
    collapses every run of whitespace (spaces, tabs, newlines) into a single
    space, so that names differing only in spacing compare equal.

    Args:
        name: Raw institution name.

    Returns:
        The normalized name; ``""`` for an empty or whitespace-only input.
    """
    # split()/join collapses runs of any length; a single
    # replace("  ", " ") pass would leave "a   b" as "a  b".
    return " ".join(name.lower().replace("'", "").split())

# Tokens too generic to count as evidence that two archive names refer to the
# same institution: articles/prepositions and the archive vocabulary itself.
# 'archive' is included alongside 'archivo'/'archives' because the keyword
# strategy only runs when both names already contain one of those words.
_GENERIC_NAME_WORDS = frozenset({
    'de', 'del', 'la', 'el', 'los', 'las', 'y', 'en', 'of', 'the',
    'archivo', 'archive', 'archives', 'historico', 'histórico',
    'national', 'nacional', 's',
})


def _has_wikidata_id(inst: Dict) -> bool:
    """Return True if the record already carries a Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers') or []  # tolerate a null field
    )


def _first_city(inst: Dict) -> str:
    """Return the lower-cased city of the record's first location, or ''."""
    locations = inst.get('locations') or [{}]  # missing, null or empty list
    first = locations[0] if isinstance(locations[0], dict) else {}
    return (first.get('city') or '').lower()


def _mentions_archive(name: str) -> bool:
    """Return True if the normalized name contains 'archivo' or 'archive'."""
    return 'archivo' in name or 'archive' in name


def _names_match(inst_name: str, wd_name: str) -> bool:
    """Return True if any name heuristic links the two normalized names."""
    # An empty string is a substring of everything, so without this guard a
    # record with no usable name would "match" every candidate.
    if not inst_name or not wd_name:
        return False

    # Strategies 1 + 2: exact match, or one name contains the other.
    if inst_name == wd_name or inst_name in wd_name or wd_name in inst_name:
        return True

    # Strategy 3: both are archives and share at least one distinctive word.
    if _mentions_archive(inst_name) and _mentions_archive(wd_name):
        shared = set(inst_name.split()) & set(wd_name.split())
        if shared - _GENERIC_NAME_WORDS:
            return True

    # Strategy 4: "Archivo Nacional" special case (high-value institution).
    if 'nacional' in inst_name and _mentions_archive(inst_name):
        if 'nacional' in wd_name or 'national' in wd_name:
            return True

    # Strategy 5: university archives (USACH, Universidad de Chile).
    # NOTE(review): the 'chile' test is very loose -- any candidate whose
    # name contains "chile" qualifies; results need manual review.
    if 'universidad' in inst_name or 'university' in inst_name:
        if 'usach' in inst_name and 'usach' in wd_name:
            return True
        if 'chile' in inst_name and 'chile' in wd_name:
            return True

    # Strategy 6: diocese/church archives.  Names are not accent-stripped,
    # so the accented Spanish spelling has to be tested explicitly.
    inst_is_church = (
        'diocesis' in inst_name or 'diócesis' in inst_name
        or 'arzobispado' in inst_name
    )
    if inst_is_church and (
        'diocese' in wd_name or 'diocesis' in wd_name
        or 'diócesis' in wd_name or 'arzobispado' in wd_name
    ):
        return True

    return False


def find_matches(institutions: List[Dict], wikidata_archives: List[Dict]) -> List[Dict]:
    """Find matches between our institutions and Wikidata archives.

    Only records whose ``institution_type`` is ``'ARCHIVE'`` and that have no
    identifier with scheme ``'Wikidata'`` are considered.  Each of them is
    compared with the Wikidata entries in the given order and the first
    acceptable candidate is kept.  A candidate is acceptable when one of the
    name heuristics fires and, in addition, the cities overlap, the Wikidata
    entry has no city, or either side looks like a national institution.

    Args:
        institutions: Institution records as loaded from the YAML dataset.
        wikidata_archives: Dicts with at least ``name`` and ``city`` keys, as
            produced by the Wikidata query.

    Returns:
        One dict per matched institution with the keys ``institution``,
        ``wikidata``, ``name_confidence`` (``'exact'`` or ``'partial'``) and
        ``city_match`` (bool).
    """
    # Filter institutions without Wikidata
    archives_without_wd = [
        inst for inst in institutions
        if inst.get('institution_type') == 'ARCHIVE'
        and not _has_wikidata_id(inst)
    ]

    print(f"📊 Matching {len(archives_without_wd)} institutions against {len(wikidata_archives)} Wikidata entries...")
    print()

    # Normalise the Wikidata side once instead of once per institution.
    candidates = [
        (wd, normalize_name(wd.get('name') or ''), (wd.get('city') or '').lower())
        for wd in wikidata_archives
    ]

    matches = []
    for inst in archives_without_wd:
        inst_name = normalize_name(inst.get('name') or '')
        inst_city = _first_city(inst)

        for wd_archive, wd_name, wd_city in candidates:
            if not _names_match(inst_name, wd_name):
                continue

            # City match (flexible - allows partial matches)
            city_match = bool(
                inst_city and wd_city
                and (inst_city in wd_city or wd_city in inst_city)
            )

            # Accept if the city matches or Wikidata has no city info;
            # national/well-known institutions are exempt from the city test.
            allow_match = (
                city_match
                or not wd_city
                or 'nacional' in inst_name
                or 'national' in wd_name
            )
            if not allow_match:
                continue

            matches.append({
                'institution': inst,
                'wikidata': wd_archive,
                'name_confidence': 'exact' if inst_name == wd_name else 'partial',
                'city_match': city_match
            })
            break  # Only take first match per institution

    return matches

def main():
    """Run the Wikidata lookup and export candidate matches.

    Steps: query Wikidata for Chilean archives, print a sample, load the
    local institution dataset, print how many archives still lack a Wikidata
    identifier, match them against the query results, print every match and
    write the matches as JSON next to the input data for manual review.
    Returns early (after printing a message) when Wikidata yields no rows.
    """
    import json  # stdlib; imported locally because only main() needs it

    def city_of(inst, default):
        """City of the record's first location, or ``default`` if absent."""
        # Tolerate a missing, null or empty 'locations' field instead of
        # raising IndexError/TypeError on locations[0].
        locations = inst.get('locations') or [{}]
        first = locations[0] if isinstance(locations[0], dict) else {}
        return first.get('city', default)

    def has_wikidata(inst):
        """True if the record already lists a Wikidata identifier."""
        return any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers') or []  # tolerate null
        )

    print("=" * 80)
    print("WIKIDATA SPARQL QUERY - CHILEAN ARCHIVES")
    print("=" * 80)
    print()

    # Query Wikidata
    wikidata_archives = query_chilean_archives()

    if not wikidata_archives:
        print("❌ No results from Wikidata")
        return

    print(f"✅ Found {len(wikidata_archives)} archives in Wikidata")
    print()

    # Show sample
    print("Sample results (first 10):")
    for i, archive in enumerate(wikidata_archives[:10], 1):
        print(f"  {i}. {archive['name']} ({archive['city']}) → {archive['q_number']}")
    print()

    # Load our institutions (an empty YAML file parses to None)
    input_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    institutions = load_chilean_institutions(input_file) or []

    archive_records = [
        inst for inst in institutions
        if inst.get('institution_type') == 'ARCHIVE'
    ]
    archives_count = len(archive_records)
    with_wikidata = sum(1 for inst in archive_records if has_wikidata(inst))

    print(f"📖 Loaded {len(institutions)} Chilean institutions")
    print(f"   {archives_count} are archives")
    print(f"   {with_wikidata} already have Wikidata")
    print(f"   {archives_count - with_wikidata} need enrichment")
    print()

    # Find matches
    matches = find_matches(institutions, wikidata_archives)

    print("=" * 80)
    print(f"MATCHING RESULTS: {len(matches)} potential matches found")
    print("=" * 80)
    print()

    # Display matches
    for i, match in enumerate(matches, 1):
        inst = match['institution']
        wd = match['wikidata']

        print(f"{i}. {inst['name']}")
        print(f"   Our city: {city_of(inst, 'Unknown')}")
        print(f"   ↓ MATCH ({match['name_confidence']} name, city: {match['city_match']})")
        print(f"   Wikidata: {wd['name']} ({wd['city']})")
        print(f"   Q-number: {wd['q_number']}")
        if wd['founded']:
            print(f"   Founded: {wd['founded']}")
        print()

    # Export matches to JSON for batch processing
    output_file = Path('data/instances/chile/wikidata_matches_batch9_archives.json')

    match_data = [
        {
            'institution_name': match['institution']['name'],
            'institution_city': city_of(match['institution'], ''),
            'q_number': match['wikidata']['q_number'],
            'wikidata_name': match['wikidata']['name'],
            'wikidata_city': match['wikidata']['city'],
            'founded': match['wikidata']['founded'],
            'confidence': match['name_confidence'],
            'city_match': match['city_match']
        }
        for match in matches
    ]

    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented Spanish names readable in the file
        json.dump(match_data, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved {len(matches)} matches to: {output_file}")
    print()
    print("🎯 Next step: Review matches and create Batch 9 enrichment script")

# Run the workflow only when this file is executed as a script, so that
# importing the module does not trigger the Wikidata query.
if __name__ == '__main__':
    main()