#!/usr/bin/env python3
"""
Enrich Czech institutions with Wikidata Q-numbers.

Uses Wikidata SPARQL endpoint to find matching institutions by name,
location, and type. Adds Wikidata identifiers to czech_unified.yaml.

Process:
1. Load czech_unified.yaml (8,694 institutions)
2. Filter institutions WITHOUT Wikidata Q-numbers (estimate: ~95%)
3. Query Wikidata for Czech heritage institutions
4. Fuzzy match by name + location + type
5. Add Wikidata identifiers to records
6. Save to czech_unified_wikidata.yaml

Estimated time: 5-10 minutes (SPARQL queries + fuzzy matching)
"""
# Standard library
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple

# Third-party
import requests
import yaml
from rapidfuzz import fuzz
# Public endpoint of the Wikidata Query Service (SPARQL).
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Maps our GLAM institution categories to the Wikidata class Q-numbers
# used by the SPARQL query below.
WIKIDATA_TYPES = {
    'MUSEUM': ['Q33506'],     # museum
    'LIBRARY': ['Q7075'],     # library
    'ARCHIVE': ['Q166118'],   # archive
    'GALLERY': ['Q1007870'],  # art gallery
}
def query_wikidata_institutions(country_code: str = 'Q213') -> List[Dict]:
    """
    Query Wikidata for heritage institutions in a given country.

    Args:
        country_code: Wikidata Q-number for country (Q213 = Czech Republic)

    Returns:
        List of dicts with keys: qid, label, type, location, coordinates,
        isil, viaf. Returns an empty list on any request/parse failure
        (best-effort: the caller treats an empty list as "nothing found").
    """
    # SPARQL query for heritage institutions (museum, library, archive,
    # gallery — including subclasses) located in the requested country.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?typeLabel ?locationLabel ?coords ?isil ?viaf
    WHERE {{
      # Institution types (museum, library, archive, gallery)
      VALUES ?type {{ wd:Q33506 wd:Q7075 wd:Q166118 wd:Q1007870 }}

      # Instance of heritage institution type
      ?item wdt:P31/wdt:P279* ?type .

      # Located in the requested country (or subdivisions)
      ?item wdt:P17 wd:{country_code} .

      # Optional: specific location (city/town)
      OPTIONAL {{ ?item wdt:P131 ?location }}

      # Optional: coordinates
      OPTIONAL {{ ?item wdt:P625 ?coords }}

      # Optional: ISIL code
      OPTIONAL {{ ?item wdt:P791 ?isil }}

      # Optional: VIAF ID
      OPTIONAL {{ ?item wdt:P214 ?viaf }}

      # Get labels in Czech and English
      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "cs,en"
      }}
    }}
    LIMIT 10000
    """

    print("Querying Wikidata for Czech heritage institutions...")
    print(f"SPARQL endpoint: {WIKIDATA_SPARQL}")

    headers = {
        # Wikidata requires a descriptive User-Agent for API clients.
        'User-Agent': 'GLAM-Data-Extraction/0.2.0 (heritage institution research)',
        'Accept': 'application/sparql-results+json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query},
            headers=headers,
            timeout=60
        )
        response.raise_for_status()
        data = response.json()
        institutions = _parse_sparql_bindings(data)
    # Narrowed from a bare `except Exception`: catch network/HTTP errors,
    # malformed JSON (ValueError), and unexpected result shape (KeyError).
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f"Error querying Wikidata: {e}")
        return []

    print(f"Found {len(institutions)} institutions in Wikidata")
    return institutions


def _parse_sparql_bindings(data: Dict) -> List[Dict]:
    """Flatten SPARQL JSON result bindings into plain institution dicts."""
    institutions = []
    for binding in data['results']['bindings']:
        institutions.append({
            # Entity URI looks like http://www.wikidata.org/entity/Q123 —
            # keep only the trailing Q-number.
            'qid': binding['item']['value'].split('/')[-1],
            'label': binding['itemLabel']['value'],
            'type': binding['typeLabel']['value'],
            # OPTIONAL clauses may be absent from a binding; default to ''.
            'location': binding.get('locationLabel', {}).get('value', ''),
            'coordinates': binding.get('coords', {}).get('value', ''),
            'isil': binding.get('isil', {}).get('value', ''),
            'viaf': binding.get('viaf', {}).get('value', ''),
        })
    return institutions
def fuzzy_match_institution(
    inst_name: str,
    inst_city: str,
    inst_type: str,
    wikidata_results: List[Dict],
    threshold: float = 85.0
) -> Optional[Tuple[Dict, float]]:
    """
    Find the best fuzzy match for an institution among Wikidata results.

    Scoring is RapidFuzz name similarity (0-100) plus a flat +10 boost
    when the city strongly matches the Wikidata location. `inst_type` is
    accepted for API symmetry but intentionally not scored — Wikidata
    typing is too inconsistent to penalize mismatches.

    Args:
        inst_name: Institution name from our dataset
        inst_city: City location
        inst_type: Institution type (MUSEUM, LIBRARY, ARCHIVE, GALLERY)
        wikidata_results: Candidate records from the SPARQL query
        threshold: Minimum combined score (0-100) required to match

    Returns:
        (matched_wikidata_record, confidence_score) for the highest-scoring
        candidate at or above `threshold`, or None when nothing qualifies.
    """
    name_lower = inst_name.lower()
    city_lower = inst_city.lower() if inst_city else ''

    winner = None
    winner_score = 0.0

    for candidate in wikidata_results:
        score = fuzz.ratio(name_lower, candidate['label'].lower())

        # Reward strong city/location agreement with a flat +10 boost.
        if city_lower and candidate['location']:
            if fuzz.partial_ratio(city_lower, candidate['location'].lower()) > 85:
                score += 10

        # Keep the highest-scoring candidate that clears the threshold.
        if score > winner_score and score >= threshold:
            winner_score = score
            winner = candidate

    return (winner, winner_score) if winner else None
def _has_identifier(inst: Dict, scheme: str) -> bool:
    """Return True when `inst` already carries an identifier of `scheme`."""
    return any(
        i.get('identifier_scheme') == scheme
        for i in inst.get('identifiers', [])
    )


def enrich_with_wikidata():
    """
    Main enrichment workflow.

    Loads data/instances/czech_unified.yaml, fuzzy-matches records lacking
    a Wikidata Q-number against a SPARQL result set, attaches
    Wikidata/ISIL/VIAF identifiers plus provenance, and writes the result
    to data/instances/czech_unified_wikidata.yaml.
    """
    print("="*80)
    print("CZECH INSTITUTIONS - WIKIDATA ENRICHMENT")
    print("="*80)
    print()

    # Load unified dataset
    print("Loading czech_unified.yaml...")
    with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"Loaded {len(institutions)} institutions")

    # Split into records that already have a Wikidata Q-number and those
    # that still need one.
    needs_wikidata = []
    has_wikidata = 0

    for inst in institutions:
        if _has_identifier(inst, 'Wikidata'):
            has_wikidata += 1
        else:
            needs_wikidata.append(inst)

    print(f"Institutions with Wikidata: {has_wikidata}")
    print(f"Institutions needing Wikidata: {len(needs_wikidata)}")
    print()

    # Query Wikidata
    wikidata_results = query_wikidata_institutions()

    if not wikidata_results:
        print("No Wikidata results found. Exiting.")
        return

    print()
    print(f"Fuzzy matching {len(needs_wikidata)} institutions...")
    print(f"Match threshold: 85% similarity")
    print()

    # Fuzzy match
    matched = 0
    low_confidence = 0

    for idx, inst in enumerate(needs_wikidata, 1):
        if idx % 100 == 0:
            print(f"  Processed {idx}/{len(needs_wikidata)} institutions...")

        # Extract city from the first listed location, if any.
        city = ''
        if inst.get('locations'):
            city = inst['locations'][0].get('city', '')

        match_result = fuzzy_match_institution(
            inst['name'],
            city,
            inst['institution_type'],
            wikidata_results,
            threshold=85.0
        )

        if not match_result:
            continue

        matched_wd, confidence = match_result

        identifiers = inst.setdefault('identifiers', [])

        # Add Wikidata identifier
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': matched_wd['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{matched_wd['qid']}"
        })

        # Add ISIL if available and not already present.
        # ISIL codes have no universal resolver, so no identifier_url.
        if matched_wd.get('isil') and not _has_identifier(inst, 'ISIL'):
            identifiers.append({
                'identifier_scheme': 'ISIL',
                'identifier_value': matched_wd['isil'],
            })

        # Add VIAF if available and not already present.
        if matched_wd.get('viaf') and not _has_identifier(inst, 'VIAF'):
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': matched_wd['viaf'],
                'identifier_url': f"https://viaf.org/viaf/{matched_wd['viaf']}"
            })

        # Update provenance. setdefault guards against records that lack a
        # 'provenance' block entirely (previously raised KeyError).
        history = inst.setdefault('provenance', {}).setdefault(
            'enrichment_history', []
        )
        history.append({
            'enrichment_date': datetime.now(timezone.utc).isoformat(),
            'enrichment_method': 'Wikidata SPARQL query + fuzzy matching',
            'match_score': confidence,
            # Very strong matches are auto-verified.
            'verified': confidence > 95
        })

        matched += 1

        if confidence < 90:
            low_confidence += 1

    # Guard against division by zero when every record already had a QID.
    match_rate = matched / len(needs_wikidata) * 100 if needs_wikidata else 0.0
    print(f"\n✅ Matched {matched} institutions ({match_rate:.1f}%)")
    print(f"⚠️ Low confidence matches (<90%): {low_confidence}")
    print(f"❌ No match: {len(needs_wikidata) - matched}")
    print()

    # Save enriched dataset
    output_path = 'data/instances/czech_unified_wikidata.yaml'
    print(f"Saving enriched dataset to {output_path}...")

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
            width=100
        )

    print(f"✅ Saved {len(institutions)} institutions")
    print()
    print("="*80)
    print("ENRICHMENT COMPLETE")
    print("="*80)
    print(f"Total institutions: {len(institutions)}")
    print(f"With Wikidata Q-numbers: {has_wikidata + matched}")
    print(f"Newly enriched: {matched}")
    # Avoid ZeroDivisionError on an empty dataset.
    if institutions:
        print(f"Enrichment rate: {(has_wikidata + matched)/len(institutions)*100:.1f}%")
if __name__ == '__main__':
    # Run the full enrichment pipeline when executed as a script.
    enrich_with_wikidata()