glam/scripts/enrich_denmark_wikidata.py
2025-11-19 23:25:22 +01:00

371 lines
13 KiB
Python
Executable file

"""
Wikidata Enrichment for Danish GLAM Institutions
Queries Wikidata SPARQL endpoint to find Q-numbers for Danish libraries and archives,
then enriches the denmark_complete.json dataset with Wikidata identifiers.
"""
import json
import re
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote
import requests
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
def query_wikidata_libraries_denmark() -> List[Dict]:
    """Fetch Danish libraries from the Wikidata SPARQL endpoint.

    Queries for every item that is an instance of library (Q7075) or any
    subclass, located in Denmark (Q35), with optional ISIL, VIAF, coordinate
    and city bindings.

    Returns:
        A list of dicts with keys 'qid', 'label', 'isil', 'viaf', 'city'.
        'label' defaults to '' when missing; the other optional fields are
        None when Wikidata has no value.
    """
    query = """
SELECT DISTINCT ?item ?itemLabel ?isil ?viaf ?coordinates ?city ?cityLabel WHERE {
# Libraries in Denmark
?item wdt:P31/wdt:P279* wd:Q7075 . # instance of library (or subclass)
?item wdt:P17 wd:Q35 . # country: Denmark
# Optional identifiers
OPTIONAL { ?item wdt:P791 ?isil } # ISIL code
OPTIONAL { ?item wdt:P214 ?viaf } # VIAF ID
OPTIONAL { ?item wdt:P625 ?coordinates } # Coordinates
OPTIONAL { ?item wdt:P131 ?city } # Located in administrative entity
SERVICE wikibase:label {
bd:serviceParam wikibase:language "da,en"
}
}
ORDER BY ?itemLabel
"""
    headers = {
        'User-Agent': 'GLAM-Data-Enrichment/0.1 (https://github.com/example/glam-data)',
        'Accept': 'application/sparql-results+json'
    }
    print("Querying Wikidata for Danish libraries...")
    resp = requests.get(
        WIKIDATA_SPARQL,
        params={'query': query, 'format': 'json'},
        headers=headers,
        timeout=60
    )
    resp.raise_for_status()
    rows = resp.json()['results']['bindings']
    print(f" Found {len(rows)} libraries in Wikidata")

    def opt(row: Dict, key: str):
        # SPARQL JSON omits unbound variables entirely; map them to None.
        return row.get(key, {}).get('value')

    return [
        {
            # The ?item binding is a full entity URI; the Q-number is its last segment.
            'qid': row['item']['value'].split('/')[-1],
            'label': row.get('itemLabel', {}).get('value', ''),
            'isil': opt(row, 'isil'),
            'viaf': opt(row, 'viaf'),
            'city': opt(row, 'cityLabel'),
        }
        for row in rows
    ]
def query_wikidata_archives_denmark() -> List[Dict]:
    """Fetch Danish archives from the Wikidata SPARQL endpoint.

    Matches items that are archives (Q166118, or any subclass) in Denmark,
    plus items typed as both library and archive, with optional ISIL, VIAF,
    coordinate and city bindings.

    Returns:
        A list of dicts with keys 'qid', 'label', 'isil', 'viaf', 'city'.
        'label' defaults to '' when missing; the other optional fields are
        None when Wikidata has no value.
    """
    query = """
SELECT DISTINCT ?item ?itemLabel ?isil ?viaf ?coordinates ?city ?cityLabel WHERE {
# Archives in Denmark
{
?item wdt:P31/wdt:P279* wd:Q166118 . # instance of archive (or subclass)
} UNION {
?item wdt:P31 wd:Q7075 . # or library with archival collections
?item wdt:P31 wd:Q166118 .
}
?item wdt:P17 wd:Q35 . # country: Denmark
# Optional identifiers
OPTIONAL { ?item wdt:P791 ?isil } # ISIL code
OPTIONAL { ?item wdt:P214 ?viaf } # VIAF ID
OPTIONAL { ?item wdt:P625 ?coordinates } # Coordinates
OPTIONAL { ?item wdt:P131 ?city } # Located in administrative entity
SERVICE wikibase:label {
bd:serviceParam wikibase:language "da,en"
}
}
ORDER BY ?itemLabel
"""
    headers = {
        'User-Agent': 'GLAM-Data-Enrichment/0.1 (https://github.com/example/glam-data)',
        'Accept': 'application/sparql-results+json'
    }
    print("\nQuerying Wikidata for Danish archives...")
    resp = requests.get(
        WIKIDATA_SPARQL,
        params={'query': query, 'format': 'json'},
        headers=headers,
        timeout=60
    )
    resp.raise_for_status()
    rows = resp.json()['results']['bindings']
    print(f" Found {len(rows)} archives in Wikidata")

    def opt(row: Dict, key: str):
        # SPARQL JSON omits unbound variables entirely; map them to None.
        return row.get(key, {}).get('value')

    return [
        {
            # The ?item binding is a full entity URI; the Q-number is its last segment.
            'qid': row['item']['value'].split('/')[-1],
            'label': row.get('itemLabel', {}).get('value', ''),
            'isil': opt(row, 'isil'),
            'viaf': opt(row, 'viaf'),
            'city': opt(row, 'cityLabel'),
        }
        for row in rows
    ]
def parse_identifier_string(identifier_str: str) -> Optional[Dict]:
    """Parse an identifier out of its stringified dict representation.

    The dataset stores identifiers as strings like
    ``Identifier({'identifier_scheme': 'ISIL', 'identifier_value': 'DK-...', ...})``;
    extract the scheme/value/url fields with regexes.

    Returns:
        A dict with keys 'scheme', 'value' and 'url' (url may be None), or
        None when the input is falsy, not a string, or lacks scheme/value.
    """
    if not identifier_str or not isinstance(identifier_str, str):
        return None

    def _field(name: str) -> Optional[str]:
        # Single-quoted, non-empty field value, e.g. 'identifier_scheme': 'ISIL'
        found = re.search(r"'" + name + r"':\s*'([^']+)'", identifier_str)
        return found.group(1) if found else None

    scheme = _field('identifier_scheme')
    value = _field('identifier_value')
    if scheme is None or value is None:
        return None
    return {
        'scheme': scheme,
        'value': value,
        'url': _field('identifier_url'),
    }
def find_wikidata_match(
    institution: Dict,
    wikidata_institutions: List[Dict],
    threshold: int = 85
) -> Optional[Tuple[Dict, int]]:
    """
    Find best Wikidata match for an institution.

    Matching strategy:
      1. Exact ISIL code match — returns score 100. The value 100 is
         *reserved* for ISIL matches because the caller classifies matches
         by ``score == 100``.
      2. Fuzzy name match (rapidfuzz ratio) with a +10 bonus when the
         cities agree; fuzzy scores are capped at 99 so they can never be
         mistaken for an ISIL match.

    Returns:
        Tuple of (wikidata_item, match_score) if found, else None
    """
    inst_name = institution.get('name', '').lower()
    if not inst_name:
        return None
    # Extract ISIL code from institution if present
    inst_isil = None
    identifiers = institution.get('identifiers', [])
    for identifier_data in identifiers:
        identifier = parse_identifier_string(identifier_data) if isinstance(identifier_data, str) else identifier_data
        if identifier and isinstance(identifier, dict) and identifier.get('scheme') == 'ISIL':
            inst_isil = identifier.get('value')
            break
    # First pass: Try exact ISIL match
    if inst_isil:
        for wd_item in wikidata_institutions:
            if wd_item.get('isil') == inst_isil:
                return (wd_item, 100)  # Perfect match via ISIL
    # Extract the institution's city once — it does not vary per candidate,
    # so hoist it out of the fuzzy-matching loop.
    inst_city = None
    locations = institution.get('locations', [])
    if locations:
        first_loc = locations[0]
        if isinstance(first_loc, str):
            city_match = re.search(r"'city':\s*'([^']*)'", first_loc)
            if city_match:
                inst_city = city_match.group(1).lower()
        elif isinstance(first_loc, dict):
            # BUG FIX: guard against an explicit None city value —
            # dict.get('city', '') returns None (not '') when the key
            # exists with value None, and None.lower() raises.
            inst_city = (first_loc.get('city') or '').lower()
    # Second pass: Fuzzy match by name
    best_match = None
    best_score = 0
    for wd_item in wikidata_institutions:
        wd_label = wd_item.get('label', '').lower()
        if not wd_label:
            continue
        # Calculate fuzzy similarity
        score = fuzz.ratio(inst_name, wd_label)
        # Bonus points for city match
        if inst_city and wd_item.get('city'):
            wd_city = wd_item['city'].lower()
            if inst_city in wd_city or wd_city in inst_city:
                score += 10  # City match bonus
        # BUG FIX: a perfect name ratio, or the city bonus, could push a
        # fuzzy score to >= 100 and make the caller mis-count it as an
        # ISIL match. Cap fuzzy scores at 99.
        score = min(score, 99)
        if score > best_score:
            best_score = score
            best_match = wd_item
    if best_score >= threshold:
        return (best_match, best_score)
    return None
def _has_wikidata_identifier(institution: Dict) -> bool:
    """Return True when the institution already carries a 'Wikidata' identifier."""
    for identifier_data in institution.get('identifiers', []):
        identifier = parse_identifier_string(identifier_data) if isinstance(identifier_data, str) else identifier_data
        if identifier and isinstance(identifier, dict) and identifier.get('scheme') == 'Wikidata':
            return True
    return False


def enrich_with_wikidata(
    institutions: List[Dict],
    wikidata_libraries: List[Dict],
    wikidata_archives: List[Dict]
) -> Tuple[List[Dict], Dict]:
    """
    Enrich institutions with Wikidata Q-numbers.

    Institutions that already have a Wikidata identifier are left alone.
    LIBRARY institutions are matched against the library candidates,
    ARCHIVE institutions against the archive candidates; other types are
    skipped. Matched institutions get a Wikidata identifier appended (in
    the dataset's stringified format) plus an enrichment_history entry.
    NOTE: institutions are mutated in place; the returned list holds the
    same dict objects.

    Returns:
        Tuple of (enriched_institutions, statistics)
    """
    # Local import: only needed to timestamp the enrichment records.
    from datetime import date

    stats = {
        'total': len(institutions),
        'libraries_checked': 0,
        'archives_checked': 0,
        'matched_by_isil': 0,
        'matched_by_name': 0,
        'no_match': 0,
        'already_had_wikidata': 0
    }
    enriched = []
    for i, inst in enumerate(institutions, 1):
        if i % 100 == 0:
            print(f" Processing {i}/{len(institutions)} institutions...")
        inst_type = inst.get('institution_type')
        if _has_wikidata_identifier(inst):
            stats['already_had_wikidata'] += 1
        else:
            # Try to find Wikidata match against the candidate pool for this type
            if inst_type == 'LIBRARY':
                stats['libraries_checked'] += 1
                match = find_wikidata_match(inst, wikidata_libraries, threshold=85)
            elif inst_type == 'ARCHIVE':
                stats['archives_checked'] += 1
                match = find_wikidata_match(inst, wikidata_archives, threshold=85)
            else:
                match = None
            if match:
                wd_item, score = match
                qid = wd_item['qid']
                # Score 100 is reserved for ISIL matches by find_wikidata_match;
                # anything lower came from fuzzy name matching.
                if score == 100:
                    stats['matched_by_isil'] += 1
                else:
                    stats['matched_by_name'] += 1
                # Add Wikidata identifier (as string representation to match existing format)
                wikidata_identifier = (
                    f"Identifier({{\n"
                    f" 'identifier_scheme': 'Wikidata',\n"
                    f" 'identifier_value': '{qid}',\n"
                    f" 'identifier_url': 'https://www.wikidata.org/wiki/{qid}'\n"
                    f"}})"
                )
                if not inst.get('identifiers'):
                    inst['identifiers'] = []
                inst['identifiers'].append(wikidata_identifier)
                # Add enrichment metadata
                if not inst.get('enrichment_history'):
                    inst['enrichment_history'] = []
                inst['enrichment_history'].append({
                    # BUG FIX: was the hardcoded literal '2025-11-19', which
                    # goes stale on every later run; record the actual run date.
                    'enrichment_date': date.today().isoformat(),
                    'enrichment_method': 'Wikidata SPARQL query',
                    'enrichment_source': 'https://query.wikidata.org/sparql',
                    'match_score': score,
                    'matched_label': wd_item.get('label')
                })
            else:
                stats['no_match'] += 1
        enriched.append(inst)
    return enriched, stats
def main():
    """Entry point: load the Danish GLAM dataset, query Wikidata for library
    and archive candidates, enrich the dataset with Q-numbers, save the
    result, and print match statistics."""
    print("=" * 60)
    print("Danish GLAM Dataset → Wikidata Enrichment")
    print("=" * 60)
    # Load dataset
    input_path = Path('data/instances/denmark_complete.json')
    print(f"\nLoading dataset from {input_path}...")
    # BUG FIX: open with an explicit encoding — the platform default locale
    # may not be UTF-8, and the dataset contains non-ASCII (Danish) text.
    with open(input_path, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f" Loaded {len(institutions)} institutions")
    # Query Wikidata
    try:
        wikidata_libraries = query_wikidata_libraries_denmark()
        time.sleep(2)  # Rate limiting between SPARQL requests
        wikidata_archives = query_wikidata_archives_denmark()
    except Exception as e:
        print(f"❌ Error querying Wikidata: {e}")
        return
    # Enrich dataset
    print("\nEnriching dataset with Wikidata Q-numbers...")
    enriched_institutions, stats = enrich_with_wikidata(
        institutions,
        wikidata_libraries,
        wikidata_archives
    )
    # Save enriched dataset
    output_path = Path('data/instances/denmark_complete_enriched.json')
    print(f"\nSaving enriched dataset to {output_path}...")
    # BUG FIX: ensure_ascii=False emits raw non-ASCII characters, so the
    # file handle must be UTF-8 or json.dump can raise UnicodeEncodeError.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(enriched_institutions, f, indent=2, ensure_ascii=False)
    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f" ✅ Saved ({size_mb:.2f} MB)")
    # Print statistics
    print("\n" + "=" * 60)
    print("Enrichment Statistics")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already had Wikidata: {stats['already_had_wikidata']}")
    print(f"Libraries checked: {stats['libraries_checked']}")
    print(f"Archives checked: {stats['archives_checked']}")
    print(f"Matched by ISIL: {stats['matched_by_isil']}")
    print(f"Matched by name: {stats['matched_by_name']}")
    print(f"No match found: {stats['no_match']}")
    total_new_matches = stats['matched_by_isil'] + stats['matched_by_name']
    total_with_wikidata = stats['already_had_wikidata'] + total_new_matches
    # BUG FIX: guard against ZeroDivisionError on an empty dataset.
    coverage_pct = 100 * total_with_wikidata / stats['total'] if stats['total'] else 0.0
    print(f"\n✅ Total institutions with Wikidata: {total_with_wikidata}/{stats['total']} " +
          f"({coverage_pct:.1f}%)")
    print(f"✅ New Wikidata matches added: {total_new_matches}")
    print("\n" + "=" * 60)
    print("✅ Wikidata Enrichment Complete")
    print("=" * 60)


if __name__ == '__main__':
    main()