glam/scripts/enrich_belgium_wikidata_fuzzy.py
2025-12-21 00:01:54 +01:00

295 lines
9.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Belgian (BE) custodian files with Wikidata data using fuzzy name matching.
This script:
1. Fetches Belgian heritage institutions from Wikidata in batches
2. Uses fuzzy matching to find corresponding custodians
3. Enriches files that don't already have wikidata_enrichment
"""
import yaml
import glob
import time
import httpx
from datetime import datetime, timezone
from pathlib import Path
import logging
import sys
import re
from difflib import SequenceMatcher
# Configure logging: mirror all messages to a persistent log file and to
# stdout so long batch runs can be watched live and audited afterwards.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('be_wikidata_fuzzy.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Wikidata Query Service SPARQL endpoint.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Descriptive User-Agent with contact info, as requested by Wikimedia's
# User-Agent policy for automated clients.
USER_AGENT = "GLAMBot/1.0 (Heritage Custodian Enrichment; contact@example.org)"
# Minimum fuzzy-match ratio (0.0-1.0) required before a Wikidata candidate
# is accepted as a match for a custodian name.
MATCH_THRESHOLD = 0.85
def normalize_name(name: str) -> str:
    """Return a canonical lowercase form of an institution name.

    Strips common Dutch/French/German/English articles and prepositions,
    removes punctuation, and collapses whitespace so that spelling
    variants of the same institution compare as similar.
    """
    if not name:
        return ""
    lowered = name.lower()
    # Drop articles/prepositions that vary between the languages used
    # for Belgian institution labels.
    without_stopwords = re.sub(
        r'\b(de|het|een|the|le|la|les|du|des|van|voor|von)\b', '', lowered)
    # Keep only word characters and whitespace.
    letters_only = re.sub(r'[^\w\s]', '', without_stopwords)
    # Collapse runs of whitespace left behind by the removals.
    return re.sub(r'\s+', ' ', letters_only).strip()
def similarity(a: str, b: str) -> float:
    """Return the fuzzy similarity ratio (0.0-1.0) between two names.

    Both inputs are normalized first; if either normalizes to an empty
    string the score is 0.0.
    """
    left, right = normalize_name(a), normalize_name(b)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0.0
def fetch_belgian_institutions_by_type(type_qid: str, type_name: str) -> list[dict]:
    """Fetch Belgian institutions of a specific type from Wikidata.

    Args:
        type_qid: Wikidata Q-identifier of the institution class (used as
            the P31 "instance of" filter).
        type_name: Human-readable type label, used only for logging.

    Returns:
        List of dicts with keys wikidata_id, wikidata_url, label,
        description, website, image (the last three may be None).
        Returns an empty list if the request or parsing fails.
    """
    # P17 wd:Q31 restricts to items whose country is Belgium.
    # P856 (official website) and P18 (image) are OPTIONAL so items
    # lacking them are still returned. The label service resolves
    # itemLabel/itemDescription in nl > fr > de > en preference order.
    sparql_query = f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?website ?image WHERE {{
?item wdt:P17 wd:Q31 .
?item wdt:P31 wd:{type_qid} .
OPTIONAL {{ ?item wdt:P856 ?website . }}
OPTIONAL {{ ?item wdt:P18 ?image . }}
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,fr,de,en". }}
}}
"""
    headers = {
        "User-Agent": USER_AGENT,  # required by Wikimedia's bot policy
        "Accept": "application/sparql-results+json"
    }
    try:
        response = httpx.get(
            WIKIDATA_SPARQL,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=60.0
        )
        response.raise_for_status()
        data = response.json()
        results = []
        for b in data.get("results", {}).get("bindings", []):
            # Entity URI looks like http://www.wikidata.org/entity/Q1234;
            # the final path segment is the Q-id.
            item_uri = b.get("item", {}).get("value", "")
            wikidata_id = item_uri.split("/")[-1] if item_uri else None
            if not wikidata_id:
                continue
            label = b.get("itemLabel", {}).get("value", "")
            # Skip if label is just the Q-number: the label service falls
            # back to the Q-id when no label exists in any requested
            # language, which is useless for fuzzy name matching.
            if label.startswith("Q") and label[1:].isdigit():
                continue
            results.append({
                "wikidata_id": wikidata_id,
                "wikidata_url": item_uri,
                "label": label,
                "description": b.get("itemDescription", {}).get("value"),
                "website": b.get("website", {}).get("value"),
                "image": b.get("image", {}).get("value"),
            })
        logger.info(f" {type_name}: {len(results)} items")
        return results
    except Exception as e:
        # Best-effort: log and return an empty list so one failing type
        # does not abort the whole enrichment run.
        logger.error(f"Error fetching {type_name}: {e}")
        return []
def fetch_belgian_institutions() -> list[dict]:
    """Collect Belgian heritage institutions across all tracked types.

    Issues one SPARQL query per institution class, deduplicates results
    by Wikidata id (first occurrence wins), and pauses one second
    between queries to stay within endpoint rate limits.
    """
    logger.info("Fetching Belgian institutions from Wikidata...")
    # (Q-identifier, human-readable label) pairs for each P31 class.
    institution_types = [
        ("Q7075", "library"),
        ("Q166118", "archive"),
        ("Q33506", "museum"),
        ("Q207694", "art museum"),
        ("Q1007870", "public library"),
        ("Q2668072", "provincial archive"),
        ("Q473972", "city archive"),
        ("Q17431399", "local history museum"),
        ("Q210272", "cultural center"),
        ("Q28564", "public library"),
        ("Q856234", "national library"),
    ]
    # Insertion-ordered dict keyed by Q-id: keeps the first record seen
    # for each institution, matching the original first-wins behavior.
    unique: dict[str, dict] = {}
    for type_qid, type_label in institution_types:
        for record in fetch_belgian_institutions_by_type(type_qid, type_label):
            unique.setdefault(record["wikidata_id"], record)
        time.sleep(1)  # Rate limiting between queries
    merged = list(unique.values())
    logger.info(f"Total unique Belgian institutions: {len(merged)}")
    return merged
def get_instance_of(wikidata_id: str) -> list[str]:
    """Get instance_of (P31) values for a Wikidata entity.

    Args:
        wikidata_id: Q-identifier of the entity (e.g. "Q190804").

    Returns:
        List of Q-identifiers of the entity's P31 classes; empty list on
        any failure (best-effort enrichment detail, never raises).
    """
    sparql_query = f"""
SELECT ?type WHERE {{
wd:{wikidata_id} wdt:P31 ?type .
}}
"""
    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json"
    }
    try:
        response = httpx.get(
            WIKIDATA_SPARQL,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=30.0
        )
        response.raise_for_status()
        data = response.json()
        types = []
        for binding in data.get("results", {}).get("bindings", []):
            # URI tail (after the last "/") is the Q-id of the class.
            type_uri = binding.get("type", {}).get("value", "")
            type_id = type_uri.split("/")[-1] if type_uri else None
            if type_id:
                types.append(type_id)
        return types
    except Exception as e:
        # BUG FIX: was a bare "except:" that also swallowed
        # KeyboardInterrupt/SystemExit and hid all errors. Catch only
        # Exception and log, keeping the non-fatal empty-list fallback.
        logger.warning(f"Could not fetch P31 types for {wikidata_id}: {e}")
        return []
def get_custodian_name(data: dict) -> str:
    """Extract the best available display name from custodian data.

    Preference order: curated emic name, then the original source
    entry's name, then a bare top-level name. Returns "" when no
    non-empty name is found.
    """
    candidates = (
        data.get('custodian_name', {}).get('emic_name'),
        data.get('original_entry', {}).get('name'),
        data.get('name'),
    )
    for candidate in candidates:
        if candidate:
            return candidate
    return ""
def _find_best_match(name: str, institutions: list[dict]) -> tuple:
    """Return (best-scoring institution, score) for *name*.

    Returns (None, 0.0) when *institutions* is empty.
    """
    best = None
    best_score = 0.0
    for candidate in institutions:
        score = similarity(name, candidate['label'])
        if score > best_score:
            best_score = score
            best = candidate
    return best, best_score


def main():
    """Main enrichment process.

    Fetches Belgian institutions from Wikidata, then walks every
    data/custodian/BE-*.yaml file: files without an existing
    wikidata_enrichment block are fuzzy-matched by name, and matches at
    or above MATCH_THRESHOLD get an enrichment block written back.
    """
    wikidata_institutions = fetch_belgian_institutions()
    if not wikidata_institutions:
        logger.error("Failed to fetch Wikidata institutions")
        return

    data_dir = Path("data/custodian")
    be_files = sorted(data_dir.glob("BE-*.yaml"))
    logger.info(f"Processing {len(be_files)} Belgian custodian files")

    enriched_count = 0
    skipped_count = 0
    for i, filepath in enumerate(be_files):
        if (i + 1) % 50 == 0:
            logger.info(f"Progress: {i+1}/{len(be_files)} files processed, {enriched_count} enriched")
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                continue
            # Idempotency: never overwrite a previous enrichment.
            if 'wikidata_enrichment' in data:
                skipped_count += 1
                continue
            custodian_name = get_custodian_name(data)
            if not custodian_name:
                continue
            best_match, best_score = _find_best_match(custodian_name, wikidata_institutions)
            if best_score < MATCH_THRESHOLD:
                continue
            # Per-match P31 lookup; short sleep keeps us under the SPARQL
            # endpoint's rate limits.
            instance_of = get_instance_of(best_match['wikidata_id'])
            time.sleep(0.3)
            # Provenance fields record how and when the match was made so
            # it can be audited or re-run later.
            enrichment = {
                'wikidata_id': best_match['wikidata_id'],
                'wikidata_url': best_match['wikidata_url'],
                'matched_by': 'fuzzy_name_match',
                'match_score': round(best_score, 3),
                'matched_name': best_match['label'],
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_version': '2.1.0',
            }
            if best_match.get('label'):
                enrichment['wikidata_label'] = best_match['label']
            if best_match.get('description'):
                enrichment['wikidata_description'] = best_match['description']
            if best_match.get('website'):
                enrichment['official_website'] = best_match['website']
            if best_match.get('image'):
                enrichment['image'] = best_match['image']
            if instance_of:
                enrichment['instance_of'] = instance_of
            data['wikidata_enrichment'] = enrichment
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
            # BUG FIX: the old message ran the two names together with no
            # separator ("'name''label'"); add an explicit arrow.
            logger.info(
                f"Enriched {filepath.name}: '{custodian_name}' -> '{best_match['label']}' "
                f"({best_match['wikidata_id']}, score={best_score:.3f})"
            )
            enriched_count += 1
        except Exception as e:
            logger.error(f"Error processing {filepath}: {e}")

    logger.info("=" * 60)
    logger.info("ENRICHMENT COMPLETE")
    logger.info(f"Total BE files: {len(be_files)}")
    logger.info(f"Enriched: {enriched_count}")
    logger.info(f"Skipped (already enriched): {skipped_count}")
# Run the enrichment only when executed as a script, not on import.
if __name__ == "__main__":
    main()