# glam/scripts/enrich_algeria_wikidata_fuzzy.py
# Snapshot metadata: 2025-12-09 09:16:19 +01:00 · 340 lines · 13 KiB · Python
#!/usr/bin/env python3
"""
Wikidata enrichment for Algerian heritage institutions using fuzzy search.
Searches Wikidata by CONTAINS search rather than exact label match,
then uses fuzzy matching to verify results.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
import yaml
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
from rapidfuzz import fuzz
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Algeria-Wikidata-Enrichment/2.0"
def search_wikidata_fuzzy(name: str, city: Optional[str] = None, timeout: int = 60) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for an Algerian heritage institution matching *name*.

    Strategy: run one broad SPARQL query that returns every heritage
    institution located in Algeria (country Q262), then fuzzy-match the
    candidates client-side. This catches label/transliteration variants
    (French/Arabic/English) that an exact-label SPARQL search would miss.

    Parameters:
        name: institution name to match (any language).
        city: optional city name; when the Wikidata item's P131 location
            clearly disagrees (<80% similarity), the candidate's score is
            halved to avoid false positives.
        timeout: HTTP timeout in seconds for the SPARQL request.

    Returns:
        Dict with 'qid', 'name', 'description', 'match_score' and, when
        available, 'viaf', 'isil', 'website', 'founded_date',
        'latitude'/'longitude'; or None when no candidate reaches the 85%
        confidence threshold, or on timeout/network error.
    """
    # Single broad query: all Algerian museums/libraries/archives plus their
    # identifiers. No name filter here — matching happens client-side.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?cityLabel
    WHERE {
      # Must be in Algeria
      ?item wdt:P17 wd:Q262 .
      # Must be heritage institution type
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q33506 # Museum
        wd:Q7075 # Library
        wd:Q166118 # Archive
        wd:Q1030034 # Archaeological museum
        wd:Q473972 # Art museum
        wd:Q570116 # Public library
        wd:Q22687 # Synagogue
        wd:Q7840289 # Art gallery
        wd:Q2668072 # National library
      }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 100
    """
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }
    try:
        time.sleep(1.5)  # Rate limiting: be polite to the public WDQS endpoint
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        bindings = response.json().get("results", {}).get("bindings", [])
        if not bindings:
            return None
        best_match, best_score = _best_fuzzy_match(bindings, name, city)
        # Require minimum 85% match (raised from 70%)
        if best_match is None or best_score < 85:
            return None
        return _binding_to_result(best_match, best_score)
    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        print(f" ❌ Error: {e}")
        return None


def _best_fuzzy_match(bindings: List[Dict[str, Any]], name: str, city: Optional[str]):
    """Pick the SPARQL binding whose label best fuzzy-matches *name*.

    Each candidate is scored with the best of ratio / partial_ratio /
    token_set_ratio; the score is halved when *city* and the item's
    cityLabel disagree (<80% similarity). Returns (binding_or_None, score).
    """
    best_match: Optional[Dict[str, Any]] = None
    best_score: float = 0
    name_lower = name.lower()
    city_lower = city.lower() if city else None
    for binding in bindings:
        item_label = binding.get("itemLabel", {}).get("value", "").lower()
        wd_city = binding.get("cityLabel", {}).get("value", "").lower()
        # Best of three fuzzy strategies: exact, substring, token-set
        score = max(
            fuzz.ratio(name_lower, item_label),
            fuzz.partial_ratio(name_lower, item_label),
            fuzz.token_set_ratio(name_lower, item_label),
        )
        # City verification: if both have cities and they don't match, penalize
        if city_lower and wd_city and fuzz.ratio(city_lower, wd_city) < 80:
            print(f" ⚠️ City mismatch: {city} vs {wd_city} - penalizing match")
            score *= 0.5  # Penalize heavily
        if score > best_score:
            best_score = score
            best_match = binding
    return best_match, best_score


def _binding_to_result(binding: Dict[str, Any], score: float) -> Optional[Dict[str, Any]]:
    """Flatten a winning SPARQL binding into the enrichment result dict.

    Returns None when the item URI does not yield a valid Q-number.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None
    result: Dict[str, Any] = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
        "match_score": score
    }
    if "viaf" in binding:
        result["viaf"] = binding["viaf"]["value"]
    if "isil" in binding:
        result["isil"] = binding["isil"]["value"]
    if "website" in binding:
        result["website"] = binding["website"]["value"]
    if "inception" in binding:
        # Keep only the date part of the xsd:dateTime literal
        result["founded_date"] = binding["inception"]["value"].split("T")[0]
    if "coords" in binding:
        coords_str = binding["coords"]["value"]
        # WKT literal shaped like "Point(lon lat)"
        if coords_str.startswith("Point("):
            lon, lat = coords_str[6:-1].split()
            result["latitude"] = float(lat)
            result["longitude"] = float(lon)
    return result
def add_wikidata_to_institution(institution: dict, wikidata_result: dict):
    """Merge a Wikidata search result into an institution record, in place.

    Appends Wikidata / VIAF / ISIL identifiers (skipping any scheme the
    record already had) and stamps an enrichment note on the provenance.

    Parameters:
        institution: institution record (mutated in place).
        wikidata_result: dict from search_wikidata_fuzzy(); must contain
            'qid'; may contain 'viaf', 'isil', 'match_score'.
    """
    if 'identifiers' not in institution:
        institution['identifiers'] = []
    # Schemes present BEFORE this enrichment; used for all dedup checks below
    existing_schemes = {i.get('identifier_scheme') for i in institution['identifiers']}
    # Add Wikidata identifier
    if 'Wikidata' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': wikidata_result['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{wikidata_result['qid']}"
        })
    # Add VIAF if present
    if wikidata_result.get('viaf') and 'VIAF' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_result['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_result['viaf']}"
        })
    # Add ISIL if present. No identifier_url: ISIL codes have no universal
    # resolver URL. (Original source had this comment fused with a dangling
    # code fragment — cleaned up here.)
    if wikidata_result.get('isil') and 'ISIL' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'ISIL',
            'identifier_value': wikidata_result['isil']
        })
    # Update provenance notes
    if 'provenance' not in institution:
        institution['provenance'] = {}
    # `or ''` guards against an explicit `notes: null` in the YAML source,
    # which .get('notes', '') would pass through as None and crash the concat.
    notes = institution['provenance'].get('notes') or ''
    enrich_note = f" Wikidata enriched {datetime.now(timezone.utc).strftime('%Y-%m-%d')} ({wikidata_result['qid']}, match: {wikidata_result.get('match_score', 0):.0f}%)."
    institution['provenance']['notes'] = (notes + enrich_note).strip()
def save_checkpoint(data, input_file: Path, stats: dict):
    """Write the (possibly partially enriched) dataset back to *input_file*."""
    covered = stats['already_enriched'] + stats['enriched']
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']}, total coverage: {covered}/{stats['total']})")
    # Dict-shaped datasets (Tunisia) carry a _metadata block; list-shaped
    # ones (Algeria/Libya) do not, so only update metadata when present.
    if isinstance(data, dict) and '_metadata' in data:
        meta = data['_metadata']
        meta['generated'] = datetime.now(timezone.utc).isoformat()
        if 'Wikidata enrichment' not in meta.get('enhancements', []):
            meta['enhancements'].append('Wikidata enrichment')
    with open(input_file, 'w', encoding='utf-8') as handle:
        yaml.dump(data, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
def main():
    """Enrich the Algerian GLAM dataset with Wikidata identifiers, in place.

    Loads the YAML dataset, fuzzy-searches Wikidata for every institution
    that lacks a Wikidata identifier, writes checkpoints every 10 records,
    and prints summary statistics at the end.
    """
    input_file = Path('data/instances/algeria/algerian_institutions.yaml')
    print("Algeria Wikidata Enrichment (Fuzzy Search)")
    print("=" * 60)
    print("Features:")
    print(" - Broad SPARQL query (all Algerian heritage institutions)")
    print(" - Client-side fuzzy matching (85% threshold)")
    print(" - City verification (prevents false matches)")
    print(" - Checkpoint saving every 10 institutions")
    print(" - Multiple match strategies (exact, partial, token)")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # Handle both list format and dict with 'institutions' key
    institutions = data if isinstance(data, list) else data.get('institutions', [])
    print(f"Total institutions: {len(institutions)}")

    # Statistics
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'low_confidence': 0,
        'duplicate_prevented': 0
    }

    # Q-numbers already assigned (pre-existing plus this run) — each Wikidata
    # item may only be attached to one institution.
    used_qids = set()
    for inst in institutions:
        for ident in inst.get('identifiers', []):
            if ident.get('identifier_scheme') == 'Wikidata':
                used_qids.add(ident['identifier_value'])

    # Process each institution
    checkpoint_interval = 10
    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        city = inst.get('locations', [{}])[0].get('city', '') if inst.get('locations') else ''
        # Check if already has Wikidata (note: `ident`, not `id` — avoid
        # shadowing the builtin)
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}
        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers
                        if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue
        # Search Wikidata with fuzzy matching
        print(f"[{i}/{len(institutions)}] Searching: {name} ({city})")
        stats['searched'] += 1
        result = search_wikidata_fuzzy(name, city, timeout=60)
        if result:
            stats['found'] += 1
            match_score = result.get('match_score', 0)
            qid = result['qid']
            print(f" ✅ Found: {qid} - {result.get('name', '')} (match: {match_score:.0f}%)")
            if qid in used_qids:
                # Prevent the same Wikidata item serving two institutions
                stats['duplicate_prevented'] += 1
                stats['failed'] += 1
                print(f" ⚠️ Q-number {qid} already assigned to another institution, skipping")
            elif match_score >= 85:
                # Accept matches above 85% (function already filters, but double-check)
                add_wikidata_to_institution(inst, result)
                used_qids.add(qid)
                stats['enriched'] += 1
                print(" ✅ Enriched")
            else:
                stats['low_confidence'] += 1
                stats['failed'] += 1
                print(" ⚠️ Match score too low (<85%), skipping")
        else:
            stats['failed'] += 1
            print(" ❌ Not found")
        # Periodic checkpoint; the final save below always covers the tail,
        # so no `i == len(institutions)` clause (avoids a duplicate save).
        if i % checkpoint_interval == 0:
            save_checkpoint(data, input_file, stats)

    # Final save
    save_checkpoint(data, input_file, stats)

    # Print statistics
    print("\n" + "=" * 60)
    print("WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found: {stats['found']}")
    print(f"Enriched (new): {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f" - Low confidence: {stats['low_confidence']}")
    print(f" - Duplicate Q-numbers prevented: {stats['duplicate_prevented']}")
    covered = stats['already_enriched'] + stats['enriched']
    # Guard against ZeroDivisionError on an empty dataset
    pct = 100 * covered / stats['total'] if stats['total'] else 0.0
    print(f"\nFinal Wikidata coverage: {covered}/{stats['total']} ({pct:.1f}%)")
    if stats['enriched'] > 0:
        print(f"✨ Added {stats['enriched']} new Wikidata identifiers!")
    print("\n✅ Wikidata enrichment complete!")


if __name__ == '__main__':
    main()