#!/usr/bin/env python3
"""
Wikidata enrichment for Argentine CONABIP libraries using fuzzy search.

Searches Wikidata for libraries in Argentina, then uses fuzzy matching
to verify results against CONABIP data.

GLAM Data Extraction Project
Schema: LinkML v0.2.1
Country: Argentina (AR)
Source: CONABIP (Comisión Nacional de Bibliotecas Populares)
"""
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from rapidfuzz import fuzz

# Wikidata Query Service endpoint (SPARQL over HTTP GET).
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Identifying User-Agent string, as requested by the WDQS usage policy.
USER_AGENT = "GLAM-Argentina-Wikidata-Enrichment/1.0"
def _fetch_argentina_library_bindings(timeout: int) -> List[Dict[str, Any]]:
    """Fetch all Argentine library/archive items from Wikidata via SPARQL.

    The query is constant, so the bindings are fetched once and memoized on
    the function object itself; subsequent calls reuse the cached result and
    skip both the network round-trip and the rate-limit sleep. (The original
    code re-ran the query — and slept 1 s — on every single lookup, which its
    own comment flagged as an optimization opportunity.)

    Raises requests/JSON exceptions on failure; the caller handles them.
    """
    cached = getattr(_fetch_argentina_library_bindings, "_cache", None)
    if cached is not None:
        return cached

    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?cityLabel ?provinceLabel
    WHERE {
      # Must be in Argentina
      ?item wdt:P17 wd:Q414 .

      # Must be library or archive type
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q7075      # Library
        wd:Q28564     # Public library
        wd:Q2668072   # National library
        wd:Q856234    # Community library
        wd:Q166118    # Archive
        wd:Q1622062   # Popular library (biblioteca popular)
      }

      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item wdt:P131/wdt:P131 ?province . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("es", "en")) }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
    }
    LIMIT 200
    """

    time.sleep(1.0)  # Rate limiting: be polite to the public WDQS endpoint
    response = requests.get(
        SPARQL_ENDPOINT,
        params={'query': query, 'format': 'json'},
        headers={'User-Agent': USER_AGENT},
        timeout=timeout,
    )
    response.raise_for_status()

    bindings = response.json().get("results", {}).get("bindings", [])
    _fetch_argentina_library_bindings._cache = bindings
    return bindings


def _score_candidate(
    binding: Dict[str, Any],
    name_lower: str,
    city: Optional[str],
    city_lower: Optional[str],
    province: Optional[str],
    province_lower: Optional[str],
    verbose: bool,
) -> float:
    """Fuzzy-score one SPARQL result row against a CONABIP record (0-100)."""
    item_label = binding.get("itemLabel", {}).get("value", "").lower()
    wd_city = binding.get("cityLabel", {}).get("value", "").lower()
    wd_province = binding.get("provinceLabel", {}).get("value", "").lower()

    # Best of three fuzzy strategies: full ratio, substring, token-set.
    score: float = max(
        fuzz.ratio(name_lower, item_label),
        fuzz.partial_ratio(name_lower, item_label),
        fuzz.token_set_ratio(name_lower, item_label),
    )

    # City verification: if both have cities and they match, boost score.
    if city_lower and wd_city:
        city_match = fuzz.ratio(city_lower, wd_city)
        if city_match > 80:  # Cities match well
            if verbose:
                print(f" ✓ City match: {city} ≈ {wd_city} (boost +5)")
            score = min(100, score + 5)  # Boost for city match
        elif city_match < 60:  # Cities don't match - only log if very low score
            if verbose and city_match < 40:
                print(f" ⚠️ City mismatch: {city} vs {wd_city}")
            score *= 0.7  # Penalize (reduced from 0.6)

    # Province verification (Argentina has 23 provinces + CABA).
    if province_lower and wd_province:
        prov_match = fuzz.partial_ratio(province_lower, wd_province)
        if prov_match > 80:
            if verbose:
                print(f" ✓ Province match: {province} ≈ {wd_province} (boost +3)")
            score = min(100, score + 3)

    return score


def _binding_to_result(binding: Dict[str, Any], score: float) -> Optional[Dict[str, Any]]:
    """Convert the winning SPARQL row into the enrichment result dict.

    Returns None when the item URI does not end in a valid Q-number.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None

    result: Dict[str, Any] = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
        "match_score": score,
    }

    if "viaf" in binding:
        result["viaf"] = binding["viaf"]["value"]
    if "isil" in binding:
        result["isil"] = binding["isil"]["value"]
    if "website" in binding:
        result["website"] = binding["website"]["value"]
    if "inception" in binding:
        # Wikidata returns a full xsd:dateTime; keep only the date part.
        result["founded_date"] = binding["inception"]["value"].split("T")[0]
    if "coords" in binding:
        coords_str = binding["coords"]["value"]
        if coords_str.startswith("Point("):
            # WKT literal is "Point(lon lat)" — note longitude comes first.
            lon, lat = coords_str[6:-1].split()
            result["latitude"] = float(lat)
            result["longitude"] = float(lon)

    return result


def search_wikidata_fuzzy(name: str, city: Optional[str] = None, province: Optional[str] = None, timeout: int = 60, verbose: bool = False) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for Argentine libraries using broad criteria.

    Returns best fuzzy match from results with 85% threshold, or None when
    no candidate clears it (or on any query error, which is printed).
    Includes city and province verification for better accuracy.

    The underlying SPARQL result is cached across calls, so only the first
    invocation hits the network.
    """
    try:
        bindings = _fetch_argentina_library_bindings(timeout)
        if not bindings:
            return None

        name_lower = name.lower()
        city_lower = city.lower() if city else None
        province_lower = province.lower() if province else None

        # Normalize CABA (Ciudad Autónoma de Buenos Aires)
        if city_lower and "ciudad autónoma" in city_lower:
            city_lower = "buenos aires"  # Treat CABA as Buenos Aires for matching

        best_match: Optional[Dict[str, Any]] = None
        best_score: float = 0
        for binding in bindings:
            score = _score_candidate(
                binding, name_lower, city, city_lower, province, province_lower, verbose
            )
            if score > best_score:
                best_score = score
                best_match = binding

        # Require minimum 85% match
        if best_score < 85:
            return None

        return _binding_to_result(best_match, best_score)

    except Exception as e:
        # Best-effort enrichment: report the failure and treat it as no match.
        print(f" ❌ Error querying Wikidata: {e}")
        return None
def enrich_conabip_with_wikidata(input_file: Path, output_file: Path) -> None:
    """
    Enrich CONABIP libraries with Wikidata Q-numbers.

    Reads the CONABIP JSON export from *input_file*, looks up each
    institution on Wikidata (skipping ones that already carry a Wikidata
    identifier), and writes the enriched dataset plus run statistics under
    ``metadata`` to *output_file*. Progress is printed to stdout.
    """
    print("=" * 80)
    print("ARGENTINA CONABIP WIKIDATA ENRICHMENT")
    print("=" * 80)
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print()

    # Load CONABIP data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data.get("institutions", [])
    total = len(institutions)

    print(f"📚 Loaded {total} institutions from CONABIP")
    print()

    # Run statistics. NOTE: "errors" can never increment here because
    # search_wikidata_fuzzy() swallows its own exceptions and returns None,
    # which is counted under "no_match"; kept for schema compatibility.
    stats = {
        "total": total,
        "enriched": 0,
        "skipped_existing": 0,
        "no_match": 0,
        "errors": 0
    }

    # Enrich each institution
    for idx, inst in enumerate(institutions, 1):
        name = inst.get("name", "Unknown")
        city = inst.get("city")
        province = inst.get("province")

        print(f"[{idx}/{total}] {name}")
        print(f" 📍 {city}, {province}")

        # Skip if already has Wikidata Q-number
        identifiers = inst.get("identifiers", [])
        has_wikidata = any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in identifiers
        )
        if has_wikidata:
            print(" ⏭️ Already has Wikidata")
            stats["skipped_existing"] += 1
            continue

        # Search Wikidata
        print(" 🔍 Searching...", end=" ", flush=True)
        wd_result = search_wikidata_fuzzy(name, city, province, verbose=False)

        if wd_result:
            qid = wd_result["qid"]
            match_score = wd_result["match_score"]

            print(f"✅ {qid} ({match_score:.0f}%)")

            # Add Wikidata identifier
            if "identifiers" not in inst:
                inst["identifiers"] = []
            inst["identifiers"].append({
                "identifier_scheme": "Wikidata",
                "identifier_value": qid,
                "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
                "match_score": match_score,
                "enrichment_date": datetime.now(timezone.utc).isoformat()
            })

            # Add additional identifiers from Wikidata if available
            extras = []
            if "viaf" in wd_result:
                inst["identifiers"].append({
                    "identifier_scheme": "VIAF",
                    "identifier_value": wd_result["viaf"],
                    "identifier_url": f"https://viaf.org/viaf/{wd_result['viaf']}",
                    "source": "Wikidata"
                })
                extras.append(f"VIAF:{wd_result['viaf']}")

            if "isil" in wd_result:
                inst["identifiers"].append({
                    "identifier_scheme": "ISIL",
                    "identifier_value": wd_result["isil"],
                    "source": "Wikidata"
                })
                extras.append(f"ISIL:{wd_result['isil']}")

            # Only fill in the website if CONABIP didn't already provide one.
            if "website" in wd_result and not inst.get("website"):
                inst["website"] = wd_result["website"]
                extras.append("Website")

            if "founded_date" in wd_result:
                inst["founded_date"] = wd_result["founded_date"]
                extras.append(f"Founded:{wd_result['founded_date']}")

            if extras:
                print(f" + {', '.join(extras)}")

            stats["enriched"] += 1
        else:
            print("⚠️ No match (< 85%)")
            stats["no_match"] += 1

    # Update metadata
    data["metadata"]["wikidata_enrichment_date"] = datetime.now(timezone.utc).isoformat()
    data["metadata"]["wikidata_enrichment_stats"] = stats

    # Save enriched data
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("=" * 80)
    print("ENRICHMENT COMPLETE")
    print("=" * 80)
    # Guard against ZeroDivisionError when the input has no institutions.
    enriched_pct = stats['enriched'] / total * 100 if total else 0.0
    print(f"✅ Enriched: {stats['enriched']}/{total} ({enriched_pct:.1f}%)")
    print(f"⏭️ Already had Wikidata: {stats['skipped_existing']}")
    print(f"⚠️ No match found: {stats['no_match']}")
    print(f"❌ Errors: {stats['errors']}")
    print()
    print(f"📁 Output saved to: {output_file}")
    print()
if __name__ == "__main__":
    # CONABIP data lives under data/isil/AR relative to the repository root
    # (the parent of this script's directory).
    ar_dir = Path(__file__).parent.parent / "data" / "isil" / "AR"

    # Run enrichment: read the full CONABIP export, write the enriched copy.
    enrich_conabip_with_wikidata(
        ar_dir / "conabip_libraries_enhanced_FULL.json",
        ar_dir / "conabip_libraries_wikidata_enriched.json",
    )