glam/scripts/enrich_from_wikidata.py
2025-12-09 09:16:19 +01:00

444 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Enrich Latin American institutions with Wikidata identifiers and ISIL codes.
This script:
1. Queries Wikidata SPARQL endpoint for GLAM institutions in Brazil, Mexico, Chile
2. Fuzzy matches Wikidata results to our 304 existing institutions
3. Extracts Wikidata IDs, ISIL codes (P791), VIAF IDs (P214), and other identifiers
4. Updates the YAML dataset with enriched data
5. Generates a report on enrichment results
Usage:
python scripts/enrich_from_wikidata.py
Dependencies:
- SPARQLWrapper (for Wikidata queries)
- rapidfuzz (for fuzzy name matching)
- pyyaml (for YAML I/O)
"""
import sys
from pathlib import Path
from typing import Any
from datetime import datetime, timezone
import time
import yaml
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON # type: ignore
from rapidfuzz import fuzz, process # type: ignore
class WikidataEnricher:
"""Enrich heritage institutions with Wikidata identifiers."""
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Extractor/0.1 (https://github.com/yourusername/glam-extractor)"
# Country mappings
COUNTRIES = {
"BR": {"qid": "Q155", "name": "Brazil"},
"MX": {"qid": "Q96", "name": "Mexico"},
"CL": {"qid": "Q298", "name": "Chile"}
}
def __init__(self, input_file: Path, output_file: Path):
self.input_file = input_file
self.output_file = output_file
self.sparql = SPARQLWrapper(self.WIKIDATA_ENDPOINT)
self.sparql.setReturnFormat(JSON)
self.sparql.addCustomHttpHeader("User-Agent", self.USER_AGENT) # type: ignore
# Statistics
self.stats = {
"total_institutions": 0,
"wikidata_queries": 0,
"wikidata_results": 0,
"matched_institutions": 0,
"new_wikidata_ids": 0,
"new_isil_codes": 0,
"new_viaf_ids": 0,
"fuzzy_matches": 0,
"no_matches": 0,
}
def build_sparql_query(self, country_qid: str) -> str:
"""Build SPARQL query for GLAM institutions in a country."""
return f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?address ?website
WHERE {{
# Institution is located in the country
?item wdt:P17 wd:{country_qid} .
# Institution is one of our GLAM types
VALUES ?type {{
wd:Q7075 # library
wd:Q166118 # archive
wd:Q33506 # museum
wd:Q1007870 # art gallery
wd:Q28564 # public library
wd:Q11396180 # academic library
wd:Q207694 # art museum
wd:Q2772772 # history museum
wd:Q7140621 # cultural institution
wd:Q31855 # research institute
}}
?item wdt:P31 ?type .
# Optional identifiers
OPTIONAL {{ ?item wdt:P791 ?isil . }} # ISIL code
OPTIONAL {{ ?item wdt:P214 ?viaf . }} # VIAF ID
OPTIONAL {{ ?item wdt:P625 ?coords . }} # Coordinates
OPTIONAL {{ ?item wdt:P6375 ?address . }} # Address
OPTIONAL {{ ?item wdt:P856 ?website . }} # Official website
# Get labels
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,es,pt" . }}
}}
LIMIT 1000
"""
def query_wikidata(self, country_code: str) -> list[dict[str, Any]]:
"""Query Wikidata for institutions in a country."""
country_info = self.COUNTRIES[country_code]
print(f"\n🔍 Querying Wikidata for {country_info['name']} institutions...")
query = self.build_sparql_query(country_info["qid"])
self.sparql.setQuery(query)
try:
self.stats["wikidata_queries"] += 1
raw_results = self.sparql.query().convert() # type: ignore
bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
print(f" Found {len(bindings)} Wikidata institutions")
self.stats["wikidata_results"] += len(bindings)
# Parse results
institutions = []
for binding in bindings:
inst = self._parse_wikidata_result(binding, country_code)
if inst:
institutions.append(inst)
return institutions
except Exception as e:
print(f" ❌ Error querying Wikidata: {e}")
return []
def _parse_wikidata_result(self, binding: dict[str, Any], country_code: str) -> dict[str, Any] | None:
"""Parse a single Wikidata SPARQL result."""
try:
# Extract Wikidata QID from URI
item_uri = binding.get("item", {}).get("value", "")
qid = item_uri.split("/")[-1] if item_uri else None
if not qid or not qid.startswith("Q"):
return None
result: dict[str, Any] = {
"qid": qid,
"name": binding.get("itemLabel", {}).get("value", ""),
"description": binding.get("itemDescription", {}).get("value", ""),
"country": country_code,
"identifiers": {}
}
# Extract identifiers
if "isil" in binding:
result["identifiers"]["ISIL"] = binding["isil"]["value"]
if "viaf" in binding:
result["identifiers"]["VIAF"] = binding["viaf"]["value"]
if "website" in binding:
result["identifiers"]["Website"] = binding["website"]["value"]
# Extract location data
if "coords" in binding:
coords_str = binding["coords"]["value"]
# Parse "Point(lon lat)" format
if coords_str.startswith("Point("):
lon, lat = coords_str[6:-1].split()
result["latitude"] = float(lat)
result["longitude"] = float(lon)
if "address" in binding:
result["address"] = binding["address"]["value"]
return result
except Exception as e:
print(f" ⚠️ Error parsing Wikidata result: {e}")
return None
def fuzzy_match_institution(
self,
wikidata_inst: dict[str, Any],
our_institutions: list[dict[str, Any]]
) -> tuple[dict[str, Any], float] | None:
"""
Fuzzy match a Wikidata institution to our dataset.
Returns: (matched_institution, confidence_score) or None
"""
wikidata_name = wikidata_inst["name"]
country = wikidata_inst["country"]
# Filter candidates by country
candidates = [
inst for inst in our_institutions
if any(loc.get("country") == country for loc in inst.get("locations", []))
]
if not candidates:
return None
# Extract names for fuzzy matching
candidate_names = [(inst, inst.get("name", "")) for inst in candidates]
choice_names = [name for _, name in candidate_names]
# Use rapidfuzz to find best match
best_match = process.extractOne(
wikidata_name,
choice_names,
scorer=fuzz.token_sort_ratio
)
if not best_match:
return None
matched_name, score, _ = best_match
# Require minimum 80% match
if score < 80:
return None
# Find the institution object
matched_inst = next(
inst for inst, name in candidate_names if name == matched_name
)
return (matched_inst, score / 100.0)
def enrich_institution(
self,
institution: dict[str, Any],
wikidata_inst: dict[str, Any],
match_confidence: float
) -> bool:
"""
Enrich an institution with Wikidata data.
Returns: True if any new data was added
"""
enriched = False
# Ensure identifiers list exists
if "identifiers" not in institution or not institution["identifiers"]:
institution["identifiers"] = []
identifiers_list = institution["identifiers"]
existing_schemes = {
ident.get("identifier_scheme", "")
for ident in identifiers_list
if isinstance(ident, dict)
}
# Add Wikidata ID
if "Wikidata" not in existing_schemes:
identifiers_list.append({
"identifier_scheme": "Wikidata",
"identifier_value": wikidata_inst["qid"],
"identifier_url": f"https://www.wikidata.org/wiki/{wikidata_inst['qid']}"
})
self.stats["new_wikidata_ids"] += 1
enriched = True
# Add other identifiers from Wikidata
wd_identifiers = wikidata_inst.get("identifiers", {})
if isinstance(wd_identifiers, dict):
for scheme, value in wd_identifiers.items():
if scheme not in existing_schemes:
id_obj: dict[str, Any] = {
"identifier_scheme": scheme,
"identifier_value": value
}
# Add URLs for known schemes
if scheme == "ISIL":
# ISIL codes don't have a universal URL - identifier_value only
self.stats["new_isil_codes"] += 1
elif scheme == "VIAF":
id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
self.stats["new_viaf_ids"] += 1
elif scheme == "Website":
id_obj["identifier_url"] = value
identifiers_list.append(id_obj)
enriched = True
# Add location data if missing
if "latitude" in wikidata_inst and "longitude" in wikidata_inst:
locations = institution.get("locations", [])
if isinstance(locations, list) and len(locations) > 0:
first_loc = locations[0]
if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
first_loc["latitude"] = wikidata_inst["latitude"]
first_loc["longitude"] = wikidata_inst["longitude"]
enriched = True
# Update provenance
if enriched:
prov = institution.get("provenance", {})
if isinstance(prov, dict):
existing_method = prov.get("extraction_method", "")
prov["extraction_method"] = (
existing_method +
f" + Wikidata enrichment (match confidence: {match_confidence:.2f})"
)
return enriched
def run(self) -> None:
"""Run the complete enrichment workflow."""
print("🚀 Starting Wikidata enrichment for Latin American institutions\n")
print(f" Input: {self.input_file}")
print(f" Output: {self.output_file}")
# Load existing dataset
print("\n📖 Loading existing dataset...")
with open(self.input_file, 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f)
if not isinstance(institutions, list):
raise ValueError("Expected YAML file to contain a list of institutions")
self.stats["total_institutions"] = len(institutions)
print(f" Loaded {len(institutions)} institutions")
# Query Wikidata for each country
all_wikidata = []
for country_code in ["BR", "MX", "CL"]:
wikidata_results = self.query_wikidata(country_code)
all_wikidata.extend(wikidata_results)
time.sleep(1) # Rate limiting
print(f"\n📊 Total Wikidata results: {len(all_wikidata)}")
# Match and enrich
print("\n🔗 Matching Wikidata institutions to our dataset...")
for wikidata_inst in all_wikidata:
match_result = self.fuzzy_match_institution(wikidata_inst, institutions)
if match_result:
matched_inst, confidence = match_result
print(f" ✅ Matched: '{wikidata_inst['name']}''{matched_inst['name']}' ({confidence:.0%})")
if self.enrich_institution(matched_inst, wikidata_inst, confidence):
self.stats["matched_institutions"] += 1
if confidence < 0.95:
self.stats["fuzzy_matches"] += 1
else:
self.stats["no_matches"] += 1
# Write enriched dataset
print(f"\n💾 Writing enriched dataset to {self.output_file}...")
# Add header comment
header = f"""---
# Latin American GLAM Institutions - Wikidata Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Enrichment run: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
# Wikidata queries: {self.stats['wikidata_queries']}
# Wikidata results: {self.stats['wikidata_results']}
# Matched institutions: {self.stats['matched_institutions']}
# New Wikidata IDs: {self.stats['new_wikidata_ids']}
# New ISIL codes: {self.stats['new_isil_codes']}
# New VIAF IDs: {self.stats['new_viaf_ids']}
"""
with open(self.output_file, 'w', encoding='utf-8') as f:
f.write(header)
yaml.dump(
institutions,
f,
allow_unicode=True,
default_flow_style=False,
sort_keys=False,
width=120
)
# Print final statistics
self._print_report()
def _print_report(self) -> None:
"""Print enrichment report."""
print("\n" + "="*70)
print("📊 WIKIDATA ENRICHMENT REPORT")
print("="*70)
print(f"\n📚 Dataset Statistics:")
print(f" Total institutions in dataset: {self.stats['total_institutions']}")
print(f" Wikidata queries executed: {self.stats['wikidata_queries']}")
print(f" Wikidata institutions found: {self.stats['wikidata_results']}")
print(f"\n🔗 Matching Results:")
print(f" Successfully matched: {self.stats['matched_institutions']} ({self.stats['matched_institutions']/self.stats['total_institutions']*100:.1f}%)")
print(f" Fuzzy matches (< 95% confidence): {self.stats['fuzzy_matches']}")
print(f" No matches: {self.stats['no_matches']}")
print(f"\n✨ New Identifiers Added:")
print(f" Wikidata IDs: {self.stats['new_wikidata_ids']}")
print(f" ISIL codes: {self.stats['new_isil_codes']}")
print(f" VIAF IDs: {self.stats['new_viaf_ids']}")
print(f"\n💡 Next Steps:")
if self.stats['new_isil_codes'] > 0:
print(f" ✅ Found {self.stats['new_isil_codes']} ISIL codes from Wikidata!")
else:
print(f" ⚠️ No ISIL codes found in Wikidata")
print(f" → Proceed with national library outreach strategy")
if self.stats['fuzzy_matches'] > 0:
print(f" ⚠️ Review {self.stats['fuzzy_matches']} fuzzy matches manually")
print("\n" + "="*70 + "\n")
def main():
    """Entry point: resolve dataset paths, then run the enricher.

    Exits with status 1 on a missing input file, user interrupt, or any
    error raised during enrichment.
    """
    instances_dir = Path(__file__).parent.parent / "data" / "instances"
    src = instances_dir / "latin_american_institutions.yaml"
    dst = instances_dir / "latin_american_institutions_enriched.yaml"

    # Guard clause: bail out early if the dataset is not where we expect it.
    if not src.exists():
        print(f"❌ Error: Input file not found: {src}")
        sys.exit(1)

    try:
        WikidataEnricher(src, dst).run()
        print("✅ Enrichment complete!")
    except KeyboardInterrupt:
        print("\n\n⚠️ Enrichment interrupted by user")
        sys.exit(1)
    except Exception as err:
        print(f"\n❌ Error during enrichment: {err}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()