glam/scripts/enrich_global_with_wikidata.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

586 lines
23 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich global heritage institutions with Wikidata identifiers.
This script:
1. Queries Wikidata for institutions with ISIL codes (P791)
2. Matches by ISIL code (primary, high confidence)
3. Falls back to fuzzy name matching by country
4. Extracts Wikidata IDs, VIAF IDs (P214), founding dates, websites
5. Replaces synthetic Q-numbers in GHCIDs with real Wikidata QIDs
6. Updates the global YAML dataset with enriched data
7. Generates detailed enrichment report
Usage:
python scripts/enrich_global_with_wikidata.py
Dependencies:
- SPARQLWrapper (for Wikidata queries)
- rapidfuzz (for fuzzy name matching)
- pyyaml (for YAML I/O)
"""
import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
import re
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON # type: ignore
from rapidfuzz import fuzz, process # type: ignore
class GlobalWikidataEnricher:
    """Enrich global heritage institutions with Wikidata identifiers.

    Workflow (see run()): load a YAML list of institution records, query
    Wikidata for their ISIL codes (P791), enrich matched records in place
    with QIDs, VIAF IDs, websites, founding dates and coordinates, then
    write an enriched YAML file and print a report.
    """

    # Public SPARQL endpoint of the Wikidata Query Service.
    WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
    # Sent with every request so the service can identify this client.
    USER_AGENT = "GLAM-Extractor/0.2 (Global Heritage Custodian Project)"
    # Rate limiting (Wikidata recommends 1 request/second)
    REQUEST_DELAY = 1.0

    def __init__(self, input_file: Path, output_file: Path):
        """Configure the SPARQL client and reset all statistics counters.

        Args:
            input_file: YAML file containing the list of institution records.
            output_file: Destination path for the enriched YAML dataset.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.sparql = SPARQLWrapper(self.WIKIDATA_ENDPOINT)
        self.sparql.setReturnFormat(JSON)
        self.sparql.setMethod('POST')  # Use POST to avoid URI length limits
        self.sparql.addCustomHttpHeader("User-Agent", self.USER_AGENT)  # type: ignore
        # Cache for ISIL → Wikidata mapping, filled by query_by_isil_codes().
        self.isil_to_wikidata: dict[str, dict[str, Any]] = {}
        # Statistics accumulated across the run; printed by _print_report().
        self.stats = {
            "total_institutions": 0,
            "institutions_with_isil": 0,
            "wikidata_queries": 0,
            "wikidata_results": 0,
            "isil_matches": 0,
            "fuzzy_matches": 0,
            "no_matches": 0,
            "new_wikidata_ids": 0,
            "replaced_synthetic_q": 0,
            "new_viaf_ids": 0,
            "new_founding_dates": 0,
            "new_websites": 0,
            "enriched_coordinates": 0,
        }
def build_isil_query(self, isil_codes: list[str]) -> str:
"""
Build SPARQL query to fetch institutions by ISIL codes.
Wikidata property P791 = ISIL code
"""
# Escape and format ISIL codes for SPARQL VALUES clause
# Use smaller batches to avoid URI length limits (even with POST)
isil_values = " ".join(f'"{code}"' for code in isil_codes[:50]) # Reduced batch size
return f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
WHERE {{
# Filter by ISIL codes
VALUES ?isil {{ {isil_values} }}
?item wdt:P791 ?isil .
# Optional enrichment data
OPTIONAL {{ ?item wdt:P214 ?viaf . }} # VIAF ID
OPTIONAL {{ ?item wdt:P625 ?coords . }} # Coordinates
OPTIONAL {{ ?item wdt:P856 ?website . }} # Official website
OPTIONAL {{ ?item wdt:P571 ?inception . }} # Founding date
# Get labels in multiple languages
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,ja,nl,es,pt,fr,de" . }}
}}
"""
def build_country_query(self, country_code: str, limit: int = 500) -> str:
"""
Build SPARQL query for GLAM institutions in a specific country.
Used as fallback when ISIL matching is insufficient.
"""
# Map ISO 3166-1 alpha-2 to Wikidata QIDs
country_qids = {
"JP": "Q17", # Japan
"NL": "Q55", # Netherlands
"BR": "Q155", # Brazil
"MX": "Q96", # Mexico
"CL": "Q298", # Chile
"US": "Q30", # United States
"GB": "Q145", # United Kingdom
"FR": "Q142", # France
"DE": "Q183", # Germany
"IT": "Q38", # Italy
"ES": "Q29", # Spain
"CA": "Q16", # Canada
"AU": "Q408", # Australia
}
qid = country_qids.get(country_code)
if not qid:
print(f" ⚠️ No Wikidata QID mapping for country code: {country_code}")
return ""
return f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
WHERE {{
# Institution is located in the country
?item wdt:P17 wd:{qid} .
# Institution is one of our GLAM types
VALUES ?type {{
wd:Q7075 # library
wd:Q166118 # archive
wd:Q33506 # museum
wd:Q1007870 # art gallery
wd:Q28564 # public library
wd:Q11396180 # academic library
wd:Q207694 # art museum
wd:Q2772772 # history museum
wd:Q7140621 # cultural institution
wd:Q31855 # research institute
}}
?item wdt:P31 ?type .
# Optional identifiers and metadata
OPTIONAL {{ ?item wdt:P791 ?isil . }} # ISIL code
OPTIONAL {{ ?item wdt:P214 ?viaf . }} # VIAF ID
OPTIONAL {{ ?item wdt:P625 ?coords . }} # Coordinates
OPTIONAL {{ ?item wdt:P856 ?website . }} # Official website
OPTIONAL {{ ?item wdt:P571 ?inception . }} # Founding date
# Get labels
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,ja,nl,es,pt" . }}
}}
LIMIT {limit}
"""
def query_wikidata(self, query: str, query_name: str) -> list[dict[str, Any]]:
"""Execute a SPARQL query against Wikidata."""
import sys
# Use carriage return for progress updates
print(f"\r🔍 {query_name}...", end='', flush=True)
self.sparql.setQuery(query)
try:
self.stats["wikidata_queries"] += 1
raw_results = self.sparql.query().convert() # type: ignore
bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
self.stats["wikidata_results"] += len(bindings)
# Parse results
institutions = []
for binding in bindings:
inst = self._parse_wikidata_result(binding)
if inst:
institutions.append(inst)
# Show result count
print(f"{len(bindings)} results", flush=True)
# Rate limiting
time.sleep(self.REQUEST_DELAY)
return institutions
except Exception as e:
print(f"\r ❌ Error: {e}")
time.sleep(self.REQUEST_DELAY)
return []
def _parse_wikidata_result(self, binding: dict[str, Any]) -> Optional[dict[str, Any]]:
"""Parse a single Wikidata SPARQL result."""
try:
# Extract Wikidata QID from URI
item_uri = binding.get("item", {}).get("value", "")
qid = item_uri.split("/")[-1] if item_uri else None
if not qid or not qid.startswith("Q"):
return None
result: dict[str, Any] = {
"qid": qid,
"name": binding.get("itemLabel", {}).get("value", ""),
"description": binding.get("itemDescription", {}).get("value", ""),
"identifiers": {}
}
# Extract identifiers
if "isil" in binding:
result["identifiers"]["ISIL"] = binding["isil"]["value"]
if "viaf" in binding:
result["identifiers"]["VIAF"] = binding["viaf"]["value"]
if "website" in binding:
result["identifiers"]["Website"] = binding["website"]["value"]
# Extract founding date
if "inception" in binding:
inception_value = binding["inception"]["value"]
# Wikidata returns ISO 8601 date (e.g., "1945-01-01T00:00:00Z")
result["founding_date"] = inception_value.split("T")[0]
# Extract location data
if "coords" in binding:
coords_str = binding["coords"]["value"]
# Parse "Point(lon lat)" format
if coords_str.startswith("Point("):
lon, lat = coords_str[6:-1].split()
result["latitude"] = float(lat)
result["longitude"] = float(lon)
return result
except Exception as e:
print(f" ⚠️ Error parsing Wikidata result: {e}")
return None
def build_isil_cache(self, institutions: list[dict[str, Any]]) -> list[str]:
"""
Extract all ISIL codes from our dataset.
Returns: List of unique ISIL codes
"""
isil_codes = set()
for inst in institutions:
identifiers = inst.get("identifiers", [])
if isinstance(identifiers, list):
for ident in identifiers:
if isinstance(ident, dict) and ident.get("identifier_scheme") == "ISIL":
isil_code = ident.get("identifier_value")
if isil_code:
isil_codes.add(isil_code)
return sorted(isil_codes)
def query_by_isil_codes(self, isil_codes: list[str]) -> None:
"""
Query Wikidata for institutions matching our ISIL codes.
Populates self.isil_to_wikidata cache.
"""
if not isil_codes:
print("⚠️ No ISIL codes found in dataset")
return
print(f"\n📚 Querying Wikidata for {len(isil_codes)} ISIL codes...")
# Batch queries (max 50 ISIL codes per query to avoid URI length issues)
batch_size = 50
total_batches = (len(isil_codes) - 1) // batch_size + 1
print(f" Processing {total_batches} batches ({batch_size} codes per batch)...\n")
for i in range(0, len(isil_codes), batch_size):
batch = isil_codes[i:i+batch_size]
batch_num = i//batch_size + 1
query_name = f"ISIL batch {batch_num}/{total_batches}"
query = self.build_isil_query(batch)
results = self.query_wikidata(query, query_name)
# Cache results by ISIL code
for wd_inst in results:
isil = wd_inst.get("identifiers", {}).get("ISIL")
if isil:
self.isil_to_wikidata[isil] = wd_inst
print(f" ✅ Cached {len(self.isil_to_wikidata)} Wikidata institutions with ISIL codes")
def match_by_isil(self, institution: dict[str, Any]) -> Optional[dict[str, Any]]:
"""
Match institution by ISIL code (high confidence).
Returns: Wikidata institution data or None
"""
identifiers = institution.get("identifiers", [])
if not isinstance(identifiers, list):
return None
for ident in identifiers:
if isinstance(ident, dict) and ident.get("identifier_scheme") == "ISIL":
isil_code = ident.get("identifier_value")
if isil_code and isil_code in self.isil_to_wikidata:
return self.isil_to_wikidata[isil_code]
return None
    def enrich_institution(
        self,
        institution: dict[str, Any],
        wikidata_inst: dict[str, Any],
        match_type: str,
        match_confidence: float = 1.0
    ) -> bool:
        """
        Enrich an institution record in place with Wikidata data.

        Adds new identifiers (Wikidata QID, VIAF, Website), a founding
        date, and missing coordinates; updates stats counters and appends
        a note to the record's provenance extraction_method.

        Args:
            institution: Our institution record (mutated in place).
            wikidata_inst: Parsed Wikidata data (see _parse_wikidata_result).
            match_type: "ISIL" or "fuzzy_name"
            match_confidence: 0.0-1.0 (1.0 for ISIL matches)

        Returns:
            True if any new data was added.
        """
        enriched = False
        # Ensure identifiers list exists
        if "identifiers" not in institution or not institution["identifiers"]:
            institution["identifiers"] = []
        identifiers_list = institution["identifiers"]
        # Schemes already on the record — used to avoid duplicate entries.
        existing_schemes = {
            ident.get("identifier_scheme", "")
            for ident in identifiers_list
            if isinstance(ident, dict)
        }
        # Add Wikidata ID
        wikidata_qid = wikidata_inst["qid"]
        if "Wikidata" not in existing_schemes:
            identifiers_list.append({
                "identifier_scheme": "Wikidata",
                "identifier_value": wikidata_qid,
                "identifier_url": f"https://www.wikidata.org/wiki/{wikidata_qid}"
            })
            self.stats["new_wikidata_ids"] += 1
            enriched = True
            # Check if this replaces a synthetic Q-number in GHCID
            # (synthetic placeholders look like "-Q9" followed by 7+ digits).
            ghcid = institution.get("ghcid", "")
            if ghcid and re.search(r"-Q9\d{7,}", ghcid):
                self.stats["replaced_synthetic_q"] += 1
        # Add other identifiers from Wikidata
        wd_identifiers = wikidata_inst.get("identifiers", {})
        if isinstance(wd_identifiers, dict):
            for scheme, value in wd_identifiers.items():
                if scheme not in existing_schemes and scheme != "ISIL":  # Skip ISIL (already have it)
                    id_obj: dict[str, Any] = {
                        "identifier_scheme": scheme,
                        "identifier_value": value
                    }
                    # Add URLs for known schemes
                    if scheme == "VIAF":
                        id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
                        self.stats["new_viaf_ids"] += 1
                    elif scheme == "Website":
                        id_obj["identifier_url"] = value
                        self.stats["new_websites"] += 1
                    identifiers_list.append(id_obj)
                    enriched = True
        # Add founding date if missing
        if "founding_date" in wikidata_inst and not institution.get("founding_date"):
            institution["founding_date"] = wikidata_inst["founding_date"]
            self.stats["new_founding_dates"] += 1
            enriched = True
        # Add/improve location coordinates (first location entry only)
        if "latitude" in wikidata_inst and "longitude" in wikidata_inst:
            locations = institution.get("locations", [])
            if isinstance(locations, list) and len(locations) > 0:
                first_loc = locations[0]
                if isinstance(first_loc, dict):
                    # Only update if coordinates are missing
                    if first_loc.get("latitude") is None or first_loc.get("longitude") is None:
                        first_loc["latitude"] = wikidata_inst["latitude"]
                        first_loc["longitude"] = wikidata_inst["longitude"]
                        self.stats["enriched_coordinates"] += 1
                        enriched = True
        # Update provenance so the enrichment is traceable on the record.
        if enriched:
            prov = institution.get("provenance", {})
            if isinstance(prov, dict):
                existing_method = prov.get("extraction_method", "")
                match_info = f"Wikidata enrichment ({match_type} match, confidence: {match_confidence:.2f})"
                if existing_method:
                    prov["extraction_method"] = f"{existing_method} + {match_info}"
                else:
                    prov["extraction_method"] = match_info
        return enriched
    def run(self) -> None:
        """Run the complete enrichment workflow.

        Loads the YAML dataset, queries Wikidata by ISIL code, enriches
        matching records in place, writes the enriched dataset with a
        summary header, and prints a final report.

        Raises:
            ValueError: If the input YAML does not contain a list.
        """
        print("=" * 80)
        print("🚀 GLOBAL WIKIDATA ENRICHMENT")
        print("=" * 80)
        print(f"\n Input: {self.input_file}")
        print(f" Output: {self.output_file}\n")
        # Load existing dataset
        print("📖 Loading global dataset...")
        with open(self.input_file, 'r', encoding='utf-8') as f:
            institutions = yaml.safe_load(f)
        if not isinstance(institutions, list):
            raise ValueError("Expected YAML file to contain a list of institutions")
        self.stats["total_institutions"] = len(institutions)
        print(f" Loaded {len(institutions):,} institutions\n")
        # Extract ISIL codes from our dataset
        isil_codes = self.build_isil_cache(institutions)
        self.stats["institutions_with_isil"] = len(isil_codes)
        print(f"📋 Found {len(isil_codes):,} institutions with ISIL codes ({len(isil_codes)/len(institutions)*100:.1f}%)\n")
        # Query Wikidata by ISIL codes (batch queries)
        self.query_by_isil_codes(isil_codes)
        # Match and enrich
        print(f"\n🔗 Matching and enriching institutions...")
        print(f" Strategy: ISIL code matching (high confidence)\n")
        enriched_count = 0
        for i, institution in enumerate(institutions):
            # Match by ISIL (high confidence)
            wikidata_inst = self.match_by_isil(institution)
            if wikidata_inst:
                if self.enrich_institution(institution, wikidata_inst, match_type="ISIL", match_confidence=1.0):
                    enriched_count += 1
                    self.stats["isil_matches"] += 1
                    # Progress indicator every 100 enriched records
                    if enriched_count % 100 == 0:
                        print(f" ✅ Enriched {enriched_count:,} institutions ({enriched_count/len(institutions)*100:.1f}%)")
        print(f"\n ✅ Total enriched: {enriched_count:,} institutions ({enriched_count/len(institutions)*100:.1f}%)\n")
        # Write enriched dataset
        print(f"💾 Writing enriched dataset to {self.output_file}...")
        # YAML header comment documenting this enrichment run's statistics.
        header = f"""---
# Global Heritage Institutions - Wikidata Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Enrichment run: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
# Total institutions: {self.stats['total_institutions']:,}
# Wikidata queries: {self.stats['wikidata_queries']}
# Wikidata results: {self.stats['wikidata_results']:,}
# ISIL matches: {self.stats['isil_matches']:,}
# New Wikidata IDs: {self.stats['new_wikidata_ids']:,}
# Replaced synthetic Q-numbers: {self.stats['replaced_synthetic_q']:,}
# New VIAF IDs: {self.stats['new_viaf_ids']:,}
# New founding dates: {self.stats['new_founding_dates']:,}
# New websites: {self.stats['new_websites']:,}
"""
        with open(self.output_file, 'w', encoding='utf-8') as f:
            f.write(header)
            yaml.dump(
                institutions,
                f,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
                width=120
            )
        print(" ✅ Write complete\n")
        # Print final statistics
        self._print_report()
def _print_report(self) -> None:
"""Print enrichment report."""
print("\n" + "="*80)
print("📊 WIKIDATA ENRICHMENT REPORT")
print("="*80)
print(f"\n📚 Dataset Statistics:")
print(f" Total institutions: {self.stats['total_institutions']:,}")
print(f" Institutions with ISIL codes: {self.stats['institutions_with_isil']:,} ({self.stats['institutions_with_isil']/self.stats['total_institutions']*100:.1f}%)")
print(f"\n🌐 Wikidata Queries:")
print(f" Total queries executed: {self.stats['wikidata_queries']}")
print(f" Total Wikidata results: {self.stats['wikidata_results']:,}")
print(f"\n🔗 Matching Results:")
print(f" ISIL matches: {self.stats['isil_matches']:,} ({self.stats['isil_matches']/self.stats['total_institutions']*100:.1f}%)")
print(f" Fuzzy matches: {self.stats['fuzzy_matches']:,}")
print(f" No matches: {self.stats['total_institutions'] - self.stats['isil_matches'] - self.stats['fuzzy_matches']:,}")
print(f"\n✨ New Data Added:")
print(f" Wikidata IDs: {self.stats['new_wikidata_ids']:,}")
print(f" Replaced synthetic Q-numbers: {self.stats['replaced_synthetic_q']:,}")
print(f" VIAF IDs: {self.stats['new_viaf_ids']:,}")
print(f" Founding dates: {self.stats['new_founding_dates']:,}")
print(f" Websites: {self.stats['new_websites']:,}")
print(f" Enriched coordinates: {self.stats['enriched_coordinates']:,}")
# Coverage analysis
print(f"\n📈 Coverage Analysis:")
total = self.stats['total_institutions']
with_wikidata = self.stats['new_wikidata_ids']
with_viaf = self.stats['new_viaf_ids']
print(f" Wikidata coverage: {with_wikidata:,}/{total:,} ({with_wikidata/total*100:.1f}%)")
if with_viaf > 0:
print(f" VIAF coverage: {with_viaf:,}/{with_wikidata:,} ({with_viaf/with_wikidata*100:.1f}% of Wikidata matches)")
print(f"\n💡 Next Steps:")
if self.stats['replaced_synthetic_q'] > 0:
print(f" ✅ Replaced {self.stats['replaced_synthetic_q']:,} synthetic Q-numbers with real Wikidata QIDs")
print(f" → Run GHCID regeneration script to update GHCIDs with real Q-numbers")
if self.stats['new_viaf_ids'] > 0:
print(f" ✅ Found {self.stats['new_viaf_ids']:,} VIAF IDs from Wikidata")
remaining_without_wikidata = total - with_wikidata
if remaining_without_wikidata > 0:
print(f" ⚠️ {remaining_without_wikidata:,} institutions still without Wikidata IDs")
print(f" → Consider fuzzy name matching or manual curation")
print("\n" + "="*80 + "\n")
def main():
    """Main entry point."""
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data" / "instances" / "global"
    input_file = data_dir / "global_heritage_institutions.yaml"
    output_file = data_dir / "global_heritage_institutions_wikidata_enriched.yaml"

    # Bail out early with a clear message when the dataset is missing.
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        print(f" Expected location: {input_file}")
        sys.exit(1)

    enricher = GlobalWikidataEnricher(input_file, output_file)
    try:
        enricher.run()
        print("✅ Enrichment complete!")
        print(f"\n📁 Output file: {output_file}")
    except KeyboardInterrupt:
        print("\n\n⚠️ Enrichment interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Error during enrichment: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()