#!/usr/bin/env python3
"""
UNESCO Memory of the World (MoW) Enrichment Script

Enriches custodian YAML files with UNESCO Memory of the World inscription data.

Unlike ICH (country-based matching), MoW uses Wikidata ID matching to find
custodians that are directly listed as holding institutions for MoW inscriptions.

Data source: Wikidata SPARQL (UNESCO has no MoW API)
- P1435 = heritage designation
- Q16024238 = Memory of the World International Register
- P276 = location (holding institution)

This enables precise matching: if a custodian's Wikidata ID appears as P276
(location) on a MoW inscription, we have a direct link.

Usage:
    python scripts/enrich_unesco_mow.py [--dry-run] [--limit N] [--country CC]
    python scripts/enrich_unesco_mow.py --refresh-cache
    python scripts/enrich_unesco_mow.py --stats
"""

import argparse
import json
import logging
import sys
import urllib.request
import urllib.error
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, List, Set

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    import yaml
except ImportError:
    print("ERROR: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
CACHE_DIR = Path(__file__).parent.parent / "data" / "cache"
CACHE_FILE = CACHE_DIR / "unesco_mow_wikidata.json"
CACHE_MAX_AGE_DAYS = 7
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
# SPARQL query to get all Memory of the World inscriptions with location (custodian) data
MOW_SPARQL_QUERY = """
SELECT DISTINCT
  ?item ?itemLabel
  ?countryLabel ?country
  ?location ?locationLabel
  ?inscriptionYear
  ?desc
  ?image
WHERE {
  ?item wdt:P1435 wd:Q16024238 .  # Memory of the World International Register
  OPTIONAL {
    ?item wdt:P17 ?country .
  }
  OPTIONAL {
    ?item wdt:P276 ?location .  # Critical: this links to custodian institutions!
  }
  OPTIONAL {
    ?item wdt:P571 ?inception .
    BIND(YEAR(?inception) AS ?inscriptionYear)
  }
  OPTIONAL {
    ?item schema:description ?desc .
    FILTER(LANG(?desc) = "en")
  }
  OPTIONAL {
    ?item wdt:P18 ?image .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en,nl,de,fr,es,pt" . }
}
ORDER BY ?itemLabel
"""

class UNESCOMoWEnricher:
    """Enriches custodian files with UNESCO Memory of the World data."""

    def __init__(self, dry_run: bool = False):
        self.dry_run = dry_run
        self.mow_data: List[Dict] = []
        self.inscriptions_by_custodian: Dict[str, List[Dict]] = {}  # Wikidata ID -> list of inscriptions
        self.inscriptions_by_country: Dict[str, List[Dict]] = {}  # Country code -> list of inscriptions
        self.stats = {
            "inscriptions_fetched": 0,
            "inscriptions_with_location": 0,
            "unique_custodians_in_mow": 0,
            "countries_covered": 0,
            "custodians_processed": 0,
            "custodians_with_wikidata": 0,
            "custodians_matched_by_wikidata": 0,
            "custodians_enriched": 0,
            "mow_references_added": 0,
            "errors": 0
        }
    def fetch_mow_data(self, force_refresh: bool = False) -> List[Dict]:
        """Fetch MoW data from Wikidata SPARQL or cache."""
        CACHE_DIR.mkdir(parents=True, exist_ok=True)

        # Check cache
        if CACHE_FILE.exists() and not force_refresh:
            cache_age = datetime.now() - datetime.fromtimestamp(CACHE_FILE.stat().st_mtime)
            if cache_age.days < CACHE_MAX_AGE_DAYS:
                logger.info(f"📁 Loading cached MoW data ({cache_age.days}d {cache_age.seconds // 3600}h old)")
                with open(CACHE_FILE, 'r', encoding='utf-8') as f:
                    cache_data = json.load(f)
                self.mow_data = cache_data.get('inscriptions', [])
                logger.info(f"   Loaded {len(self.mow_data)} cached MoW inscriptions")
                self._build_indices()
                return self.mow_data

        # Fetch from Wikidata SPARQL
        logger.info("🌍 Fetching Memory of the World data from Wikidata SPARQL...")
        try:
            # Encode query for URL
            encoded_query = urllib.parse.urlencode({'query': MOW_SPARQL_QUERY})
            url = f"{WIKIDATA_SPARQL_ENDPOINT}?{encoded_query}"
            req = urllib.request.Request(
                url,
                headers={
                    'User-Agent': 'GLAM-Heritage-Enricher/1.0 (glam-data@example.com)',
                    'Accept': 'application/json'
                }
            )
            with urllib.request.urlopen(req, timeout=120) as response:
                result = json.loads(response.read().decode('utf-8'))
        except urllib.error.URLError as e:
            logger.error(f"Failed to fetch MoW data: {e}")
            raise

        # Parse SPARQL results
        bindings = result.get('results', {}).get('bindings', [])

        # Group by item (inscription) since one inscription may have multiple locations
        inscriptions_map: Dict[str, Dict] = {}
        for binding in bindings:
            item_uri = binding.get('item', {}).get('value', '')
            item_id = item_uri.split('/')[-1] if item_uri else ''
            if not item_id:
                continue
            if item_id not in inscriptions_map:
                inscriptions_map[item_id] = {
                    'wikidata_id': item_id,
                    'wikidata_url': item_uri,
                    'name': binding.get('itemLabel', {}).get('value', ''),
                    'description': binding.get('desc', {}).get('value', ''),
                    'country': binding.get('countryLabel', {}).get('value', ''),
                    'country_id': self._extract_id(binding.get('country', {}).get('value', '')),
                    'inscription_year': self._parse_year(binding.get('inscriptionYear', {}).get('value', '')),
                    'image_url': binding.get('image', {}).get('value', ''),
                    'locations': []  # Custodian institutions
                }
            # Add location if present (deduplicated by Wikidata ID)
            location_uri = binding.get('location', {}).get('value', '')
            location_id = self._extract_id(location_uri)
            location_label = binding.get('locationLabel', {}).get('value', '')
            if location_id and location_id not in [loc['wikidata_id'] for loc in inscriptions_map[item_id]['locations']]:
                inscriptions_map[item_id]['locations'].append({
                    'wikidata_id': location_id,
                    'wikidata_url': location_uri,
                    'name': location_label
                })

        self.mow_data = list(inscriptions_map.values())
        self.stats["inscriptions_fetched"] = len(self.mow_data)
        with_location = sum(1 for i in self.mow_data if i['locations'])
        self.stats["inscriptions_with_location"] = with_location
        logger.info(f"✅ Fetched {len(self.mow_data)} MoW inscriptions ({with_location} with custodian locations)")

        # Cache the data
        cache_data = {
            'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
            'inscriptions': self.mow_data
        }
        with open(CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=2)
        logger.info(f"   Cached to {CACHE_FILE}")

        self._build_indices()
        return self.mow_data
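
    # The cache file written above follows this JSON shape (a sketch based on
    # the dicts built in fetch_mow_data(); values are illustrative):
    #
    #   {
    #     "fetch_timestamp": "2025-01-01T00:00:00+00:00",
    #     "inscriptions": [
    #       {
    #         "wikidata_id": "Q123",
    #         "name": "Example archive collection",
    #         "country": "Netherlands",
    #         "inscription_year": 2005,
    #         "locations": [{"wikidata_id": "Q456", "name": "Example National Library"}]
    #       }
    #     ]
    #   }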

    def _extract_id(self, uri: str) -> str:
        """Extract Wikidata ID from URI."""
        if uri:
            return uri.split('/')[-1]
        return ''

    def _parse_year(self, year_str: str) -> Optional[int]:
        """Parse year from string."""
        if year_str:
            try:
                return int(year_str)
            except ValueError:
                pass
        return None

    def _build_indices(self):
        """Build indices for fast lookup by custodian Wikidata ID and country."""
        self.inscriptions_by_custodian = {}
        self.inscriptions_by_country = {}
        unique_custodians: Set[str] = set()
        for inscription in self.mow_data:
            # Index by custodian Wikidata ID
            for location in inscription.get('locations', []):
                custodian_id = location.get('wikidata_id', '')
                if custodian_id:
                    unique_custodians.add(custodian_id)
                    if custodian_id not in self.inscriptions_by_custodian:
                        self.inscriptions_by_custodian[custodian_id] = []
                    # Add inscription with specific location info
                    inscription_copy = inscription.copy()
                    inscription_copy['matched_location'] = location
                    self.inscriptions_by_custodian[custodian_id].append(inscription_copy)
            # Index by country (using ISO code mapping)
            country_code = self._country_to_code(inscription.get('country', ''))
            if country_code:
                if country_code not in self.inscriptions_by_country:
                    self.inscriptions_by_country[country_code] = []
                self.inscriptions_by_country[country_code].append(inscription)
        self.stats["unique_custodians_in_mow"] = len(unique_custodians)
        self.stats["countries_covered"] = len(self.inscriptions_by_country)
        logger.info(f"   Indexed {len(unique_custodians)} unique custodian institutions")
        logger.info(f"   Indexed {len(self.inscriptions_by_country)} countries with MoW inscriptions")

    def _country_to_code(self, country_name: str) -> Optional[str]:
        """Convert a country name to an ISO 3166-1 alpha-2 code."""
        # Simple mapping for common countries; unmapped names return None and
        # are left out of the country index.
        country_map = {
            "netherlands": "NL",
            "belgium": "BE",
            "germany": "DE",
            "france": "FR",
            "united kingdom": "GB",
            "united states of america": "US",
            "united states": "US",
            "japan": "JP",
            "china": "CN",
            "india": "IN",
            "brazil": "BR",
            "mexico": "MX",
            "spain": "ES",
            "italy": "IT",
            "portugal": "PT",
            "poland": "PL",
            "austria": "AT",
            "switzerland": "CH",
            "australia": "AU",
            "new zealand": "NZ",
            "canada": "CA",
            "south africa": "ZA",
            "egypt": "EG",
            "russia": "RU",
            "russian federation": "RU",
            "korea": "KR",
            "republic of korea": "KR",
            "south korea": "KR",
            "indonesia": "ID",
            "philippines": "PH",
            "thailand": "TH",
            "vietnam": "VN",
            "viet nam": "VN",
            "peru": "PE",
            "argentina": "AR",
            "colombia": "CO",
            "chile": "CL",
            "czech republic": "CZ",
            "czechia": "CZ",
            "hungary": "HU",
            "greece": "GR",
            "turkey": "TR",
            "iran": "IR",
            "iran (islamic republic of)": "IR",
            "israel": "IL",
            "morocco": "MA",
            "tunisia": "TN",
            "kenya": "KE",
            "nigeria": "NG",
            "ethiopia": "ET",
            "pakistan": "PK",
            "bangladesh": "BD",
            "malaysia": "MY",
            "singapore": "SG",
            "norway": "NO",
            "sweden": "SE",
            "denmark": "DK",
            "finland": "FI",
            "ireland": "IE",
            "luxembourg": "LU",
            "croatia": "HR",
            "slovenia": "SI",
            "slovakia": "SK",
            "romania": "RO",
            "bulgaria": "BG",
            "ukraine": "UA",
            "latvia": "LV",
            "lithuania": "LT",
            "estonia": "EE",
        }
        return country_map.get(country_name.lower().strip())

    def get_wikidata_id_from_custodian(self, data: dict) -> Optional[str]:
        """Extract Wikidata ID from custodian data."""
        # Try wikidata_enrichment first ("or {}" guards against null YAML sections)
        wikidata_id = (data.get('wikidata_enrichment') or {}).get('wikidata_entity_id')
        if wikidata_id:
            return wikidata_id
        # Try original_entry
        wikidata_id = (data.get('original_entry') or {}).get('wikidata_id')
        if wikidata_id:
            return wikidata_id
        # Try identifiers list
        for identifier in data.get('identifiers') or []:
            if identifier.get('identifier_scheme') == 'Wikidata':
                return identifier.get('identifier_value')
        return None
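
    # The three fallbacks above correspond to custodian YAML fragments like
    # these (field names taken from the lookups; values illustrative):
    #
    #   wikidata_enrichment:
    #     wikidata_entity_id: Q123
    #
    #   original_entry:
    #     wikidata_id: Q123
    #
    #   identifiers:
    #     - identifier_scheme: Wikidata
    #       identifier_value: Q123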

    def get_country_from_custodian(self, data: dict) -> Optional[str]:
        """Extract country code from custodian data."""
        # Try GHCID first (most reliable)
        ghcid = (data.get('ghcid') or {}).get('ghcid_current', '')
        if ghcid and len(ghcid) >= 2:
            return ghcid[:2].upper()
        # Try location_resolution
        loc_res = (data.get('ghcid') or {}).get('location_resolution', {})
        if isinstance(loc_res, dict):
            country = loc_res.get('country_code')
            if country:
                return country.upper()
        return None
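
    # Note: the GHCID fallback assumes identifiers start with the ISO country
    # code, e.g. a (hypothetical) ghcid_current of "NL-0001" yields "NL".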

    def enrich_custodian(self, file_path: Path) -> bool:
        """Enrich a single custodian file with MoW data."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                return False

            # Get Wikidata ID
            wikidata_id = self.get_wikidata_id_from_custodian(data)
            if wikidata_id:
                self.stats["custodians_with_wikidata"] += 1
            country_code = self.get_country_from_custodian(data)

            # Look up MoW inscriptions where this custodian is listed as location
            matched_inscriptions = []
            match_method = None
            if wikidata_id and wikidata_id in self.inscriptions_by_custodian:
                matched_inscriptions = self.inscriptions_by_custodian[wikidata_id]
                match_method = "wikidata_id_direct"
                self.stats["custodians_matched_by_wikidata"] += 1
            if not matched_inscriptions:
                return False

            # Get custodian name for logging ("or {}" guards against null YAML sections)
            custodian_name = (
                (data.get('custodian_name') or {}).get('claim_value', '') or
                (data.get('wikidata_enrichment') or {}).get('wikidata_label_en', '') or
                (data.get('original_entry') or {}).get('organisatie', '') or
                (data.get('google_maps_enrichment') or {}).get('name', '') or
                file_path.stem
            )

            # Build enrichment data
            mow_enrichment = {
                "match_method": match_method,
                "custodian_wikidata_id": wikidata_id,
                "total_inscriptions_held": len(matched_inscriptions),
                "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
                "inscriptions": []
            }

            # Add inscription details (sorted by year, most recent first)
            sorted_inscriptions = sorted(
                matched_inscriptions,
                key=lambda x: -(x.get('inscription_year') or 0)
            )
            for inscription in sorted_inscriptions[:20]:  # Limit to 20 inscriptions
                desc = inscription.get("description", "")
                if len(desc) > 400:
                    desc = desc[:400] + '...'
                mow_enrichment["inscriptions"].append({
                    "wikidata_id": inscription["wikidata_id"],
                    "name": inscription["name"],
                    "description": desc,
                    "country": inscription.get("country", ""),
                    "inscription_year": inscription.get("inscription_year"),
                    "wikidata_url": inscription.get("wikidata_url", ""),
                    # Generic URL, as UNESCO has no item-level MoW URLs
                    "unesco_mow_url": "https://www.unesco.org/en/memory-world",
                    "image_url": inscription.get("image_url", "")
                })

            # Update data
            data['unesco_mow_enrichment'] = mow_enrichment
            self.stats["mow_references_added"] += len(mow_enrichment["inscriptions"])

            # Log a short summary
            logger.info(f"📜 {custodian_name}: {len(mow_enrichment['inscriptions'])} MoW inscriptions (matched by {match_method})")
            for inscription in mow_enrichment["inscriptions"][:3]:
                year_str = f", {inscription['inscription_year']}" if inscription.get('inscription_year') else ""
                name = inscription['name']
                if len(name) > 60:
                    name = name[:60] + '...'
                logger.info(f"   📜 {name} ({inscription['country']}{year_str})")

            if self.dry_run:
                logger.info("   [DRY RUN - not saving]")
                return True

            # Save updated file
            with open(file_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
            self.stats["custodians_enriched"] += 1
            return True
        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            self.stats["errors"] += 1
            return False
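
    # On success, the custodian YAML gains a block like this (a sketch
    # following the dict built above; values are illustrative):
    #
    #   unesco_mow_enrichment:
    #     match_method: wikidata_id_direct
    #     custodian_wikidata_id: Q456
    #     total_inscriptions_held: 1
    #     enrichment_timestamp: '2025-01-01T00:00:00+00:00'
    #     inscriptions:
    #       - wikidata_id: Q123
    #         name: Example archive collection
    #         country: Netherlands
    #         inscription_year: 2005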

    def enrich_all(self, limit: Optional[int] = None, country_filter: Optional[str] = None):
        """Enrich all custodian files."""
        # Fetch MoW data first
        self.fetch_mow_data()

        # Find all custodian files
        if not CUSTODIAN_DIR.exists():
            logger.error(f"Custodian directory not found: {CUSTODIAN_DIR}")
            return
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        if country_filter:
            files = [f for f in files if f.name.startswith(country_filter.upper())]
            logger.info(f"Filtering to country: {country_filter.upper()}")
        if limit:
            files = files[:limit]

        logger.info(f"\n📂 Processing {len(files)} custodian files...")
        for file_path in files:
            self.stats["custodians_processed"] += 1
            self.enrich_custodian(file_path)

        self.print_stats()

    def print_stats(self):
        """Print enrichment statistics."""
        print("\n" + "=" * 65)
        print("UNESCO MEMORY OF THE WORLD ENRICHMENT STATISTICS")
        print("=" * 65)
        print(f"MoW inscriptions fetched: {self.stats['inscriptions_fetched']}")
        print(f"Inscriptions with custodian locations: {self.stats['inscriptions_with_location']}")
        print(f"Unique custodian institutions in MoW: {self.stats['unique_custodians_in_mow']}")
        print(f"Countries with MoW inscriptions: {self.stats['countries_covered']}")
        print("-" * 65)
        print(f"Custodian files processed: {self.stats['custodians_processed']}")
        print(f"Custodians with Wikidata ID: {self.stats['custodians_with_wikidata']}")
        print(f"Custodians matched by Wikidata ID: {self.stats['custodians_matched_by_wikidata']}")
        print(f"Custodians enriched: {self.stats['custodians_enriched']}")
        print(f"Total MoW references added: {self.stats['mow_references_added']}")
        print(f"Errors: {self.stats['errors']}")
        print("=" * 65)

    def show_stats_only(self):
        """Show MoW statistics without enriching."""
        self.fetch_mow_data()
        print("\n" + "=" * 65)
        print("UNESCO MEMORY OF THE WORLD STATISTICS (via Wikidata)")
        print("=" * 65)
        print(f"\nTotal inscriptions: {len(self.mow_data)}")
        print(f"Inscriptions with custodian (P276): {self.stats['inscriptions_with_location']}")
        print(f"Unique custodian institutions: {self.stats['unique_custodians_in_mow']}")

        # Top countries
        print("\nTop 15 Countries by MoW Inscriptions:")
        sorted_countries = sorted(
            self.inscriptions_by_country.items(),
            key=lambda x: -len(x[1])
        )[:15]
        for country, inscriptions in sorted_countries:
            print(f"  {country}: {len(inscriptions)} inscriptions")

        # Inscriptions by year
        year_counts = {}
        for inscription in self.mow_data:
            year = inscription.get('inscription_year')
            if year:
                year_counts[year] = year_counts.get(year, 0) + 1
        print("\nRecent Inscriptions by Year:")
        for year in sorted(year_counts.keys(), reverse=True)[:10]:
            print(f"  {year}: {year_counts[year]} inscriptions")

        # Sample custodians (institutions holding MoW inscriptions)
        print("\nSample Custodian Institutions (top 10 by inscription count):")
        sorted_custodians = sorted(
            self.inscriptions_by_custodian.items(),
            key=lambda x: -len(x[1])
        )[:10]
        for custodian_id, inscriptions in sorted_custodians:
            name = inscriptions[0].get('matched_location', {}).get('name', custodian_id)
            print(f"  {name} ({custodian_id}): {len(inscriptions)} inscriptions")


def main():
    parser = argparse.ArgumentParser(
        description="Enrich custodian files with UNESCO Memory of the World data"
    )
    parser.add_argument('--dry-run', action='store_true',
                        help="Don't save changes, just show what would be done")
    parser.add_argument('--limit', type=int,
                        help="Limit number of files to process")
    parser.add_argument('--country', type=str,
                        help="Filter to specific country code (e.g., NL, BE)")
    parser.add_argument('--refresh-cache', action='store_true',
                        help="Force refresh of cached MoW data")
    parser.add_argument('--stats', action='store_true',
                        help="Show MoW statistics only, don't enrich")
    args = parser.parse_args()

    enricher = UNESCOMoWEnricher(dry_run=args.dry_run)

    if args.refresh_cache:
        enricher.fetch_mow_data(force_refresh=True)
        print("Cache refreshed successfully.")
        return

    if args.stats:
        enricher.show_stats_only()
        return

    enricher.enrich_all(limit=args.limit, country_filter=args.country)


if __name__ == "__main__":
    main()