glam/scripts/batch_extract_mission_statements.py
2025-12-30 23:07:03 +01:00

1723 lines
62 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Batch extract mission statements from heritage custodian websites.
This script:
1. Finds Dutch custodians with websites
2. Discovers mission/vision/about pages
3. Uses Linkup API (primary) or Z.AI Web Reader (fallback) to fetch content
4. Creates LinkML-compliant mission_statement entries with full provenance
5. Updates custodian YAML files with extracted statements
Usage:
python scripts/batch_extract_mission_statements.py --test 5 # Test with 5 custodians
python scripts/batch_extract_mission_statements.py --province NL-NH # Noord-Holland only
python scripts/batch_extract_mission_statements.py --all # All Dutch custodians
python scripts/batch_extract_mission_statements.py --ghcid NL-ZH-ZUI-M-LMT # Single custodian
Requirements:
- httpx (pip install httpx)
- pyyaml
- LINKUP_API_KEY environment variable (primary)
- ZAI_API_TOKEN environment variable (fallback)
API Documentation:
- Linkup: https://docs.linkup.so/
- Z.AI: https://docs.z.ai/devpack/mcp/reader-mcp-server
"""
import argparse
import asyncio
import base64
import hashlib
import json
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Any, Union
from urllib.parse import urljoin, urlparse, quote
import httpx
import yaml
# Z.AI GLM API configuration (per Rule 11 in AGENTS.md)
ZAI_GLM_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_GLM_MODEL = "glm-4.5-air"  # Fast model that works reliably
# Add project root to path so project-local imports resolve when run as a script
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
# API configurations
LINKUP_API_URL = "https://api.linkup.so/v1/fetch"  # Linkup page-fetch endpoint (primary reader)
ZAI_MCP_URL = "https://api.z.ai/api/mcp/web_reader/mcp"  # Z.AI Web Reader MCP endpoint (fallback reader)
# Common mission page URL patterns for Dutch heritage institutions
# Ordered by likelihood of success (most common patterns first)
DUTCH_MISSION_PATTERNS = [
    "/over-ons",  # Most common Dutch pattern
    "/missie",  # Direct mission page
    "/over",  # Short version
    "/missie-en-visie",  # Combined mission/vision
    "/organisatie",  # Organization page often has mission
    "/about",  # English fallback
    "/visie",  # Vision page
    "/over-ons/missie",  # Nested mission page
    "/onze-missie",  # "Our mission"
    "/over/missie",
    "/organisatie/missie",
    "/het-museum/missie",
    "/het-museum/missie-en-visie",
    "/museum/missie",
    "/about/mission",
    "/wie-zijn-wij",
    "/about-us",
]
# Extended patterns for Dutch museum websites (discovered through testing)
DUTCH_MISSION_EXTENDED_PATTERNS = [
    "/het-muzeeum-organisatie/missie-visie",
    "/het-museum-organisatie/missie-visie",
    "/organisatie/missie-visie",
    "/over-het-museum/missie",
    "/over-het-museum/missie-en-visie",
    "/info/missie",
    "/info/over-ons",
    "/stichting/missie",
    "/museum/over-ons",
    "/museum/organisatie",
]
# Spanish mission page patterns (for Latin America)
SPANISH_MISSION_PATTERNS = [
    "/sobre-nosotros",  # About us
    "/quienes-somos",  # Who we are
    "/mision",  # Mission
    "/mision-y-vision",  # Mission and vision
    "/institucional",  # Institutional
    "/historia",  # History often contains mission
    "/el-museo",  # The museum
    "/acerca-de",  # About
    "/nuestra-mision",  # Our mission
    "/conocenos",  # Get to know us
    "/institucion",  # Institution
    "/nosotros",  # Us
    "/about",  # English fallback
    "/about-us",
]
# Portuguese mission page patterns (for Brazil, Portugal)
PORTUGUESE_MISSION_PATTERNS = [
    "/sobre",  # About
    "/sobre-nos",  # About us
    "/quem-somos",  # Who we are
    "/missao",  # Mission
    "/missao-e-visao",  # Mission and vision
    "/institucional",  # Institutional
    "/historia",  # History
    "/o-museu",  # The museum
    "/a-biblioteca",  # The library
    "/conheca",  # Get to know
    "/nossa-missao",  # Our mission
    "/about",  # English fallback
]
# German mission page patterns
GERMAN_MISSION_PATTERNS = [
    "/ueber-uns",  # About us
    "/uber-uns",  # Without umlaut
    "/leitbild",  # Mission statement
    "/mission",  # Mission
    "/das-museum",  # The museum
    "/institution",  # Institution
    "/wir-ueber-uns",  # We about us
    "/about",  # English fallback
]
# French mission page patterns
FRENCH_MISSION_PATTERNS = [
    "/a-propos",  # About
    "/qui-sommes-nous",  # Who are we
    "/mission",  # Mission
    "/notre-mission",  # Our mission
    "/le-musee",  # The museum
    "/presentation",  # Presentation
    "/historique",  # Historical
    "/about",  # English fallback
]
# English mission page patterns (international fallback)
ENGLISH_MISSION_PATTERNS = [
    "/about",
    "/about-us",
    "/mission",
    "/our-mission",
    "/mission-vision",
    "/mission-and-vision",
    "/who-we-are",
    "/the-museum",
    "/the-library",
    "/the-archive",
    "/history",
    "/institutional",
]
# Combined patterns - use all languages for maximum coverage.
# De-duplicated while preserving first-seen order (dict.fromkeys keeps
# insertion order): shared fallbacks such as "/about" and "/about-us" appear
# in several language lists, and without this the crawler would fetch the
# same candidate URL multiple times per site.
ALL_MISSION_PATTERNS = list(dict.fromkeys(
    DUTCH_MISSION_PATTERNS
    + SPANISH_MISSION_PATTERNS
    + PORTUGUESE_MISSION_PATTERNS
    + GERMAN_MISSION_PATTERNS
    + FRENCH_MISSION_PATTERNS
    + ENGLISH_MISSION_PATTERNS
))
# Keywords indicating mission/vision content (multilingual).
# Keys are the canonical statement types used in the extracted records;
# values are lowercase substrings matched against page content.
MISSION_KEYWORDS = {
    'mission': ['missie', 'mission', 'opdracht', 'kerntaak', 'misión', 'missão', 'leitbild'],
    'vision': ['visie', 'vision', 'toekomst', 'ambitie', 'visión', 'visão'],
    'goal': ['doelstelling', 'doel', 'doelen', 'goal', 'objective', 'objectives', 'ambitie',
             'objetivo', 'objetivos', 'ziel', 'ziele'],
    'value': ['waarde', 'waarden', 'kernwaarden', 'value', 'values', 'principle',
              'valor', 'valores', 'wert', 'werte'],
    'motto': ['motto', 'slogan', 'slagzin', 'lema'],
}
# ISO 3166-1 alpha-2 country code to ISO 639-1 language code mapping.
# Maps a country to its primary/official language; consulted when tagging
# extracted statements with a language based on the GHCID country prefix.
COUNTRY_TO_LANGUAGE = {
    # Dutch-speaking
    'NL': 'nl', 'BE': 'nl', 'SR': 'nl', 'AW': 'nl', 'CW': 'nl', 'SX': 'nl',
    # Spanish-speaking
    'AR': 'es', 'BO': 'es', 'CL': 'es', 'CO': 'es', 'CR': 'es', 'CU': 'es',
    'DO': 'es', 'EC': 'es', 'SV': 'es', 'GT': 'es', 'HN': 'es', 'MX': 'es',
    'NI': 'es', 'PA': 'es', 'PY': 'es', 'PE': 'es', 'PR': 'es', 'ES': 'es',
    'UY': 'es', 'VE': 'es', 'GQ': 'es',
    # Portuguese-speaking
    'BR': 'pt', 'PT': 'pt', 'AO': 'pt', 'MZ': 'pt', 'CV': 'pt', 'GW': 'pt',
    'ST': 'pt', 'TL': 'pt',
    # German-speaking
    'DE': 'de', 'AT': 'de', 'CH': 'de', 'LI': 'de', 'LU': 'de',
    # French-speaking
    'FR': 'fr', 'MC': 'fr', 'SN': 'fr', 'CI': 'fr', 'ML': 'fr', 'BF': 'fr',
    'NE': 'fr', 'TG': 'fr', 'BJ': 'fr', 'GA': 'fr', 'CG': 'fr', 'CD': 'fr',
    'MG': 'fr', 'HT': 'fr', 'RE': 'fr', 'MQ': 'fr', 'GP': 'fr', 'GF': 'fr',
    'NC': 'fr', 'PF': 'fr',
    # Italian-speaking
    'IT': 'it', 'SM': 'it', 'VA': 'it',
    # English-speaking (default)
    'US': 'en', 'GB': 'en', 'AU': 'en', 'NZ': 'en', 'CA': 'en', 'IE': 'en',
    'ZA': 'en', 'JM': 'en', 'TT': 'en', 'BB': 'en', 'GH': 'en', 'NG': 'en',
    'KE': 'en', 'UG': 'en', 'TZ': 'en', 'ZW': 'en', 'BW': 'en', 'MW': 'en',
    'ZM': 'en', 'PH': 'en', 'SG': 'en', 'MY': 'en', 'IN': 'en', 'PK': 'en',
    # Japanese
    'JP': 'ja',
    # Chinese
    'CN': 'zh', 'TW': 'zh', 'HK': 'zh', 'MO': 'zh',
    # Korean
    'KR': 'ko', 'KP': 'ko',
    # Russian
    'RU': 'ru', 'BY': 'ru', 'KZ': 'ru', 'KG': 'ru', 'TJ': 'ru',
    # Arabic
    'SA': 'ar', 'AE': 'ar', 'QA': 'ar', 'KW': 'ar', 'BH': 'ar', 'OM': 'ar',
    'YE': 'ar', 'JO': 'ar', 'SY': 'ar', 'LB': 'ar', 'IQ': 'ar', 'EG': 'ar',
    'LY': 'ar', 'TN': 'ar', 'DZ': 'ar', 'MA': 'ar', 'SD': 'ar', 'MR': 'ar',
    # Other
    'CZ': 'cs', 'SK': 'sk', 'PL': 'pl', 'HU': 'hu', 'RO': 'ro', 'BG': 'bg',
    'HR': 'hr', 'RS': 'sr', 'SI': 'sl', 'GR': 'el', 'TR': 'tr', 'IL': 'he',
    'TH': 'th', 'VN': 'vi', 'ID': 'id', 'SE': 'sv', 'NO': 'no', 'DK': 'da',
    'FI': 'fi', 'IS': 'is', 'EE': 'et', 'LV': 'lv', 'LT': 'lt', 'UA': 'uk',
}


def get_language_from_ghcid(ghcid: str) -> str:
    """Derive an ISO 639-1 language code from a GHCID's country prefix.

    Args:
        ghcid: GHCID string (e.g., "AR-C-BUE-M-MAD")

    Returns:
        ISO 639-1 language code (e.g., "es" for Argentina). Falls back to
        'en' when the GHCID is empty, shorter than two characters, or the
        country prefix is not in COUNTRY_TO_LANGUAGE.
    """
    if ghcid and len(ghcid) >= 2:
        return COUNTRY_TO_LANGUAGE.get(ghcid[:2].upper(), 'en')
    return 'en'
def compute_content_hash(text: str) -> str:
    """Return the SHA-256 digest of *text* as a Subresource-Integrity string.

    The result has the form "sha256-<base64 digest>", matching the SRI
    format used in the provenance records.
    """
    digest = hashlib.sha256(text.encode('utf-8')).digest()
    encoded = base64.b64encode(digest).decode('ascii')
    return f"sha256-{encoded}"
def get_api_tokens() -> dict:
    """Get API tokens from the environment, with a .env file fallback.

    Environment variables take precedence; the project-root .env file is
    only consulted for tokens not already set in the environment.

    Returns:
        dict with 'linkup' and/or 'zai' keys containing API tokens

    Raises:
        ValueError: if neither LINKUP_API_KEY nor ZAI_API_TOKEN is found.
    """
    tokens = {}
    # Try environment variables first
    linkup_token = os.environ.get('LINKUP_API_KEY')
    zai_token = os.environ.get('ZAI_API_TOKEN')
    # Try loading from .env file if not in environment
    env_path = PROJECT_ROOT / '.env'
    if env_path.exists():
        # Minimal .env parser: KEY=value lines, surrounding quotes stripped.
        # Explicit encoding so the parse does not depend on the locale.
        with open(env_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue  # skip comment lines
                if line.startswith('LINKUP_API_KEY=') and not linkup_token:
                    linkup_token = line.split('=', 1)[1].strip().strip('"\'')
                elif line.startswith('ZAI_API_TOKEN=') and not zai_token:
                    zai_token = line.split('=', 1)[1].strip().strip('"\'')
    if linkup_token:
        tokens['linkup'] = linkup_token
    if zai_token:
        tokens['zai'] = zai_token
    if not tokens:
        raise ValueError(
            "No API tokens found. Set LINKUP_API_KEY or ZAI_API_TOKEN environment variable."
        )
    return tokens
class LinkupWebReader:
    """
    Client for Linkup API - simple and reliable web fetching.

    Reference: https://docs.linkup.so/
    """

    def __init__(self, api_key: str):
        # Bearer-token auth with JSON request bodies.
        self.api_key = api_key
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """
        Read webpage content using Linkup API.

        Returns:
            dict with keys: content, success, error, url, retrieved_on
        """
        def failure(reason: str) -> dict:
            # All failure results share the same shape.
            return {"success": False, "url": url, "error": reason}

        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                response = await client.post(
                    LINKUP_API_URL,
                    headers=self.headers,
                    json={"url": url},
                )
                if response.status_code != 200:
                    return failure(f"HTTP {response.status_code}: {response.text[:200]}")
                payload = response.json()
                # Linkup returns markdown content directly
                content = payload.get("markdown", payload.get("content", ""))
                if not content:
                    return failure("No content returned")
                return {
                    "success": True,
                    "url": url,
                    "content": content,
                    "retrieved_on": datetime.now(timezone.utc).isoformat(),
                }
            except httpx.TimeoutException:
                return failure("Request timed out")
            except Exception as e:
                return failure(str(e))
class ZAIWebReader:
    """
    Client for Z.AI Web Reader MCP API using Streamable HTTP transport.

    The MCP protocol requires:
    1. Initialize session
    2. Send notifications/initialized
    3. Call tools

    Reference: https://docs.z.ai/devpack/mcp/reader-mcp-server
    """

    def __init__(self, api_token: str):
        self.api_token = api_token
        # Assigned by the server via the 'mcp-session-id' response header and
        # echoed back on all subsequent requests.
        self.session_id = None
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream",  # Required for MCP Streamable HTTP
        }

    def _parse_sse_response(self, text: str) -> dict:
        """Parse Server-Sent Events (SSE) response format from MCP API.

        SSE format:
            id:1
            event:message
            data:{"jsonrpc":"2.0",...}

        Returns the parsed JSON data from the 'data' field. If several
        'data:' lines parse successfully, the last one wins; lines that are
        not valid JSON are ignored.
        """
        result = {}
        for line in text.strip().split('\n'):
            if line.startswith('data:'):
                data_content = line[5:].strip()
                if data_content:
                    try:
                        result = json.loads(data_content)
                    except json.JSONDecodeError:
                        # Ignore non-JSON data lines (e.g. keep-alives)
                        pass
        return result

    async def _send_request(self, client: httpx.AsyncClient, method: str, params: Optional[dict] = None, request_id: int = 1) -> dict:
        """Send a JSON-RPC request to the MCP server and parse SSE response.

        Returns dict with keys:
        - success: bool
        - status_code: int
        - data: parsed JSON-RPC result (if success)
        - error: error message (if not success)
        """
        request_body = {
            "jsonrpc": "2.0",
            "method": method,
            "id": request_id
        }
        if params:
            request_body["params"] = params
        # Add session header if we have one
        headers = self.headers.copy()
        if self.session_id:
            headers["mcp-session-id"] = self.session_id
        response = await client.post(ZAI_MCP_URL, headers=headers, json=request_body)
        # Check for session ID in response headers; captured even on error
        # responses so the session survives across calls.
        if "mcp-session-id" in response.headers:
            self.session_id = response.headers["mcp-session-id"]
        if response.status_code != 200:
            return {
                "success": False,
                "status_code": response.status_code,
                "error": f"HTTP {response.status_code}: {response.text[:200]}"
            }
        # Parse SSE response
        parsed = self._parse_sse_response(response.text)
        if not parsed:
            return {
                "success": False,
                "status_code": response.status_code,
                "error": f"Failed to parse SSE response: {response.text[:200]}"
            }
        return {
            "success": True,
            "status_code": response.status_code,
            "data": parsed
        }

    async def initialize(self, client: httpx.AsyncClient) -> bool:
        """Initialize MCP session; returns True on success, False otherwise."""
        try:
            response = await self._send_request(
                client,
                "initialize",
                {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {
                        "name": "glam-mission-extractor",
                        "version": "1.0.0"
                    }
                },
                request_id=1
            )
            if response.get("success"):
                # Send initialized notification (second step of the MCP handshake)
                await self._send_request(client, "notifications/initialized", {}, request_id=2)
                return True
            return False
        except Exception as e:
            print(f"Initialize error: {e}", file=sys.stderr)
            return False

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """
        Read webpage content using Z.AI Web Reader.

        Returns:
            dict with keys: title, content, metadata, links, success, error
        """
        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                # Initialize session first (lazily, on the first call)
                if not self.session_id:
                    await self.initialize(client)
                # Call webReader tool
                response = await self._send_request(
                    client,
                    "tools/call",
                    {
                        "name": "webReader",
                        "arguments": {
                            "url": url
                        }
                    },
                    request_id=3
                )
                if not response.get("success"):
                    return {
                        "success": False,
                        "url": url,
                        "error": response.get("error", "Unknown error"),
                    }
                result = response.get("data", {})
                # Parse MCP response
                if "result" in result:
                    content_data = result["result"]
                    # Extract content from MCP response format
                    if isinstance(content_data, dict):
                        # Check for content array (MCP tools/call response format)
                        if "content" in content_data and isinstance(content_data["content"], list):
                            text_parts = []
                            for item in content_data["content"]:
                                if isinstance(item, dict) and item.get("type") == "text":
                                    text_parts.append(item.get("text", ""))
                            content_text = "\n".join(text_parts)
                        else:
                            # Fall back to flat 'content'/'text' fields
                            content_text = content_data.get("content", content_data.get("text", ""))
                        return {
                            "success": True,
                            "url": url,
                            "title": content_data.get("title", ""),
                            "content": content_text,
                            "metadata": content_data.get("metadata", {}),
                            "links": content_data.get("links", []),
                            "retrieved_on": datetime.now(timezone.utc).isoformat(),
                        }
                    elif isinstance(content_data, list) and len(content_data) > 0:
                        # Array of content blocks
                        text_content = ""
                        for block in content_data:
                            if isinstance(block, dict):
                                if block.get("type") == "text":
                                    text_content += block.get("text", "") + "\n"
                                elif "text" in block:
                                    text_content += block["text"] + "\n"
                            elif isinstance(block, str):
                                text_content += block + "\n"
                        return {
                            "success": True,
                            "url": url,
                            "content": text_content.strip(),
                            "retrieved_on": datetime.now(timezone.utc).isoformat(),
                        }
                # Check for error in response
                if "error" in result:
                    return {
                        "success": False,
                        "url": url,
                        "error": f"MCP error: {result['error']}",
                    }
                return {
                    "success": False,
                    "url": url,
                    "error": f"Unexpected response format: {str(result)[:200]}",
                }
            except httpx.HTTPStatusError as e:
                return {
                    "success": False,
                    "url": url,
                    "error": f"HTTP {e.response.status_code}: {e.response.text[:200]}",
                }
            except Exception as e:
                return {
                    "success": False,
                    "url": url,
                    "error": str(e),
                }
class GLMMissionExtractor:
    """
    LLM-based mission statement extractor using Z.AI GLM API.

    This provides intelligent extraction of mission, vision, and goal statements
    from webpage content, replacing naive keyword matching with semantic understanding.

    Uses Z.AI Coding Plan endpoint per Rule 11 in AGENTS.md.
    """

    # Prompt template for mission statement extraction. The prompt is Dutch
    # (the target sites are Dutch heritage institutions) and instructs the
    # model to answer with a single JSON object only. Double braces {{ }}
    # are literal braces, escaped because str.format() is applied below.
    EXTRACTION_PROMPT = """Je bent een expert in het analyseren van websites van Nederlandse erfgoedinstellingen (musea, archieven, bibliotheken, etc.).
Analyseer de volgende webpagina-inhoud en extraheer de missie, visie en/of doelstellingen van de organisatie.
## Instructies:
1. Zoek naar expliciete missie- of visie-statements
2. Let op zinnen die beginnen met "Onze missie is...", "Wij streven naar...", "Het museum heeft als doel...", etc.
3. Negeer navigatie-elementen, footer-tekst, contactgegevens, openingstijden
4. Negeer advertenties, nieuwsberichten, en evenement-aankondigingen
5. Als er GEEN duidelijke missie/visie/doelstelling te vinden is, retourneer een leeg resultaat
## Output Format (JSON):
Retourneer ALLEEN een JSON object in dit exacte formaat:
```json
{{
"mission": "De missie-tekst hier, of null als niet gevonden",
"vision": "De visie-tekst hier, of null als niet gevonden",
"goals": "De doelstellingen hier, of null als niet gevonden",
"confidence": 0.85,
"source_section": "Naam van de sectie waar dit gevonden is (bijv. 'Over ons', 'Missie en Visie')"
}}
```
## Webpagina inhoud:
{content}
## Let op:
- Retourneer ALLEEN het JSON object, geen andere tekst
- Confidence moet tussen 0.0 en 1.0 zijn
- Als niets gevonden: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null}}
"""

    def __init__(self, api_token: str, model: str = ZAI_GLM_MODEL):
        self.api_token = api_token
        self.model = model
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }

    async def extract_mission_from_content(
        self,
        content: str,
        source_url: str,
        timeout: float = 60.0
    ) -> dict:
        """
        Use LLM to extract mission statement from webpage content.

        Args:
            content: The webpage text content (markdown or plain text)
            source_url: URL of the source page (for context; currently unused
                in the request itself)
            timeout: Request timeout in seconds

        Returns:
            dict with keys: success, mission, vision, goals, confidence, error
        """
        # Truncate content if too long (GLM has context limits)
        max_chars = 12000
        if len(content) > max_chars:
            content = content[:max_chars] + "\n\n[... content truncated ...]"
        # Build the prompt
        prompt = self.EXTRACTION_PROMPT.format(content=content)
        request_body = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": "Je bent een assistent die JSON-gestructureerde data extraheert uit webpagina's. Antwoord ALLEEN met valid JSON."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": 0.1,  # Low temperature for consistent extraction
            "max_tokens": 2048,
        }
        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                response = await client.post(
                    ZAI_GLM_API_URL,
                    headers=self.headers,
                    json=request_body
                )
                if response.status_code != 200:
                    return {
                        "success": False,
                        "error": f"API error {response.status_code}: {response.text[:200]}",
                    }
                result = response.json()
                # Extract the assistant's response
                if "choices" not in result or len(result["choices"]) == 0:
                    return {
                        "success": False,
                        "error": "No response from API",
                    }
                assistant_message = result["choices"][0]["message"]["content"]
                # Parse JSON from response
                # Handle markdown code blocks if present
                json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', assistant_message)
                if json_match:
                    json_str = json_match.group(1)
                else:
                    json_str = assistant_message.strip()
                try:
                    extracted = json.loads(json_str)
                except json.JSONDecodeError as e:
                    # Truncated raw response included to aid debugging
                    return {
                        "success": False,
                        "error": f"Failed to parse JSON response: {e}",
                        "raw_response": assistant_message[:500],
                    }
                # Validate and return
                return {
                    "success": True,
                    "mission": extracted.get("mission"),
                    "vision": extracted.get("vision"),
                    "goals": extracted.get("goals"),
                    "confidence": extracted.get("confidence", 0.0),
                    "source_section": extracted.get("source_section"),
                    "model": self.model,
                }
            except httpx.TimeoutException:
                return {
                    "success": False,
                    "error": "Request timed out",
                }
            except Exception as e:
                return {
                    "success": False,
                    "error": str(e),
                }
# Priority-ordered (section, field) pairs searched for a website URL after
# the top-level 'website' field.
_NESTED_WEBSITE_FIELDS = [
    ('original_entry', 'webadres_organisatie'),
    ('museum_register_enrichment', 'website_url'),
    ('wikidata_enrichment', 'official_website'),
    ('google_maps_enrichment', 'website'),
    ('location', 'website'),
]


def website_from_identifiers(identifiers) -> Optional[str]:
    """Return the first non-empty URL from identifiers with scheme 'Website'.

    Internal helper for website extraction; accepts None or a list of
    (possibly malformed) identifier dicts.
    """
    for ident in identifiers or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
            url = ident.get('identifier_value') or ident.get('identifier_url')
            if url:
                return url
    return None


def extract_website_url(data: dict) -> Optional[str]:
    """Find a custodian's website URL, checking sources in priority order.

    Order: direct 'website' field, then the nested enrichment sections in
    _NESTED_WEBSITE_FIELDS, then 'Website'-scheme identifiers inside
    original_entry, then top-level identifiers. Returns None when nothing
    is found.
    """
    if data.get('website'):
        return data['website']
    for section_key, field in _NESTED_WEBSITE_FIELDS:
        section = data.get(section_key)
        if isinstance(section, dict) and section.get(field):
            return section[field]
    original_entry = data.get('original_entry')
    if isinstance(original_entry, dict):
        url = website_from_identifiers(original_entry.get('identifiers'))
        if url:
            return url
    return website_from_identifiers(data.get('identifiers'))


def find_custodians_with_websites(
    prefix: Optional[str] = None,
    limit: Optional[int] = None
) -> list[tuple[Path, dict, str]]:
    """
    Find custodian YAML files that have website URLs.

    Args:
        prefix: Filter by GHCID prefix (e.g., "NL-NH" for Noord-Holland).
            Defaults to all Dutch custodians ("NL-*").
        limit: Maximum number of custodians to return

    Returns:
        List of (path, custodian_data, website_url) tuples; only entries
        whose URL starts with 'http' are included.
    """
    custodian_dir = PROJECT_ROOT / "data" / "custodian"
    results = []
    pattern = f"{prefix}*.yaml" if prefix else "NL-*.yaml"
    for yaml_path in custodian_dir.glob(pattern):
        if limit and len(results) >= limit:
            break
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            # Guard against empty files and non-mapping YAML roots.
            if not data or not isinstance(data, dict):
                continue
            website = extract_website_url(data)
            if website and website.startswith('http'):
                results.append((yaml_path, data, website))
        except Exception as e:
            # A single malformed file should not abort the whole batch run.
            print(f"Warning: Failed to parse {yaml_path}: {e}", file=sys.stderr)
    return results
def discover_mission_page_urls(base_url: str) -> list[str]:
    """
    Generate candidate URLs for mission/vision pages.

    Args:
        base_url: The custodian's main website URL

    Returns:
        List of URLs to check for mission content (the multilingual pattern
        list joined onto the site root, plus the homepage itself)
    """
    # Normalize the site root, upgrading plain http to https.
    parts = urlparse(base_url)
    scheme = 'https' if parts.scheme == 'http' else parts.scheme
    root = f"{scheme}://{parts.netloc}"
    # One candidate per multilingual pattern, then the homepage as-is since
    # it may carry mission info directly.
    candidates = [urljoin(root, suffix) for suffix in ALL_MISSION_PATTERNS]
    candidates.append(base_url)
    return candidates
# Keywords to look for in links when discovering mission pages (multilingual).
# Matched as substrings against the lowercased URL path by filter_mission_links().
MISSION_LINK_KEYWORDS = [
    # Dutch
    'missie', 'visie', 'over-ons', 'over', 'organisatie', 'doelstelling',
    'wie-zijn-wij', 'wie-we-zijn', 'onze-missie', 'het-museum', 'het-archief',
    'de-bibliotheek', 'stichting', 'vereniging', 'kernwaarden', 'ambitie',
    # Spanish
    'mision', 'vision', 'sobre-nosotros', 'quienes-somos', 'institucional',
    'historia', 'el-museo', 'la-biblioteca', 'el-archivo', 'acerca-de',
    'nuestra-mision', 'conocenos', 'nosotros',
    # Portuguese
    'missao', 'visao', 'sobre', 'sobre-nos', 'quem-somos', 'o-museu',
    'a-biblioteca', 'o-arquivo', 'nossa-missao', 'conheca',
    # German
    'leitbild', 'ueber-uns', 'uber-uns', 'das-museum', 'wir-ueber-uns',
    # French
    'a-propos', 'qui-sommes-nous', 'notre-mission', 'le-musee', 'presentation',
    # English
    'about', 'about-us', 'mission', 'vision', 'organization', 'who-we-are',
]
def extract_links_from_markdown(content: str, base_url: str) -> list[str]:
    """
    Extract all links from markdown content.

    Markdown-style links ([text](url)) are collected first, then bare URLs
    in the text. Relative URLs are resolved against *base_url*; anchors,
    mailto: and tel: links are skipped. The result preserves first-seen
    order and contains no duplicates (the original only de-duplicated the
    bare-URL pass, so repeated markdown links produced duplicates).

    Args:
        content: Markdown text content
        base_url: Base URL for resolving relative links

    Returns:
        List of unique absolute URLs found in the content
    """
    links: list[str] = []
    seen: set[str] = set()

    def add(url: str) -> None:
        # De-duplicate while preserving discovery order.
        if url and url not in seen:
            seen.add(url)
            links.append(url)

    # Match markdown links: [text](url)
    md_link_pattern = r'\[([^\]]*)\]\(([^)]+)\)'
    for match in re.finditer(md_link_pattern, content):
        url = match.group(2).strip()
        if not url:
            continue
        # Skip in-page anchors and non-fetchable schemes.
        if url.startswith(('#', 'mailto:', 'tel:')):
            continue
        # Resolve relative URLs
        if not url.startswith('http'):
            url = urljoin(base_url, url)
        add(url)

    # Also match plain URLs in text; strip trailing sentence punctuation.
    url_pattern = r'https?://[^\s<>\)\]"\']+'
    for match in re.finditer(url_pattern, content):
        add(match.group(0).rstrip('.,;:'))

    return links
def filter_mission_links(links: list[str], base_domain: str) -> list[str]:
    """
    Filter links to only those likely to contain mission/vision content.

    A link is kept when it is on the custodian's own host (exact match or a
    subdomain) and its path contains one of the multilingual
    MISSION_LINK_KEYWORDS. Order is preserved; duplicates are removed.

    Args:
        links: List of URLs to filter
        base_domain: Domain of the custodian website (only keep same-domain links)

    Returns:
        List of URLs that likely contain mission content
    """
    # Compare hosts case-insensitively and without port numbers.
    base_host = base_domain.lower().rsplit(':', 1)[0]
    mission_urls = []
    for url in links:
        parsed = urlparse(url)
        host = parsed.netloc.lower().rsplit(':', 1)[0]
        # Only keep links from the same domain or a subdomain of it. The
        # previous substring test ("base in netloc") also accepted unrelated
        # hosts such as "notexample.com" for base "example.com".
        if host and host != base_host and not host.endswith('.' + base_host):
            continue
        # Check if the path contains mission-related keywords.
        path_lower = parsed.path.lower()
        if any(keyword in path_lower for keyword in MISSION_LINK_KEYWORDS):
            if url not in mission_urls:
                mission_urls.append(url)
    return mission_urls
async def discover_mission_links_from_homepage(
    reader: Union['LinkupWebReader', 'ZAIWebReader'],
    homepage_url: str,
    verbose: bool = False
) -> tuple[list[str], str, str]:
    """
    Fetch homepage and discover links to mission/vision pages.

    This is more reliable than guessing URL patterns because it finds
    the actual links used by the website.

    Args:
        reader: Web reader instance
        homepage_url: The custodian's homepage URL
        verbose: Whether to print progress

    Returns:
        Tuple of (discovered_urls, homepage_content, retrieved_on)
        Returns ([], '', '') if homepage fetch fails
    """
    # Fetch homepage
    result = await reader.read_webpage(homepage_url)
    if not result['success']:
        if verbose:
            print(f" Homepage fetch failed: {result.get('error', 'Unknown')[:50]}")
        return [], '', ''
    content = result.get('content', '')
    # Fall back to "now" if the reader did not report a retrieval time.
    retrieved_on = result.get('retrieved_on', datetime.now(timezone.utc).isoformat())
    if not content:
        return [], content, retrieved_on
    # Extract base domain for filtering (discovered links must stay on-site)
    parsed = urlparse(homepage_url)
    base_domain = parsed.netloc.lower()
    # Extract all links from homepage
    all_links = extract_links_from_markdown(content, homepage_url)
    if verbose:
        print(f" Found {len(all_links)} links on homepage")
    # Filter to mission-related links
    mission_links = filter_mission_links(all_links, base_domain)
    if verbose and mission_links:
        print(f" Found {len(mission_links)} mission-related links:")
        for link in mission_links[:5]:  # Show first 5
            print(f" - {link}")
    return mission_links, content, retrieved_on
def extract_statements_from_content(
    content: str,
    source_url: str,
    retrieved_on: str,
    ghcid: str,
) -> list[dict]:
    """
    Extract mission, vision, and goal statements from webpage content.

    This uses keyword matching and section detection. For production,
    consider using an LLM for more intelligent extraction.

    Args:
        content: The webpage text content
        source_url: URL of the source page
        retrieved_on: ISO timestamp when page was retrieved
        ghcid: GHCID of the custodian

    Returns:
        List of mission statement dictionaries
    """
    statements = []
    content_lower = content.lower()
    # Skip error pages (404, 500, etc.)
    # NOTE(review): these are plain substring checks, so e.g. '404' or
    # 'error' anywhere in the first 500 characters rejects the whole page —
    # this can cause false negatives on legitimate pages.
    error_indicators = [
        'pagina niet gevonden', 'page not found', '404',
        'niet gevonden', 'not found', 'error', 'fout',
        'deze pagina bestaat niet', 'this page does not exist'
    ]
    # Check title and first 500 chars for error indicators
    if any(indicator in content_lower[:500] for indicator in error_indicators):
        return []
    # Also check if content looks like raw JSON (Z.AI sometimes returns this)
    if content.strip().startswith('{"') or content.strip().startswith('"{'):
        return []
    # Check if this page has mission-related content at all (cheap pre-filter)
    has_mission_content = any(
        keyword in content_lower
        for keywords in MISSION_KEYWORDS.values()
        for keyword in keywords
    )
    if not has_mission_content:
        return []
    # Split content into sections (by blank lines, markdown headings, or
    # bold-only lines)
    sections = re.split(r'\n\s*\n|\n#+\s+|\n\*\*[^*]+\*\*\n', content)
    for section in sections:
        section = section.strip()
        if len(section) < 20:  # Skip very short sections
            continue
        section_lower = section.lower()
        # Detect statement type based on keywords
        statement_type = None
        confidence = 0.7
        for stype, keywords in MISSION_KEYWORDS.items():
            for keyword in keywords:
                if keyword in section_lower[:200]:  # Check beginning of section
                    statement_type = stype
                    # Higher confidence when the keyword appears right at the start
                    confidence = 0.85 if keyword in section_lower[:50] else 0.75
                    break
            if statement_type:
                break
        if not statement_type:
            continue
        # Clean up the section text
        # Remove markdown formatting (bold, italics, headings, links)
        clean_text = re.sub(r'\*\*([^*]+)\*\*', r'\1', section)
        clean_text = re.sub(r'\*([^*]+)\*', r'\1', clean_text)
        clean_text = re.sub(r'#+\s*', '', clean_text)
        clean_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', clean_text)
        clean_text = clean_text.strip()
        if len(clean_text) < 20:
            continue
        # Skip boilerplate/footer content
        boilerplate_indicators = [
            '©', 'copyright', 'all rights reserved', 'alle rechten voorbehouden',
            'privacybeleid', 'privacy policy', 'cookie', 'algemene voorwaarden',
            'terms and conditions', 'nieuwsbrief', 'newsletter', 'subscribe',
            'volg ons', 'follow us', 'social media', 'facebook', 'instagram',
            'twitter', 'linkedin', 'youtube', 'contact', 'openingstijden',
            'opening hours', 'bereikbaarheid', 'route', 'adres:', 'address:',
        ]
        clean_lower = clean_text.lower()
        boilerplate_count = sum(1 for ind in boilerplate_indicators if ind in clean_lower)
        # If more than 2 boilerplate indicators in a short text, skip it
        if boilerplate_count >= 2 and len(clean_text) < 200:
            continue
        # If the text is primarily copyright/footer (starts with ©)
        if clean_text.strip().startswith('©'):
            continue
        # Skip navigation/intro text (too short to be actual mission content)
        # Actual mission statements are usually at least 50 characters
        if len(clean_text) < 50:
            continue
        # Skip text that looks like a link/intro (e.g., "Lees alles over...")
        skip_patterns = [
            r'^lees\s+(alles\s+)?over',
            r'^klik\s+hier',
            r'^meer\s+(info|informatie)',
            r'^bekijk\s+',
            r'^ga\s+naar',
            r'^read\s+(more\s+)?about',
            r'^click\s+here',
            r'^view\s+',
        ]
        if any(re.match(pattern, clean_lower) for pattern in skip_patterns):
            continue
        # Generate statement ID.
        # NOTE(review): the ID is keyed on (ghcid, type, year) only, so two
        # sections of the same type found in one page share an ID — confirm
        # that downstream consumers de-duplicate before relying on uniqueness.
        year = datetime.now().year
        statement_id = f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{statement_type}-{year}"
        # Compute content hash (SRI-format sha256, used for change detection)
        content_hash = compute_content_hash(clean_text)
        statement = {
            'statement_id': statement_id,
            'statement_type': statement_type,
            'statement_text': clean_text,
            'statement_language': get_language_from_ghcid(ghcid),  # Detect from GHCID country
            'source_url': source_url,
            'retrieved_on': retrieved_on,
            'extraction_agent': 'zai-web-reader/batch',
            'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
            'extraction_confidence': confidence,
            'content_hash': content_hash,
            'prov': {
                # W3C PROV-style provenance for the extracted statement
                'wasDerivedFrom': source_url,
                'generatedAtTime': retrieved_on,
            }
        }
        statements.append(statement)
    return statements
async def extract_statements_with_llm(
    llm_extractor: GLMMissionExtractor,
    content: str,
    source_url: str,
    retrieved_on: str,
    ghcid: str,
) -> list[dict]:
    """
    Extract mission, vision, and goal statements from page text via Z.AI GLM.

    Semantic (LLM-based) extraction gives much better quality than keyword
    matching. Cheap local pre-filters reject error pages, raw JSON payloads
    and near-empty pages before spending an API call.

    Args:
        llm_extractor: GLMMissionExtractor instance used for the API call
        content: The webpage text content
        source_url: URL of the source page
        retrieved_on: ISO timestamp when page was retrieved
        ghcid: GHCID of the custodian

    Returns:
        List of mission statement dictionaries (possibly empty).
    """
    stripped = content.strip()
    # Pre-filter 1: obvious 404/error pages — only the page head matters.
    page_head = content.lower()[:500]
    error_indicators = (
        'pagina niet gevonden', 'page not found', '404',
        'niet gevonden', 'not found', 'deze pagina bestaat niet',
        'oeps', 'error', 'no routes match', 'routing error',
    )
    if any(marker in page_head for marker in error_indicators):
        return []
    # Pre-filter 2: raw JSON responses are not prose pages.
    if stripped.startswith(('{"', '"{')):
        return []
    # Pre-filter 3: very short content is almost certainly an empty page.
    if len(stripped) < 200:
        return []
    # Delegate the actual extraction to the LLM.
    result = await llm_extractor.extract_mission_from_content(
        content=content,
        source_url=source_url,
    )
    if not result['success']:
        return []
    # Map the LLM's field names onto the schema's statement types
    # ('goals' becomes 'goal' for schema consistency).
    type_map = {'mission': 'mission', 'vision': 'vision', 'goals': 'goal'}
    year = datetime.now().year
    statements: list[dict] = []
    for llm_key, schema_type in type_map.items():
        raw = result.get(llm_key)
        # Drop missing fields, explicit 'null' strings, and trivially short text.
        if not raw or raw == 'null':
            continue
        text = str(raw).strip()
        if len(text) < 20:
            continue
        statement = {
            'statement_id': f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{schema_type}-{year}",
            'statement_type': schema_type,
            'statement_text': text,
            'statement_language': get_language_from_ghcid(ghcid),  # Detect from GHCID country
            'source_url': source_url,
            'retrieved_on': retrieved_on,
            'extraction_agent': f'zai-glm/{result.get("model", ZAI_GLM_MODEL)}',
            'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
            'extraction_confidence': result.get('confidence', 0.0),
            'content_hash': compute_content_hash(str(raw)),
            'prov': {
                'wasDerivedFrom': source_url,
                'generatedAtTime': retrieved_on,
            },
        }
        # Carry through the section the LLM pulled the text from, if reported.
        if result.get('source_section'):
            statement['source_section'] = result['source_section']
        statements.append(statement)
    return statements
def update_custodian_yaml(
    yaml_path: Path,
    custodian_data: dict,
    statements: list[dict],
    dry_run: bool = False
) -> bool:
    """
    Update custodian YAML file with extracted mission statements.

    Statements whose statement_id is already present are skipped, so the
    function is idempotent across repeated runs. ``custodian_data`` is
    mutated in place.

    Args:
        yaml_path: Path to the custodian YAML file
        custodian_data: Current custodian data (mutated in place)
        statements: List of extracted statements
        dry_run: If True, don't write changes

    Returns:
        True if at least one new statement was added (and, unless dry_run,
        the file was written successfully).
    """
    if not statements:
        return False
    # Normalize mission_statement to a list: hand-edited YAML may contain a
    # single mapping (wrap it) or a malformed scalar (reset it). The original
    # code crashed on .append() for non-list values.
    existing = custodian_data.get('mission_statement')
    if isinstance(existing, dict):
        custodian_data['mission_statement'] = [existing]
    elif not isinstance(existing, list):
        custodian_data['mission_statement'] = []
    existing_ids = {
        s.get('statement_id') for s in custodian_data['mission_statement']
        if isinstance(s, dict)
    }
    # Add only statements not already recorded (dedupe by statement_id).
    added = 0
    for statement in statements:
        if statement['statement_id'] not in existing_ids:
            custodian_data['mission_statement'].append(statement)
            added += 1
    if added == 0:
        return False
    if dry_run:
        print(f" Would add {added} statements to {yaml_path.name}")
        return True
    # Write updated YAML (human-friendly block style, preserve key order).
    try:
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(
                custodian_data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120
            )
        print(f" Added {added} statements to {yaml_path.name}")
        return True
    except Exception as e:
        # Best-effort: report and signal failure rather than aborting the batch.
        print(f" Error writing {yaml_path.name}: {e}", file=sys.stderr)
        return False
def _dedupe_statements_by_type(statements: list[dict]) -> list[dict]:
    """Collapse statements to one per statement_type, keeping the entry with
    the highest extraction_confidence (missing confidence counts as 0)."""
    best: dict[str, dict] = {}
    for stmt in statements:
        stype = stmt['statement_type']
        current = best.get(stype)
        if current is None or stmt.get('extraction_confidence', 0) > current.get('extraction_confidence', 0):
            best[stype] = stmt
    return list(best.values())


async def process_custodian(
    reader: Union[LinkupWebReader, ZAIWebReader],
    yaml_path: Path,
    custodian_data: dict,
    website: str,
    dry_run: bool = False,
    verbose: bool = False,
    llm_extractor: Optional[GLMMissionExtractor] = None,
) -> dict:
    """
    Process a single custodian: discover pages, fetch content, extract statements.

    Two-phase discovery:
      1. Fetch the homepage and extract actual mission-page links from its
         navigation (preferred).
      2. Fall back to guessing common URL patterns only if no links found.

    Args:
        reader: Web reader instance (Linkup or ZAI)
        yaml_path: Path to custodian YAML file
        custodian_data: Current custodian data (mutated when statements added)
        website: Website URL to process
        dry_run: If True, don't write changes
        verbose: If True, show detailed progress
        llm_extractor: Optional LLM extractor for intelligent extraction

    Returns:
        dict with processing results (pages checked/with content, statements
        found/added, discovery_method, errors)
    """
    # Derive the base GHCID (first five dash-separated parts) from the filename.
    stem_parts = yaml_path.stem.split('-')
    ghcid = '-'.join(stem_parts[:5]) if len(stem_parts) >= 5 else yaml_path.stem
    # Display name: prefer the emic name, then a plain 'name', then the GHCID.
    custodian_name = custodian_data.get('custodian_name')
    name = custodian_name.get('emic_name') if isinstance(custodian_name, dict) else None
    if not name:
        name = custodian_data.get('name', ghcid)
    result = {
        'ghcid': ghcid,
        'name': name,
        'website': website,
        'pages_checked': 0,
        'pages_with_content': 0,
        'statements_found': 0,
        'statements_added': 0,
        'discovery_method': 'none',
        'errors': [],
    }
    if verbose:
        print(f"\nProcessing {ghcid}: {name}")
        print(f" Website: {website}")
    all_statements = []
    # PHASE 1: Discover mission pages from homepage links (preferred method)
    if verbose:
        print(f" Phase 1: Discovering mission pages from homepage...")
    discovered_links, homepage_content, homepage_retrieved_on = await discover_mission_links_from_homepage(
        reader, website, verbose
    )
    result['pages_checked'] += 1  # Homepage was fetched
    if discovered_links:
        result['discovery_method'] = 'homepage_links'
        candidate_urls = discovered_links[:5]  # Limit to 5 discovered links
        if verbose:
            print(f" Using {len(candidate_urls)} discovered mission links")
    else:
        # PHASE 2: Fall back to URL pattern guessing
        result['discovery_method'] = 'pattern_guessing'
        if verbose:
            print(f" Phase 2: No mission links found, falling back to URL patterns...")
        candidate_urls = discover_mission_page_urls(website)[:5]
    # First, try to extract from the homepage content itself (if we have it).
    if homepage_content and len(homepage_content) > 200:
        result['pages_with_content'] += 1
        homepage_ts = homepage_retrieved_on or datetime.now(timezone.utc).isoformat()
        if llm_extractor:
            statements = await extract_statements_with_llm(
                llm_extractor, homepage_content, website, homepage_ts, ghcid
            )
            if verbose and statements:
                print(f" [LLM] Found {len(statements)} statements on homepage")
        else:
            statements = extract_statements_from_content(
                homepage_content, website, homepage_ts, ghcid
            )
            if verbose and statements:
                print(f" [Keyword] Found {len(statements)} statements on homepage")
        if statements:
            all_statements.extend(statements)
            # A high-confidence LLM mission from the homepage is good enough:
            # skip the dedicated pages (keyword extraction is too noisy to
            # allow this shortcut).
            if llm_extractor and any(
                s['statement_type'] == 'mission' and s.get('extraction_confidence', 0) > 0.7
                for s in statements
            ):
                if verbose:
                    print(f" Found high-confidence mission on homepage, skipping dedicated pages")
                result['discovery_method'] = 'homepage_content'
                result['statements_found'] = len(all_statements)
                final_statements = _dedupe_statements_by_type(all_statements)
                if final_statements:
                    if update_custodian_yaml(yaml_path, custodian_data, final_statements, dry_run):
                        result['statements_added'] = len(final_statements)
                return result
    # Check candidate mission page URLs
    for url in candidate_urls:
        # Skip if this is the homepage (already processed)
        if url.rstrip('/') == website.rstrip('/'):
            continue
        result['pages_checked'] += 1
        if verbose:
            print(f" Checking: {url}")
        # Fetch page content
        page_result = await reader.read_webpage(url)
        if not page_result['success']:
            if verbose:
                print(f" Failed: {page_result.get('error', 'Unknown error')[:50]}")
            result['errors'].append(f"{url}: {page_result.get('error', 'Unknown')[:50]}")
            continue
        content = page_result.get('content', '')
        if not content or len(content) < 100:
            if verbose:
                print(f" No content")
            continue
        result['pages_with_content'] += 1
        retrieved_on = page_result.get('retrieved_on', datetime.now(timezone.utc).isoformat())
        # Use LLM extraction if available, otherwise fall back to keyword-based
        if llm_extractor:
            statements = await extract_statements_with_llm(
                llm_extractor, content, url, retrieved_on, ghcid
            )
            if verbose and statements:
                print(f" [LLM] Found {len(statements)} statements")
        else:
            statements = extract_statements_from_content(content, url, retrieved_on, ghcid)
            if verbose and statements:
                print(f" [Keyword] Found {len(statements)} statements")
        if statements:
            all_statements.extend(statements)
            # A mission found on a dedicated page beats the homepage: stop.
            if any(s['statement_type'] == 'mission' for s in statements):
                break
    result['statements_found'] = len(all_statements)
    # Deduplicate statements by type (keep highest confidence) and persist.
    final_statements = _dedupe_statements_by_type(all_statements)
    if final_statements:
        if update_custodian_yaml(yaml_path, custodian_data, final_statements, dry_run):
            result['statements_added'] = len(final_statements)
    return result
def _website_from_identifiers(identifiers) -> Optional[str]:
    """Return the first truthy Website-scheme identifier value/URL, or None."""
    for ident in identifiers or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
            url = ident.get('identifier_value') or ident.get('identifier_url')
            if url:
                return url
    return None


def _find_custodian_website(data: dict) -> Optional[str]:
    """
    Find the best website URL in a custodian record, or None.

    Mirrors the priority order used by find_custodians_with_websites:
      1. direct 'website' field
      2. original_entry.webadres_organisatie
      3. museum_register_enrichment.website_url
      4. wikidata_enrichment.official_website
      5. google_maps_enrichment.website
      6. location.website
      7. original_entry.identifiers (Website scheme)
      8. top-level identifiers (Website scheme)
    """
    website = data.get('website') or None
    # Priority steps 2-6 all read one field out of an optional nested dict.
    nested_sources = (
        ('original_entry', 'webadres_organisatie'),
        ('museum_register_enrichment', 'website_url'),
        ('wikidata_enrichment', 'official_website'),
        ('google_maps_enrichment', 'website'),
        ('location', 'website'),
    )
    for key, field in nested_sources:
        if website:
            break
        obj = data.get(key)
        if isinstance(obj, dict) and obj.get(field):
            website = obj[field]
    # Steps 7-8: identifier arrays with a 'Website' scheme.
    if not website:
        oe = data.get('original_entry')
        if isinstance(oe, dict):
            website = _website_from_identifiers(oe.get('identifiers'))
    if not website:
        website = _website_from_identifiers(data.get('identifiers'))
    return website


async def main():
    """CLI entry point: parse arguments, select custodians, run extraction."""
    parser = argparse.ArgumentParser(
        description='Batch extract mission statements from heritage custodian websites'
    )
    parser.add_argument(
        '--test', type=int, metavar='N',
        help='Test mode: process only N custodians'
    )
    parser.add_argument(
        '--province', type=str, metavar='PREFIX',
        help='Process custodians matching GHCID prefix (e.g., NL-NH for Noord-Holland)'
    )
    parser.add_argument(
        '--ghcid', type=str,
        help='Process a single custodian by GHCID'
    )
    parser.add_argument(
        '--all', action='store_true',
        help='Process all Dutch custodians with websites'
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Show detailed progress'
    )
    parser.add_argument(
        '--concurrency', type=int, default=3,
        help='Number of concurrent requests (default: 3)'
    )
    parser.add_argument(
        '--llm', action='store_true',
        help='Use LLM (Z.AI GLM) for intelligent extraction instead of keyword matching'
    )
    args = parser.parse_args()
    # Require at least one selection mode; otherwise show usage examples.
    if not any([args.test, args.province, args.ghcid, args.all]):
        parser.print_help()
        print("\nExample usage:")
        print(" python scripts/batch_extract_mission_statements.py --test 5 --verbose")
        print(" python scripts/batch_extract_mission_statements.py --test 5 --llm --verbose # With LLM extraction")
        print(" python scripts/batch_extract_mission_statements.py --province NL-NH --llm")
        print(" python scripts/batch_extract_mission_statements.py --ghcid NL-ZH-ZUI-M-LMT --llm")
        sys.exit(1)
    # Get API tokens
    try:
        tokens = get_api_tokens()
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    # Initialize web reader - prefer Linkup (more reliable), fall back to Z.AI
    if 'linkup' in tokens:
        reader = LinkupWebReader(tokens['linkup'])
        print("Using Linkup API for web fetching")
    elif 'zai' in tokens:
        reader = ZAIWebReader(tokens['zai'])
        print("Using Z.AI Web Reader API for web fetching")
    else:
        print("Error: No API token available", file=sys.stderr)
        sys.exit(1)
    # Initialize LLM extractor if requested (requires the Z.AI token even
    # when Linkup is used for fetching).
    llm_extractor = None
    if args.llm:
        if 'zai' not in tokens:
            print("Error: --llm requires ZAI_API_TOKEN for LLM extraction", file=sys.stderr)
            sys.exit(1)
        llm_extractor = GLMMissionExtractor(tokens['zai'])
        print(f"Using Z.AI GLM ({ZAI_GLM_MODEL}) for LLM-based extraction")
    # Find custodians to process
    if args.ghcid:
        # Single custodian mode
        custodian_dir = PROJECT_ROOT / "data" / "custodian"
        yaml_files = list(custodian_dir.glob(f"{args.ghcid}*.yaml"))
        if not yaml_files:
            print(f"Error: No custodian file found for GHCID {args.ghcid}", file=sys.stderr)
            sys.exit(1)
        yaml_path = yaml_files[0]
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # Locate the website using the same priority chain as batch discovery.
        website = _find_custodian_website(data)
        if not website or not website.startswith('http'):
            print(f"Error: No website found for {args.ghcid}", file=sys.stderr)
            sys.exit(1)
        custodians = [(yaml_path, data, website)]
    else:
        # Batch mode
        limit = args.test if args.test else None
        prefix = args.province if args.province else None
        print(f"Finding custodians with websites...")
        custodians = find_custodians_with_websites(prefix=prefix, limit=limit)
        print(f"Found {len(custodians)} custodians with websites")
    if args.dry_run:
        print("\n[DRY RUN MODE - No changes will be made]\n")
    # Process custodians with bounded concurrency.
    results = []
    semaphore = asyncio.Semaphore(args.concurrency)
    async def process_with_semaphore(custodian_tuple):
        async with semaphore:
            yaml_path, data, website = custodian_tuple
            return await process_custodian(
                reader, yaml_path, data, website,
                dry_run=args.dry_run, verbose=args.verbose,
                llm_extractor=llm_extractor
            )
    tasks = [process_with_semaphore(c) for c in custodians]
    print(f"\nProcessing {len(tasks)} custodians...")
    for i, coro in enumerate(asyncio.as_completed(tasks), 1):
        result = await coro
        results.append(result)
        if not args.verbose:
            # Progress indicator: announce hits immediately, heartbeat every 10.
            if result['statements_added'] > 0:
                print(f"[{i}/{len(tasks)}] {result['ghcid']}: Added {result['statements_added']} statements")
            elif i % 10 == 0:
                print(f"[{i}/{len(tasks)}] Processing...")
    # Summary statistics
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    total_checked = sum(r['pages_checked'] for r in results)
    total_with_content = sum(r['pages_with_content'] for r in results)
    total_found = sum(r['statements_found'] for r in results)
    total_added = sum(r['statements_added'] for r in results)
    total_errors = sum(len(r['errors']) for r in results)
    custodians_with_statements = sum(1 for r in results if r['statements_added'] > 0)
    print(f"Custodians processed: {len(results)}")
    print(f"Pages checked: {total_checked}")
    print(f"Pages with content: {total_with_content}")
    print(f"Statements found: {total_found}")
    print(f"Statements added: {total_added}")
    print(f"Custodians updated: {custodians_with_statements}")
    print(f"Errors encountered: {total_errors}")
    # Show custodians that got statements
    if custodians_with_statements > 0:
        print(f"\nCustodians with new mission statements:")
        for r in results:
            if r['statements_added'] > 0:
                print(f" - {r['ghcid']}: {r['name']} ({r['statements_added']} statements)")
# Script entry point: run the async batch pipeline under a fresh event loop.
if __name__ == '__main__':
    asyncio.run(main())