glam/scripts/batch_extract_mission_statements.py
2026-01-04 13:12:32 +01:00

2533 lines
100 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Batch extract mission statements from heritage custodian websites.
This script:
1. Finds Dutch custodians with websites
2. Discovers mission/vision/about pages
3. Uses Linkup API (primary) or Z.AI Web Reader (fallback) to fetch content
4. Creates LinkML-compliant mission_statement entries with full provenance
5. Updates custodian YAML files with extracted statements
Usage:
python scripts/batch_extract_mission_statements.py --test 5 # Test with 5 custodians
python scripts/batch_extract_mission_statements.py --province NL-NH # Noord-Holland only
python scripts/batch_extract_mission_statements.py --all # All Dutch custodians
python scripts/batch_extract_mission_statements.py --ghcid NL-ZH-ZUI-M-LMT # Single custodian
Requirements:
- httpx (pip install httpx)
- pyyaml
- LINKUP_API_KEY environment variable (primary)
- ZAI_API_TOKEN environment variable (fallback)
API Documentation:
- Linkup: https://docs.linkup.so/
- Z.AI: https://docs.z.ai/devpack/mcp/reader-mcp-server
"""
import argparse
import asyncio
import base64
import hashlib
import json
import os
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Any, Union
from urllib.parse import urljoin, urlparse, quote
import httpx
import yaml
# Optional Playwright backend: detect availability up front so callers can
# fall back gracefully when it is not installed.
try:
    from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    print("Note: Playwright not available. Install with 'pip install playwright && playwright install chromium' for better JP site support.", file=sys.stderr)
else:
    PLAYWRIGHT_AVAILABLE = True
# Z.AI GLM chat-completions endpoint and model (per Rule 11 in AGENTS.md).
ZAI_GLM_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_GLM_MODEL = "glm-4.7"  # Latest model with best quality

# Make the project root importable (this file lives in <root>/scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Web-fetching service endpoints.
LINKUP_API_URL = "https://api.linkup.so/v1/fetch"
ZAI_MCP_URL = "https://api.z.ai/api/mcp/web_reader/mcp"
# Common mission page URL patterns for Dutch heritage institutions
# Ordered by likelihood of success (most common patterns first)
DUTCH_MISSION_PATTERNS = [
    "/over-ons", # Most common Dutch pattern ("about us")
    "/missie", # Direct mission page
    "/over", # Short version
    "/missie-en-visie", # Combined mission/vision
    "/organisatie", # Organization page often has mission
    "/about", # English fallback
    "/visie", # Vision page
    "/over-ons/missie", # Nested mission page
    "/onze-missie", # "Our mission"
    "/over/missie",
    "/organisatie/missie",
    "/het-museum/missie",
    "/het-museum/missie-en-visie",
    "/museum/missie",
    "/about/mission",
    "/wie-zijn-wij", # "Who are we"
    "/about-us",
]
# Extended patterns for Dutch museum websites (discovered through testing)
DUTCH_MISSION_EXTENDED_PATTERNS = [
    "/het-muzeeum-organisatie/missie-visie", # NOTE(review): "muzeeum" spelling looks intentional (cf. the Vlissingen "muZEEum") — confirm
    "/het-museum-organisatie/missie-visie",
    "/organisatie/missie-visie",
    "/over-het-museum/missie",
    "/over-het-museum/missie-en-visie",
    "/info/missie",
    "/info/over-ons",
    "/stichting/missie", # "stichting" = foundation
    "/museum/over-ons",
    "/museum/organisatie",
]
# Spanish mission page patterns (for Latin America and Spain)
SPANISH_MISSION_PATTERNS = [
    "/sobre-nosotros", # About us
    "/quienes-somos", # Who we are
    "/mision", # Mission
    "/mision-y-vision", # Mission and vision
    "/institucional", # Institutional
    "/historia", # History often contains mission
    "/el-museo", # The museum
    "/acerca-de", # About
    "/nuestra-mision", # Our mission
    "/conocenos", # Get to know us
    "/institucion", # Institution
    "/nosotros", # Us
    "/about", # English fallback
    "/about-us",
]
# Portuguese mission page patterns (for Brazil, Portugal)
PORTUGUESE_MISSION_PATTERNS = [
    "/sobre", # About
    "/sobre-nos", # About us
    "/quem-somos", # Who we are
    "/missao", # Mission
    "/missao-e-visao", # Mission and vision
    "/institucional", # Institutional
    "/historia", # History
    "/o-museu", # The museum
    "/a-biblioteca", # The library
    "/conheca", # Get to know
    "/nossa-missao", # Our mission
    "/about", # English fallback
]
# German mission page patterns
GERMAN_MISSION_PATTERNS = [
    "/ueber-uns", # About us ("über uns", transliterated umlaut)
    "/uber-uns", # Without umlaut
    "/leitbild", # Mission statement
    "/mission", # Mission
    "/das-museum", # The museum
    "/institution", # Institution
    "/wir-ueber-uns", # We about us
    "/about", # English fallback
]
# French mission page patterns
FRENCH_MISSION_PATTERNS = [
    "/a-propos", # About
    "/qui-sommes-nous", # Who are we
    "/mission", # Mission
    "/notre-mission", # Our mission
    "/le-musee", # The museum
    "/presentation", # Presentation
    "/historique", # Historical
    "/about", # English fallback
]
# English mission page patterns (international fallback)
ENGLISH_MISSION_PATTERNS = [
    "/about",
    "/about-us",
    "/mission",
    "/our-mission",
    "/mission-vision",
    "/mission-and-vision",
    "/who-we-are",
    "/the-museum",
    "/the-library",
    "/the-archive",
    "/history",
    "/institutional",
]
# Japanese mission page patterns (for JP custodians - 12,096 files)
JAPANESE_MISSION_PATTERNS = [
    "/about", # English pattern (commonly used in Japan)
    "/about-us", # English pattern
    "/introduction", # Introduction
    "/outline", # Organization outline (概要)
    "/gaiyo", # 概要 "overview" (romanized)
    "/gaiyou", # Alternative romanization
    "/rinen", # 理念 (philosophy)
    "/mission", # Mission (English)
    "/message", # Message from director
    "/greeting", # Greeting (挨拶)
    "/aisatsu", # 挨拶 (romanized)
    "/history", # History
    "/enkaku", # 沿革 (history, romanized)
    "/profile", # Profile
    "/info", # Information
    "/organization", # Organization
]
# Czech mission page patterns (for CZ custodians - 8,432 files)
CZECH_MISSION_PATTERNS = [
    "/o-nas", # About us
    "/o-knihovne", # About the library
    "/o-muzeu", # About the museum
    "/o-archivu", # About the archive
    "/poslani", # Mission
    "/poslani-a-vize", # Mission and vision
    "/historie", # History
    "/informace", # Information
    "/zakladni-informace", # Basic information
    "/profil", # Profile
    "/about", # English fallback
]
# Italian mission page patterns
ITALIAN_MISSION_PATTERNS = [
    "/chi-siamo", # Who we are
    "/la-missione", # The mission
    "/missione", # Mission
    "/storia", # History
    "/il-museo", # The museum
    "/la-biblioteca", # The library
    "/presentazione", # Presentation
    "/about", # English fallback
]
# Combined patterns - use all languages for maximum coverage.
# Note: this concatenation contains duplicates (e.g. "/about" and "/mission"
# appear in several of the per-language lists).
ALL_MISSION_PATTERNS = (
    DUTCH_MISSION_PATTERNS +
    SPANISH_MISSION_PATTERNS +
    PORTUGUESE_MISSION_PATTERNS +
    GERMAN_MISSION_PATTERNS +
    FRENCH_MISSION_PATTERNS +
    ENGLISH_MISSION_PATTERNS +
    JAPANESE_MISSION_PATTERNS +
    CZECH_MISSION_PATTERNS +
    ITALIAN_MISSION_PATTERNS
)
# ISO 639-1 language code -> candidate URL path patterns for that language.
LANGUAGE_URL_PATTERNS = {
    'nl': DUTCH_MISSION_PATTERNS + DUTCH_MISSION_EXTENDED_PATTERNS,
    'es': SPANISH_MISSION_PATTERNS,
    'pt': PORTUGUESE_MISSION_PATTERNS,
    'de': GERMAN_MISSION_PATTERNS,
    'fr': FRENCH_MISSION_PATTERNS,
    'en': ENGLISH_MISSION_PATTERNS,
    'ja': JAPANESE_MISSION_PATTERNS,
    'cs': CZECH_MISSION_PATTERNS,
    'it': ITALIAN_MISSION_PATTERNS,
}
# Keywords indicating mission/vision content (multilingual).
# NOTE(review): presumably matched against page text/headings elsewhere in
# this script — matching code not in view here; confirm case handling there.
MISSION_KEYWORDS = {
    'mission': ['missie', 'mission', 'opdracht', 'kerntaak', 'misión', 'missão', 'leitbild'],
    'vision': ['visie', 'vision', 'toekomst', 'ambitie', 'visión', 'visão'],
    'goal': ['doelstelling', 'doel', 'doelen', 'goal', 'objective', 'objectives', 'ambitie',
             'objetivo', 'objetivos', 'ziel', 'ziele'],
    'value': ['waarde', 'waarden', 'kernwaarden', 'value', 'values', 'principle',
              'valor', 'valores', 'wert', 'werte'],
    'motto': ['motto', 'slogan', 'slagzin', 'lema'],
}
# ISO 3166-1 alpha-2 country code to ISO 639-1 language code mapping.
# Maps country to primary/official language.
# NOTE(review): multilingual countries (e.g. BE -> nl, CH -> de, CA -> en)
# are reduced to a single language here — fine for guessing URL patterns,
# verify before reusing this table for anything else.
COUNTRY_TO_LANGUAGE = {
    # Dutch-speaking
    'NL': 'nl', 'BE': 'nl', 'SR': 'nl', 'AW': 'nl', 'CW': 'nl', 'SX': 'nl',
    # Spanish-speaking
    'AR': 'es', 'BO': 'es', 'CL': 'es', 'CO': 'es', 'CR': 'es', 'CU': 'es',
    'DO': 'es', 'EC': 'es', 'SV': 'es', 'GT': 'es', 'HN': 'es', 'MX': 'es',
    'NI': 'es', 'PA': 'es', 'PY': 'es', 'PE': 'es', 'PR': 'es', 'ES': 'es',
    'UY': 'es', 'VE': 'es', 'GQ': 'es',
    # Portuguese-speaking
    'BR': 'pt', 'PT': 'pt', 'AO': 'pt', 'MZ': 'pt', 'CV': 'pt', 'GW': 'pt',
    'ST': 'pt', 'TL': 'pt',
    # German-speaking
    'DE': 'de', 'AT': 'de', 'CH': 'de', 'LI': 'de', 'LU': 'de',
    # French-speaking
    'FR': 'fr', 'MC': 'fr', 'SN': 'fr', 'CI': 'fr', 'ML': 'fr', 'BF': 'fr',
    'NE': 'fr', 'TG': 'fr', 'BJ': 'fr', 'GA': 'fr', 'CG': 'fr', 'CD': 'fr',
    'MG': 'fr', 'HT': 'fr', 'RE': 'fr', 'MQ': 'fr', 'GP': 'fr', 'GF': 'fr',
    'NC': 'fr', 'PF': 'fr',
    # Italian-speaking
    'IT': 'it', 'SM': 'it', 'VA': 'it',
    # English-speaking (default)
    'US': 'en', 'GB': 'en', 'AU': 'en', 'NZ': 'en', 'CA': 'en', 'IE': 'en',
    'ZA': 'en', 'JM': 'en', 'TT': 'en', 'BB': 'en', 'GH': 'en', 'NG': 'en',
    'KE': 'en', 'UG': 'en', 'TZ': 'en', 'ZW': 'en', 'BW': 'en', 'MW': 'en',
    'ZM': 'en', 'PH': 'en', 'SG': 'en', 'MY': 'en', 'IN': 'en', 'PK': 'en',
    # Japanese
    'JP': 'ja',
    # Chinese
    'CN': 'zh', 'TW': 'zh', 'HK': 'zh', 'MO': 'zh',
    # Korean
    'KR': 'ko', 'KP': 'ko',
    # Russian
    'RU': 'ru', 'BY': 'ru', 'KZ': 'ru', 'KG': 'ru', 'TJ': 'ru',
    # Arabic
    'SA': 'ar', 'AE': 'ar', 'QA': 'ar', 'KW': 'ar', 'BH': 'ar', 'OM': 'ar',
    'YE': 'ar', 'JO': 'ar', 'SY': 'ar', 'LB': 'ar', 'IQ': 'ar', 'EG': 'ar',
    'LY': 'ar', 'TN': 'ar', 'DZ': 'ar', 'MA': 'ar', 'SD': 'ar', 'MR': 'ar',
    # Other
    'CZ': 'cs', 'SK': 'sk', 'PL': 'pl', 'HU': 'hu', 'RO': 'ro', 'BG': 'bg',
    'HR': 'hr', 'RS': 'sr', 'SI': 'sl', 'GR': 'el', 'TR': 'tr', 'IL': 'he',
    'TH': 'th', 'VN': 'vi', 'ID': 'id', 'SE': 'sv', 'NO': 'no', 'DK': 'da',
    'FI': 'fi', 'IS': 'is', 'EE': 'et', 'LV': 'lv', 'LT': 'lt', 'UA': 'uk',
}
def get_language_from_ghcid(ghcid: str) -> str:
    """Map a GHCID's leading country code to an ISO 639-1 language code.

    Args:
        ghcid: GHCID string, e.g. "AR-C-BUE-M-MAD" (country code first).

    Returns:
        ISO 639-1 code for the country's mapped language; 'en' when the
        GHCID is empty/too short or the country is not in COUNTRY_TO_LANGUAGE.
    """
    if ghcid and len(ghcid) >= 2:
        return COUNTRY_TO_LANGUAGE.get(ghcid[:2].upper(), 'en')
    return 'en'
def compute_content_hash(text: str) -> str:
    """Return the SHA-256 digest of *text* in SRI notation ("sha256-<base64>")."""
    digest = hashlib.sha256(text.encode('utf-8')).digest()
    encoded = base64.b64encode(digest).decode('ascii')
    return "sha256-" + encoded
def get_api_tokens() -> dict:
    """Collect web-reader API tokens from the environment or the project .env file.

    Environment variables take precedence; the .env file in PROJECT_ROOT is
    only consulted for keys not already set.

    Returns:
        dict with 'linkup' and/or 'zai' keys containing API tokens.

    Raises:
        ValueError: if neither LINKUP_API_KEY nor ZAI_API_TOKEN is available.
    """
    linkup_token = os.environ.get('LINKUP_API_KEY')
    zai_token = os.environ.get('ZAI_API_TOKEN')
    # Fall back to the project .env file for any token not in the environment.
    env_path = PROJECT_ROOT / '.env'
    if env_path.exists():
        # Explicit UTF-8: without it the file is decoded with the platform
        # default (e.g. cp1252 on Windows), which can corrupt token values.
        with open(env_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue  # skip blank lines and comments
                if line.startswith('LINKUP_API_KEY=') and not linkup_token:
                    linkup_token = line.split('=', 1)[1].strip().strip('"\'')
                elif line.startswith('ZAI_API_TOKEN=') and not zai_token:
                    zai_token = line.split('=', 1)[1].strip().strip('"\'')
    tokens = {}
    if linkup_token:
        tokens['linkup'] = linkup_token
    if zai_token:
        tokens['zai'] = zai_token
    if not tokens:
        raise ValueError(
            "No API tokens found. Set LINKUP_API_KEY or ZAI_API_TOKEN environment variable."
        )
    return tokens
class LinkupWebReader:
    """Thin async client for the Linkup fetch API.

    Reference: https://docs.linkup.so/
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """Fetch *url* through Linkup and return its markdown content.

        Returns:
            dict with keys: success, url, plus content/retrieved_on on
            success or error on failure.
        """
        def failure(message: str) -> dict:
            return {"success": False, "url": url, "error": message}

        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                response = await client.post(
                    LINKUP_API_URL,
                    headers=self.headers,
                    json={"url": url}
                )
                if response.status_code != 200:
                    return failure(f"HTTP {response.status_code}: {response.text[:200]}")
                payload = response.json()
                # Linkup returns markdown content directly.
                content = payload.get("markdown", payload.get("content", ""))
                if not content:
                    return failure("No content returned")
                return {
                    "success": True,
                    "url": url,
                    "content": content,
                    "retrieved_on": datetime.now(timezone.utc).isoformat(),
                }
            except httpx.TimeoutException:
                return failure("Request timed out")
            except Exception as exc:
                return failure(str(exc))
class ZAIWebReader:
    """
    Client for Z.AI Web Reader MCP API using Streamable HTTP transport.
    The MCP protocol requires:
    1. Initialize session
    2. Send notifications/initialized
    3. Call tools
    Reference: https://docs.z.ai/devpack/mcp/reader-mcp-server
    """
    def __init__(self, api_token: str):
        # Bearer token for the Z.AI MCP endpoint.
        self.api_token = api_token
        # MCP session id; populated from the server's `mcp-session-id`
        # response header by _send_request().
        self.session_id = None
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream", # Required for MCP Streamable HTTP
        }
    def _parse_sse_response(self, text: str) -> dict:
        """Parse Server-Sent Events (SSE) response format from MCP API.
        SSE format:
        id:1
        event:message
        data:{"jsonrpc":"2.0",...}
        Returns the parsed JSON of the LAST parseable 'data:' line
        (malformed data lines are silently skipped); {} if none parsed.
        """
        result = {}
        for line in text.strip().split('\n'):
            if line.startswith('data:'):
                data_content = line[5:].strip()
                if data_content:
                    try:
                        result = json.loads(data_content)
                    except json.JSONDecodeError:
                        pass
        return result
    async def _send_request(self, client: httpx.AsyncClient, method: str, params: Optional[dict] = None, request_id: int = 1) -> dict:
        """Send a JSON-RPC request to the MCP server and parse SSE response.
        Returns dict with keys:
        - success: bool
        - status_code: int
        - data: parsed JSON-RPC result (if success)
        - error: error message (if not success)
        """
        request_body = {
            "jsonrpc": "2.0",
            "method": method,
            "id": request_id
        }
        if params:
            request_body["params"] = params
        # Add session header if we have one
        headers = self.headers.copy()
        if self.session_id:
            headers["mcp-session-id"] = self.session_id
        response = await client.post(ZAI_MCP_URL, headers=headers, json=request_body)
        # Check for session ID in response headers; capture it even on
        # non-200 responses so subsequent calls reuse the session.
        if "mcp-session-id" in response.headers:
            self.session_id = response.headers["mcp-session-id"]
        if response.status_code != 200:
            return {
                "success": False,
                "status_code": response.status_code,
                "error": f"HTTP {response.status_code}: {response.text[:200]}"
            }
        # Parse SSE response
        parsed = self._parse_sse_response(response.text)
        if not parsed:
            return {
                "success": False,
                "status_code": response.status_code,
                "error": f"Failed to parse SSE response: {response.text[:200]}"
            }
        return {
            "success": True,
            "status_code": response.status_code,
            "data": parsed
        }
    async def initialize(self, client: httpx.AsyncClient) -> bool:
        """Initialize MCP session.
        Performs the MCP handshake: 'initialize' followed by the
        'notifications/initialized' notification. Returns True on success.
        """
        try:
            response = await self._send_request(
                client,
                "initialize",
                {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {
                        "name": "glam-mission-extractor",
                        "version": "1.0.0"
                    }
                },
                request_id=1
            )
            if response.get("success"):
                # Send initialized notification
                await self._send_request(client, "notifications/initialized", {}, request_id=2)
                return True
            return False
        except Exception as e:
            print(f"Initialize error: {e}", file=sys.stderr)
            return False
    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """
        Read webpage content using Z.AI Web Reader.
        Returns:
            dict with keys: title, content, metadata, links, success, error
            (the exact keys depend on which response shape the server sent).
        """
        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                # Initialize session first (only once per reader instance)
                if not self.session_id:
                    await self.initialize(client)
                # Call webReader tool
                response = await self._send_request(
                    client,
                    "tools/call",
                    {
                        "name": "webReader",
                        "arguments": {
                            "url": url
                        }
                    },
                    request_id=3
                )
                if not response.get("success"):
                    return {
                        "success": False,
                        "url": url,
                        "error": response.get("error", "Unknown error"),
                    }
                result = response.get("data", {})
                # Parse MCP response
                if "result" in result:
                    content_data = result["result"]
                    # Extract content from MCP response format
                    if isinstance(content_data, dict):
                        # Check for content array (MCP tools/call response format)
                        if "content" in content_data and isinstance(content_data["content"], list):
                            # Concatenate only the "text"-typed content items.
                            text_parts = []
                            for item in content_data["content"]:
                                if isinstance(item, dict) and item.get("type") == "text":
                                    text_parts.append(item.get("text", ""))
                            content_text = "\n".join(text_parts)
                        else:
                            content_text = content_data.get("content", content_data.get("text", ""))
                        return {
                            "success": True,
                            "url": url,
                            "title": content_data.get("title", ""),
                            "content": content_text,
                            "metadata": content_data.get("metadata", {}),
                            "links": content_data.get("links", []),
                            "retrieved_on": datetime.now(timezone.utc).isoformat(),
                        }
                    elif isinstance(content_data, list) and len(content_data) > 0:
                        # Array of content blocks (alternate server response shape)
                        text_content = ""
                        for block in content_data:
                            if isinstance(block, dict):
                                if block.get("type") == "text":
                                    text_content += block.get("text", "") + "\n"
                                elif "text" in block:
                                    text_content += block["text"] + "\n"
                            elif isinstance(block, str):
                                text_content += block + "\n"
                        return {
                            "success": True,
                            "url": url,
                            "content": text_content.strip(),
                            "retrieved_on": datetime.now(timezone.utc).isoformat(),
                        }
                # Check for error in response
                if "error" in result:
                    return {
                        "success": False,
                        "url": url,
                        "error": f"MCP error: {result['error']}",
                    }
                return {
                    "success": False,
                    "url": url,
                    "error": f"Unexpected response format: {str(result)[:200]}",
                }
            except httpx.HTTPStatusError as e:
                return {
                    "success": False,
                    "url": url,
                    "error": f"HTTP {e.response.status_code}: {e.response.text[:200]}",
                }
            except Exception as e:
                return {
                    "success": False,
                    "url": url,
                    "error": str(e),
                }
class CompositeWebReader:
    """Web reader that chains Linkup and Z.AI backends with automatic fallback.

    With prefer_zai=False (default), Linkup is tried first and Z.AI is used
    only on retriable failures; with prefer_zai=True the order is reversed.
    Z.AI Web Reader renders pages in a headless browser, which significantly
    improves success on Japanese (.jp) sites that block simple HTTP requests,
    sites with anti-bot protection, and JavaScript-heavy single-page apps.
    """

    def __init__(self, linkup_key: Optional[str] = None, zai_key: Optional[str] = None, prefer_zai: bool = False):
        self.linkup_reader = LinkupWebReader(linkup_key) if linkup_key else None
        self.zai_reader = ZAIWebReader(zai_key) if zai_key else None
        self.prefer_zai = prefer_zai
        # Per-outcome counters, reported by get_stats().
        self.stats = dict.fromkeys(
            (
                'linkup_success', 'linkup_fail',
                'zai_success', 'zai_fail',
                'zai_fallback_success', 'zai_fallback_fail',
                'linkup_fallback_success', 'linkup_fallback_fail',
                'total_requests',
            ),
            0,
        )

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """Fetch *url*, falling back to the secondary reader on retriable errors.

        Returns:
            dict with keys: success, url, reader_used, plus content/retrieved_on
            on success or error on failure.
        """
        self.stats['total_requests'] += 1
        if self.prefer_zai and self.zai_reader:
            return await self._zai_first(url, timeout)
        if self.linkup_reader:
            return await self._linkup_first(url, timeout)
        if self.zai_reader:
            # Z.AI is the only configured backend.
            outcome = await self.zai_reader.read_webpage(url, timeout=timeout)
            outcome['reader_used'] = 'zai'
            bucket = 'zai_success' if outcome.get('success') else 'zai_fail'
            self.stats[bucket] += 1
            return outcome
        return {
            'success': False,
            'url': url,
            'error': 'No web reader available',
            'reader_used': 'none',
        }

    async def _zai_first(self, url: str, timeout: float) -> dict:
        # prefer_zai mode: Z.AI primary, Linkup as fallback on retriable errors.
        outcome = await self.zai_reader.read_webpage(url, timeout=timeout)
        if outcome.get('success'):
            self.stats['zai_success'] += 1
            outcome['reader_used'] = 'zai-primary'
            return outcome
        message = outcome.get('error', '')
        lowered = message.lower()
        retriable = (
            'timed out' in lowered
            or 'timeout' in lowered
            or 'No content returned' in message
            or 'Failed' in message
        )
        if retriable and self.linkup_reader:
            self.stats['zai_fail'] += 1
            rescue = await self.linkup_reader.read_webpage(url, timeout=timeout)
            if rescue.get('success'):
                self.stats['linkup_fallback_success'] += 1
                rescue['reader_used'] = 'linkup-fallback'
            else:
                self.stats['linkup_fallback_fail'] += 1
                rescue['reader_used'] = 'linkup-fallback-failed'
            return rescue
        # Non-retriable error or no fallback configured.
        self.stats['zai_fail'] += 1
        outcome['reader_used'] = 'zai-primary-failed'
        return outcome

    async def _linkup_first(self, url: str, timeout: float) -> dict:
        # Default mode: Linkup primary, Z.AI as fallback on retriable errors.
        outcome = await self.linkup_reader.read_webpage(url, timeout=timeout)
        if outcome.get('success'):
            self.stats['linkup_success'] += 1
            outcome['reader_used'] = 'linkup'
            return outcome
        message = outcome.get('error', '')
        lowered = message.lower()
        retriable = (
            'HTTP 400' in message
            or 'FETCH_ERROR' in message
            or 'timed out' in lowered
            or 'timeout' in lowered
            or 'No content returned' in message
        )
        if retriable and self.zai_reader:
            self.stats['linkup_fail'] += 1
            rescue = await self.zai_reader.read_webpage(url, timeout=timeout)
            if rescue.get('success'):
                self.stats['zai_fallback_success'] += 1
                rescue['reader_used'] = 'zai-fallback'
            else:
                self.stats['zai_fallback_fail'] += 1
                # Return the Z.AI error: usually more informative for debugging.
                rescue['reader_used'] = 'zai-fallback-failed'
            return rescue
        # Non-retriable error or no fallback configured.
        self.stats['linkup_fail'] += 1
        outcome['reader_used'] = 'linkup-failed'
        return outcome

    def get_stats(self) -> dict:
        """Return a copy of the counters plus derived percentage rates."""
        report = dict(self.stats)
        total = report['total_requests']
        if total > 0:
            def pct(count: int) -> str:
                return f"{count / total * 100:.1f}%"
            successes = (
                report['linkup_success']
                + report['zai_success']
                + report['zai_fallback_success']
                + report['linkup_fallback_success']
            )
            if self.prefer_zai:
                report['primary_reader'] = 'Z.AI Web Reader'
                report['zai_rate'] = pct(report['zai_success'] + report['zai_fail'])
                report['fallback_rate'] = pct(
                    report['linkup_fallback_success'] + report['linkup_fallback_fail']
                )
            else:
                report['primary_reader'] = 'Linkup'
                report['linkup_rate'] = pct(report['linkup_success'] + report['linkup_fail'])
                report['fallback_rate'] = pct(
                    report['zai_fallback_success'] + report['zai_fallback_fail']
                )
            report['overall_success_rate'] = pct(successes)
        return report
class GLMMissionExtractor:
    """
    LLM-based mission statement extractor using Z.AI GLM API.

    This provides intelligent extraction of mission, vision, and goal statements
    from webpage content, replacing naive keyword matching with semantic understanding.
    Uses Z.AI Coding Plan endpoint per Rule 11 in AGENTS.md.
    Implements Rule 36: Original Language Preservation - NO TRANSLATION.
    """
    # Language-specific prompt templates (Rule 36: Original Language Preservation)
    # Each prompt explicitly instructs to NOT translate and preserve original language
    # NOTE: these templates are runtime data sent to the LLM. The doubled
    # braces {{ }} escape str.format(), so only {content} is substituted.
    EXTRACTION_PROMPT_NL = """Je bent een expert in het analyseren van websites van erfgoedinstellingen (musea, archieven, bibliotheken, etc.).
## KRITIEK - NIET VERTALEN:
Extraheer de tekst EXACT zoals deze op de webpagina staat.
VERTAAL NIET naar een andere taal. Behoud de originele tekst in de originele taal.
Als de bron in het Nederlands is, moet de output in het Nederlands zijn.
Als de bron in het Engels is, moet de output in het Engels zijn (niet vertalen naar Nederlands).
Analyseer de volgende webpagina-inhoud en extraheer de missie, visie en/of doelstellingen van de organisatie.
## Instructies:
1. Zoek naar expliciete missie- of visie-statements
2. Let op zinnen die beginnen met "Onze missie is...", "Wij streven naar...", "Het museum heeft als doel...", etc.
3. Negeer navigatie-elementen, footer-tekst, contactgegevens, openingstijden
4. Negeer advertenties, nieuwsberichten, en evenement-aankondigingen
5. Als er GEEN duidelijke missie/visie/doelstelling te vinden is, retourneer een leeg resultaat
6. KOPIEER de tekst letterlijk - NIET PARAFRASEREN of VERTALEN
## Output Format (JSON):
Retourneer ALLEEN een JSON object in dit exacte formaat:
```json
{{
"mission": "De originele missie-tekst hier (NIET VERTAALD), of null als niet gevonden",
"vision": "De originele visie-tekst hier (NIET VERTAALD), of null als niet gevonden",
"goals": "De originele doelstellingen hier (NIET VERTAALD), of null als niet gevonden",
"confidence": 0.85,
"source_section": "Naam van de sectie waar dit gevonden is (bijv. 'Over ons', 'Missie en Visie')",
"detected_language": "nl"
}}
```
## Webpagina inhoud:
{content}
## Let op:
- Retourneer ALLEEN het JSON object, geen andere tekst
- Confidence moet tussen 0.0 en 1.0 zijn
- NOOIT VERTALEN - behoud originele taal
- Als niets gevonden: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    EXTRACTION_PROMPT_ES = """Eres un experto en analizar sitios web de instituciones patrimoniales (museos, archivos, bibliotecas, etc.).
## CRITICO - NO TRADUCIR:
Extrae el texto EXACTAMENTE como aparece en la pagina web.
NO TRADUZCAS a otro idioma. Preserva el texto original en su idioma original.
Si la fuente esta en espanol, la salida debe estar en espanol.
Si la fuente esta en ingles, la salida debe estar en ingles (no traducir al espanol).
Analiza el siguiente contenido de la pagina web y extrae la mision, vision y/o objetivos de la organizacion.
## Instrucciones:
1. Busca declaraciones explicitas de mision o vision
2. Presta atencion a frases como "Nuestra mision es...", "Tenemos como objetivo...", "El museo busca...", etc.
3. Ignora elementos de navegacion, texto de pie de pagina, informacion de contacto, horarios
4. Ignora anuncios, noticias y anuncios de eventos
5. Si NO hay una mision/vision/objetivo claro, devuelve un resultado vacio
6. COPIA el texto literalmente - NO PARAFRASEAR ni TRADUCIR
## Formato de salida (JSON):
Devuelve SOLO un objeto JSON en este formato exacto:
```json
{{
"mission": "El texto original de la mision aqui (SIN TRADUCIR), o null si no se encuentra",
"vision": "El texto original de la vision aqui (SIN TRADUCIR), o null si no se encuentra",
"goals": "Los objetivos originales aqui (SIN TRADUCIR), o null si no se encuentran",
"confidence": 0.85,
"source_section": "Nombre de la seccion donde se encontro (ej. 'Sobre nosotros', 'Mision y Vision')",
"detected_language": "es"
}}
```
## Contenido de la pagina web:
{content}
## Nota:
- Devuelve SOLO el objeto JSON, sin otro texto
- La confianza debe estar entre 0.0 y 1.0
- NUNCA TRADUCIR - preservar idioma original
- Si no se encuentra nada: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    EXTRACTION_PROMPT_PT = """Voce e um especialista em analisar sites de instituicoes patrimoniais (museus, arquivos, bibliotecas, etc.).
## CRITICO - NAO TRADUZIR:
Extraia o texto EXATAMENTE como aparece na pagina web.
NAO TRADUZA para outro idioma. Preserve o texto original em seu idioma original.
Se a fonte esta em portugues, a saida deve estar em portugues.
Se a fonte esta em ingles, a saida deve estar em ingles (nao traduzir para portugues).
Analise o seguinte conteudo da pagina web e extraia a missao, visao e/ou objetivos da organizacao.
## Instrucoes:
1. Procure declaracoes explicitas de missao ou visao
2. Preste atencao a frases como "Nossa missao e...", "Temos como objetivo...", "O museu busca...", etc.
3. Ignore elementos de navegacao, texto de rodape, informacoes de contato, horarios
4. Ignore anuncios, noticias e anuncios de eventos
5. Se NAO houver uma missao/visao/objetivo claro, retorne um resultado vazio
6. COPIE o texto literalmente - NAO PARAFRASEAR nem TRADUZIR
## Formato de saida (JSON):
Retorne APENAS um objeto JSON neste formato exato:
```json
{{
"mission": "O texto original da missao aqui (SEM TRADUZIR), ou null se nao encontrado",
"vision": "O texto original da visao aqui (SEM TRADUZIR), ou null se nao encontrado",
"goals": "Os objetivos originais aqui (SEM TRADUZIR), ou null se nao encontrados",
"confidence": 0.85,
"source_section": "Nome da secao onde foi encontrado (ex. 'Sobre nos', 'Missao e Visao')",
"detected_language": "pt"
}}
```
## Conteudo da pagina web:
{content}
## Nota:
- Retorne APENAS o objeto JSON, sem outro texto
- A confianca deve estar entre 0.0 e 1.0
- NUNCA TRADUZIR - preservar idioma original
- Se nada encontrado: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    EXTRACTION_PROMPT_DE = """Sie sind ein Experte fur die Analyse von Websites von Kulturerbe-Institutionen (Museen, Archive, Bibliotheken, etc.).
## KRITISCH - NICHT UBERSETZEN:
Extrahieren Sie den Text GENAU so, wie er auf der Webseite erscheint.
NICHT in eine andere Sprache UBERSETZEN. Bewahren Sie den Originaltext in seiner Originalsprache.
Wenn die Quelle auf Deutsch ist, muss die Ausgabe auf Deutsch sein.
Wenn die Quelle auf Englisch ist, muss die Ausgabe auf Englisch sein (nicht ins Deutsche ubersetzen).
Analysieren Sie den folgenden Webseiteninhalt und extrahieren Sie die Mission, Vision und/oder Ziele der Organisation.
## Anweisungen:
1. Suchen Sie nach expliziten Missions- oder Visionserklarungen
2. Achten Sie auf Satze wie "Unsere Mission ist...", "Wir streben an...", "Das Museum hat zum Ziel...", etc.
3. Ignorieren Sie Navigationselemente, Fusszeilen, Kontaktdaten, Offnungszeiten
4. Ignorieren Sie Werbung, Nachrichten und Veranstaltungsankundigungen
5. Wenn KEINE klare Mission/Vision/Ziel zu finden ist, geben Sie ein leeres Ergebnis zuruck
6. KOPIEREN Sie den Text wortlich - NICHT PARAPHRASIEREN oder UBERSETZEN
## Ausgabeformat (JSON):
Geben Sie NUR ein JSON-Objekt in diesem genauen Format zuruck:
```json
{{
"mission": "Der originale Missionstext hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
"vision": "Der originale Visionstext hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
"goals": "Die originalen Ziele hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
"confidence": 0.85,
"source_section": "Name des Abschnitts, in dem dies gefunden wurde (z.B. 'Uber uns', 'Mission und Vision')",
"detected_language": "de"
}}
```
## Webseiteninhalt:
{content}
## Hinweis:
- Geben Sie NUR das JSON-Objekt zuruck, keinen anderen Text
- Confidence muss zwischen 0.0 und 1.0 liegen
- NIEMALS UBERSETZEN - Originalsprache bewahren
- Wenn nichts gefunden: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    EXTRACTION_PROMPT_FR = """Vous etes un expert dans l'analyse des sites web d'institutions patrimoniales (musees, archives, bibliotheques, etc.).
## CRITIQUE - NE PAS TRADUIRE:
Extrayez le texte EXACTEMENT tel qu'il apparait sur la page web.
NE TRADUISEZ PAS dans une autre langue. Preservez le texte original dans sa langue originale.
Si la source est en francais, la sortie doit etre en francais.
Si la source est en anglais, la sortie doit etre en anglais (ne pas traduire en francais).
Analysez le contenu de la page web suivante et extrayez la mission, la vision et/ou les objectifs de l'organisation.
## Instructions:
1. Recherchez des declarations explicites de mission ou de vision
2. Faites attention aux phrases comme "Notre mission est...", "Nous visons a...", "Le musee a pour but...", etc.
3. Ignorez les elements de navigation, le texte de pied de page, les coordonnees, les horaires
4. Ignorez les publicites, les actualites et les annonces d'evenements
5. S'il n'y a PAS de mission/vision/objectif clair, retournez un resultat vide
6. COPIEZ le texte litteralement - NE PAS PARAPHRASER ni TRADUIRE
## Format de sortie (JSON):
Retournez UNIQUEMENT un objet JSON dans ce format exact:
```json
{{
"mission": "Le texte original de la mission ici (NON TRADUIT), ou null si non trouve",
"vision": "Le texte original de la vision ici (NON TRADUIT), ou null si non trouve",
"goals": "Les objectifs originaux ici (NON TRADUITS), ou null si non trouves",
"confidence": 0.85,
"source_section": "Nom de la section ou cela a ete trouve (ex. 'A propos', 'Mission et Vision')",
"detected_language": "fr"
}}
```
## Contenu de la page web:
{content}
## Note:
- Retournez UNIQUEMENT l'objet JSON, pas d'autre texte
- La confiance doit etre entre 0.0 et 1.0
- JAMAIS TRADUIRE - preserver la langue originale
- Si rien trouve: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    EXTRACTION_PROMPT_EN = """You are an expert in analyzing heritage institution websites (museums, archives, libraries, etc.).
## CRITICAL - DO NOT TRANSLATE:
Extract the text EXACTLY as it appears on the webpage.
DO NOT TRANSLATE to another language. Preserve the original text in its original language.
If the source is in English, the output must be in English.
If the source is in Dutch, the output must be in Dutch (do not translate to English).
If the source is in Spanish, the output must be in Spanish (do not translate to English).
If the source is in any other language, preserve that language.
Analyze the following webpage content and extract the mission, vision and/or goals of the organization.
## Instructions:
1. Look for explicit mission or vision statements
2. Pay attention to phrases like "Our mission is...", "We aim to...", "The museum seeks to...", etc.
3. Ignore navigation elements, footer text, contact information, opening hours
4. Ignore advertisements, news, and event announcements
5. If there is NO clear mission/vision/goal, return an empty result
6. COPY the text verbatim - DO NOT PARAPHRASE or TRANSLATE
## Output Format (JSON):
Return ONLY a JSON object in this exact format:
```json
{{
"mission": "The original mission text here (NOT TRANSLATED), or null if not found",
"vision": "The original vision text here (NOT TRANSLATED), or null if not found",
"goals": "The original goals here (NOT TRANSLATED), or null if not found",
"confidence": 0.85,
"source_section": "Name of the section where this was found (e.g., 'About us', 'Mission and Vision')",
"detected_language": "en"
}}
```
## Webpage content:
{content}
## Note:
- Return ONLY the JSON object, no other text
- Confidence must be between 0.0 and 1.0
- NEVER TRANSLATE - preserve original language
- If nothing found: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    # Japanese prompt - for JP custodians (12,096 files)
    EXTRACTION_PROMPT_JA = """あなたは文化遺産機関(博物館、図書館、文書館など)のウェブサイトを分析する専門家です。
## 重要 - 翻訳禁止:
テキストはウェブページに表示されているとおり、正確に抽出してください。
他の言語に翻訳しないでください。元のテキストを元の言語のまま保持してください。
日本語のソースは日本語で出力してください。
英語のソースは英語のまま出力してください(日本語に翻訳しないでください)。
以下のウェブページの内容を分析し、組織のミッション、ビジョン、および/または目標を抽出してください。
## 指示:
1. 明示的なミッションステートメントまたはビジョンステートメントを探してください
2. 「私たちの使命は...」「当館は...を目指しています」「○○博物館の理念」などの表現に注目してください
3. ナビゲーション要素、フッターテキスト、連絡先情報、営業時間は無視してください
4. 広告、ニュース、イベント告知は無視してください
5. 明確なミッション/ビジョン/目標がない場合は、空の結果を返してください
6. テキストをそのままコピーしてください - 言い換えや翻訳は禁止です
## 出力形式 (JSON):
以下の形式のJSONオブジェクトのみを返してください:
```json
{{
"mission": "元のミッションテキストをここに(翻訳なし)、見つからない場合はnull",
"vision": "元のビジョンテキストをここに(翻訳なし)、見つからない場合はnull",
"goals": "元の目標をここに(翻訳なし)、見つからない場合はnull",
"confidence": 0.85,
"source_section": "見つかったセクション名(例:「ご挨拶」「基本理念」「館長メッセージ」)",
"detected_language": "ja"
}}
```
## ウェブページの内容:
{content}
## 注意:
- JSONオブジェクトのみを返し、他のテキストは含めないでください
- confidenceは0.0から1.0の間でなければなりません
- 絶対に翻訳しないでください - 元の言語を保持してください
- 何も見つからない場合: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    # Czech prompt - for CZ custodians (8,432 files)
    EXTRACTION_PROMPT_CS = """Jste odbornik na analyzu webovych stranek pamatkových institucí (muzea, archivy, knihovny atd.).
## KRITICKÉ - NEPŘEKLÁDEJTE:
Extrahujte text PŘESNĚ tak, jak se objevuje na webove strance.
NEPŘEKLÁDEJTE do jineho jazyka. Zachovejte původní text v jeho původním jazyce.
Pokud je zdroj v češtině, výstup musí být v češtině.
Pokud je zdroj v angličtině, výstup musí být v angličtině (nepřekládejte do češtiny).
Analyzujte nasledující obsah webove stranky a extrahujte poslání, vizi a/nebo cíle organizace.
## Pokyny:
1. Hledejte explicitní prohlášení o poslání nebo vizi
2. Věnujte pozornost frazím jako "Naším posláním je...", "Usilujeme o...", "Muzeum si klade za cíl...", atd.
3. Ignorujte navigační prvky, text zapati, kontaktní údaje, otevírací dobu
4. Ignorujte reklamy, novinky a oznámení o akcích
5. Pokud NENÍ žádné jasné poslání/vize/cíl, vraťte prazdny vysledek
6. ZKOPÍRUJTE text doslovně - NEPARAFRÁZUJTE ani NEPŘEKLÁDEJTE
## Formát výstupu (JSON):
Vraťte POUZE objekt JSON v tomto přesném formátu:
```json
{{
"mission": "Původní text poslání zde (NEPŘELOŽENO), nebo null pokud nenalezeno",
"vision": "Původní text vize zde (NEPŘELOŽENO), nebo null pokud nenalezeno",
"goals": "Původní cíle zde (NEPŘELOŽENO), nebo null pokud nenalezeno",
"confidence": 0.85,
"source_section": "Název sekce, kde bylo nalezeno (např. 'O nas', 'Poslání a vize')",
"detected_language": "cs"
}}
```
## Obsah webové stránky:
{content}
## Poznámka:
- Vraťte POUZE objekt JSON, žádný jiný text
- Confidence musí být mezi 0.0 a 1.0
- NIKDY NEPŘEKLÁDEJTE - zachovejte původní jazyk
- Pokud nic nenalezeno: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    # Italian prompt - for IT custodians
    EXTRACTION_PROMPT_IT = """Sei un esperto nell'analisi di siti web di istituzioni del patrimonio culturale (musei, archivi, biblioteche, ecc.).
## CRITICO - NON TRADURRE:
Estrai il testo ESATTAMENTE come appare sulla pagina web.
NON TRADURRE in un'altra lingua. Preserva il testo originale nella sua lingua originale.
Se la fonte e in italiano, l'output deve essere in italiano.
Se la fonte e in inglese, l'output deve essere in inglese (non tradurre in italiano).
Analizza il seguente contenuto della pagina web ed estrai la missione, la visione e/o gli obiettivi dell'organizzazione.
## Istruzioni:
1. Cerca dichiarazioni esplicite di missione o visione
2. Presta attenzione a frasi come "La nostra missione e...", "Miriamo a...", "Il museo si propone di...", ecc.
3. Ignora elementi di navigazione, testo a pie di pagina, informazioni di contatto, orari di apertura
4. Ignora pubblicita, notizie e annunci di eventi
5. Se NON c'e una chiara missione/visione/obiettivo, restituisci un risultato vuoto
6. COPIA il testo letteralmente - NON PARAFRASARE ne TRADURRE
## Formato di output (JSON):
Restituisci SOLO un oggetto JSON in questo formato esatto:
```json
{{
"mission": "Il testo originale della missione qui (NON TRADOTTO), o null se non trovato",
"vision": "Il testo originale della visione qui (NON TRADOTTO), o null se non trovato",
"goals": "Gli obiettivi originali qui (NON TRADOTTI), o null se non trovati",
"confidence": 0.85,
"source_section": "Nome della sezione dove e stato trovato (es. 'Chi siamo', 'Missione e Visione')",
"detected_language": "it"
}}
```
## Contenuto della pagina web:
{content}
## Nota:
- Restituisci SOLO l'oggetto JSON, nessun altro testo
- La confidence deve essere tra 0.0 e 1.0
- MAI TRADURRE - preservare la lingua originale
- Se nulla trovato: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
    # Map language codes to prompts
    # Keys are ISO 639-1 codes; lookups fall back to EXTRACTION_PROMPT below.
    LANGUAGE_PROMPTS = {
        'nl': EXTRACTION_PROMPT_NL,
        'es': EXTRACTION_PROMPT_ES,
        'pt': EXTRACTION_PROMPT_PT,
        'de': EXTRACTION_PROMPT_DE,
        'fr': EXTRACTION_PROMPT_FR,
        'en': EXTRACTION_PROMPT_EN,
        'ja': EXTRACTION_PROMPT_JA,
        'cs': EXTRACTION_PROMPT_CS,
        'it': EXTRACTION_PROMPT_IT,
    }
    # Default prompt for languages without specific template
    EXTRACTION_PROMPT = EXTRACTION_PROMPT_EN  # Fallback to English prompt
def __init__(self, api_token: str, model: str = ZAI_GLM_MODEL):
self.api_token = api_token
self.model = model
self.headers = {
"Authorization": f"Bearer {api_token}",
"Content-Type": "application/json",
}
# Language-specific system messages (Rule 36: preserve original language)
SYSTEM_MESSAGES = {
'nl': "Je bent een assistent die JSON-gestructureerde data extraheert uit webpagina's. Antwoord ALLEEN met valid JSON. KRITIEK: Vertaal NOOIT de geëxtraheerde tekst - behoud de originele taal.",
'es': "Eres un asistente que extrae datos estructurados en JSON de paginas web. Responde SOLO con JSON valido. CRITICO: NUNCA traduzcas el texto extraido - preserva el idioma original.",
'pt': "Voce e um assistente que extrai dados estruturados em JSON de paginas web. Responda APENAS com JSON valido. CRITICO: NUNCA traduza o texto extraido - preserve o idioma original.",
'de': "Sie sind ein Assistent, der JSON-strukturierte Daten aus Webseiten extrahiert. Antworten Sie NUR mit validem JSON. KRITISCH: Übersetzen Sie NIEMALS den extrahierten Text - bewahren Sie die Originalsprache.",
'fr': "Vous etes un assistant qui extrait des donnees structurees JSON des pages web. Repondez UNIQUEMENT avec du JSON valide. CRITIQUE: Ne traduisez JAMAIS le texte extrait - preservez la langue originale.",
'en': "You are an assistant that extracts JSON-structured data from webpages. Respond ONLY with valid JSON. CRITICAL: NEVER translate the extracted text - preserve the original language.",
'ja': "あなたはウェブページからJSON形式のデータを抽出するアシスタントです。有効なJSONのみで応答してください。重要: 抽出したテキストは絶対に翻訳しないでください - 元の言語を保持してください。",
'cs': "Jste asistent, ktery extrahuje JSON strukturovana data z webovych stranek. Odpovezte POUZE validnim JSON. KRITICKE: NIKDY neprekladejte extrahovany text - zachovejte puvodni jazyk.",
'it': "Sei un assistente che estrae dati strutturati JSON dalle pagine web. Rispondi SOLO con JSON valido. CRITICO: NON tradurre MAI il testo estratto - preserva la lingua originale.",
}
async def extract_mission_from_content(
self,
content: str,
source_url: str,
language: str = 'en',
timeout: float = 60.0
) -> dict:
"""
Use LLM to extract mission statement from webpage content.
Implements Rule 36: Original Language Preservation - NO TRANSLATION.
Args:
content: The webpage text content (markdown or plain text)
source_url: URL of the source page (for context)
language: ISO 639-1 language code (e.g., 'nl', 'es', 'de') for prompt selection
timeout: Request timeout in seconds
Returns:
dict with keys: success, mission, vision, goals, confidence, detected_language, error
"""
# Truncate content if too long (GLM has context limits)
max_chars = 12000
if len(content) > max_chars:
content = content[:max_chars] + "\n\n[... content truncated ...]"
# Select language-appropriate prompt (Rule 36: Original Language Preservation)
prompt_template = self.LANGUAGE_PROMPTS.get(language, self.EXTRACTION_PROMPT)
prompt = prompt_template.format(content=content)
# Select language-appropriate system message
system_message = self.SYSTEM_MESSAGES.get(language, self.SYSTEM_MESSAGES['en'])
request_body = {
"model": self.model,
"messages": [
{
"role": "system",
"content": system_message
},
{
"role": "user",
"content": prompt
}
],
"temperature": 0.1, # Low temperature for consistent extraction
"max_tokens": 2048,
}
async with httpx.AsyncClient(timeout=timeout) as client:
try:
response = await client.post(
ZAI_GLM_API_URL,
headers=self.headers,
json=request_body
)
if response.status_code != 200:
return {
"success": False,
"error": f"API error {response.status_code}: {response.text[:200]}",
}
result = response.json()
# Extract the assistant's response
if "choices" not in result or len(result["choices"]) == 0:
return {
"success": False,
"error": "No response from API",
}
assistant_message = result["choices"][0]["message"]["content"]
# Parse JSON from response
# Handle markdown code blocks if present
json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', assistant_message)
if json_match:
json_str = json_match.group(1)
else:
json_str = assistant_message.strip()
try:
extracted = json.loads(json_str)
except json.JSONDecodeError as e:
return {
"success": False,
"error": f"Failed to parse JSON response: {e}",
"raw_response": assistant_message[:500],
}
# Validate and return
return {
"success": True,
"mission": extracted.get("mission"),
"vision": extracted.get("vision"),
"goals": extracted.get("goals"),
"confidence": extracted.get("confidence", 0.0),
"source_section": extracted.get("source_section"),
"model": self.model,
}
except httpx.TimeoutException:
return {
"success": False,
"error": "Request timed out",
}
except Exception as e:
return {
"success": False,
"error": str(e),
}
def _website_from_custodian(data: dict) -> Optional[str]:
    """Return the best website URL recorded in a custodian record, or None.

    Sources are tried in priority order: the direct ``website`` field, the
    original-entry ``webadres_organisatie``, the enrichment blocks (museum
    register, Wikidata, Google Maps), the location object, and finally any
    identifier arrays using the 'Website' scheme.
    """
    # 1. Direct website field
    if data.get('website'):
        return data['website']
    # 2-6. Nested single-value sources, in priority order.
    nested_sources = (
        ('original_entry', 'webadres_organisatie'),
        ('museum_register_enrichment', 'website_url'),
        ('wikidata_enrichment', 'official_website'),
        ('google_maps_enrichment', 'website'),
        ('location', 'website'),
    )
    for section_key, field in nested_sources:
        section = data.get(section_key)
        if isinstance(section, dict) and section.get(field):
            return section[field]
    # 7-8. Identifier arrays with a 'Website' scheme:
    # original-entry identifiers first, then top-level identifiers.
    identifier_lists = []
    original_entry = data.get('original_entry')
    if isinstance(original_entry, dict):
        identifier_lists.append(original_entry.get('identifiers', []))
    identifier_lists.append(data.get('identifiers', []))
    for identifiers in identifier_lists:
        for ident in identifiers:
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
                candidate = ident.get('identifier_value') or ident.get('identifier_url')
                if candidate:
                    return candidate
    return None
def find_custodians_with_websites(
    prefix: Optional[str] = None,
    limit: Optional[int] = None,
    skip_existing: bool = False
) -> list[tuple[Path, dict, str]]:
    """
    Find custodian YAML files that have website URLs.

    Args:
        prefix: Filter by GHCID prefix (e.g., "NL-NH" for Noord-Holland)
        limit: Maximum number of custodians to return
        skip_existing: If True, skip custodians that already have mission_statement

    Returns:
        List of (path, custodian_data, website_url) tuples, in sorted
        filename order.
    """
    custodian_dir = PROJECT_ROOT / "data" / "custodian"
    results = []
    pattern = f"{prefix}*.yaml" if prefix else "NL-*.yaml"
    # FIX: glob() order is filesystem-dependent, which made --test/--limit
    # runs non-reproducible; sorting gives deterministic selection.
    for yaml_path in sorted(custodian_dir.glob(pattern)):
        if limit and len(results) >= limit:
            break
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not data:
                continue
            # Skip if already has mission statements and skip_existing is True
            if skip_existing and data.get('mission_statement'):
                continue
            website = _website_from_custodian(data)
            # Only accept absolute http(s) URLs.
            if website and website.startswith('http'):
                results.append((yaml_path, data, website))
        except Exception as e:
            print(f"Warning: Failed to parse {yaml_path}: {e}", file=sys.stderr)
    return results
def discover_mission_page_urls(base_url: str, language: str = 'en') -> list[str]:
    """
    Build an ordered list of candidate URLs for mission/vision pages.

    Language-specific URL patterns come first, then the English patterns as
    a fallback for non-English sites (skipping duplicates), and finally the
    homepage itself, since it may also carry the mission text.

    Args:
        base_url: The custodian's main website URL
        language: ISO 639-1 language code (e.g., 'nl', 'ja', 'cs')

    Returns:
        List of URLs to check for mission content
    """
    parts = urlparse(base_url)
    # Upgrade plain http to https; leave any other scheme untouched.
    preferred_scheme = parts.scheme if parts.scheme != 'http' else 'https'
    site_root = f"{preferred_scheme}://{parts.netloc}"
    # Language-specific patterns are tried first.
    candidates: list[str] = [
        urljoin(site_root, path)
        for path in LANGUAGE_URL_PATTERNS.get(language, [])
    ]
    # Many international sites also publish English about/mission pages.
    if language != 'en':
        for path in ENGLISH_MISSION_PATTERNS:
            fallback = urljoin(site_root, path)
            if fallback not in candidates:
                candidates.append(fallback)
    if base_url not in candidates:
        candidates.append(base_url)
    return candidates
# Keywords to look for in links when discovering mission pages (multilingual)
# Runtime data: lowercase path fragments matched as substrings against the
# lowercased URL path by filter_mission_links().
MISSION_LINK_KEYWORDS = [
    # Dutch
    'missie', 'visie', 'over-ons', 'over', 'organisatie', 'doelstelling',
    'wie-zijn-wij', 'wie-we-zijn', 'onze-missie', 'het-museum', 'het-archief',
    'de-bibliotheek', 'stichting', 'vereniging', 'kernwaarden', 'ambitie',
    # Spanish
    'mision', 'vision', 'sobre-nosotros', 'quienes-somos', 'institucional',
    'historia', 'el-museo', 'la-biblioteca', 'el-archivo', 'acerca-de',
    'nuestra-mision', 'conocenos', 'nosotros',
    # Portuguese
    'missao', 'visao', 'sobre', 'sobre-nos', 'quem-somos', 'o-museu',
    'a-biblioteca', 'o-arquivo', 'nossa-missao', 'conheca',
    # German
    'leitbild', 'ueber-uns', 'uber-uns', 'das-museum', 'wir-ueber-uns',
    # French
    'a-propos', 'qui-sommes-nous', 'notre-mission', 'le-musee', 'presentation',
    # English
    'about', 'about-us', 'mission', 'vision', 'organization', 'who-we-are',
    # Japanese (romanized)
    'gaiyo', 'gaiyou', 'rinen', 'aisatsu', 'enkaku', 'greeting', 'message',
    'introduction', 'outline', 'profile', 'history',
    # Czech
    'o-nas', 'o-knihovne', 'o-muzeu', 'o-archivu', 'poslani', 'historie',
    'zakladni-informace', 'profil',
    # Italian
    'chi-siamo', 'la-missione', 'missione', 'storia', 'il-museo', 'presentazione',
]
def extract_links_from_markdown(content: str, base_url: str) -> list[str]:
    """
    Collect every hyperlink found in a chunk of markdown text.

    Both ``[label](target)`` markdown links and bare ``http(s)://`` URLs in
    the prose are picked up. Relative markdown targets are resolved against
    *base_url*; in-page anchors and mailto:/tel: schemes are dropped. Order
    of appearance is preserved (markdown links first, then any bare URLs
    not already collected).

    Args:
        content: Markdown text content
        base_url: Base URL for resolving relative links

    Returns:
        List of absolute URLs found in the content
    """
    found: list[str] = []
    # Pass 1: explicit markdown links of the form [text](target).
    for md_match in re.finditer(r'\[([^\]]*)\]\(([^)]+)\)', content):
        target = md_match.group(2).strip()
        if not target:
            continue
        # Anchors and non-navigable schemes are not fetchable pages.
        if target.startswith(('#', 'mailto:', 'tel:')):
            continue
        absolute = target if target.startswith('http') else urljoin(base_url, target)
        found.append(absolute)
    # Pass 2: bare URLs written directly in the text (deduplicated against
    # pass 1); trailing sentence punctuation is stripped.
    for raw_match in re.finditer(r'https?://[^\s<>\)\]"\']+', content):
        candidate = raw_match.group(0).rstrip('.,;:')
        if candidate not in found:
            found.append(candidate)
    return found
# URL path patterns to EXCLUDE from mission link discovery
# These are false positives - URLs that contain mission keywords but aren't about pages
# Runtime data: lowercase substrings matched against the full lowercased URL
# by filter_mission_links().
EXCLUDE_URL_PATTERNS = [
    # Library catalogs (Belgian bibliotheek.be, Dutch, etc.)
    '/catalogus/',  # Dutch/Belgian library catalogs
    '/catalog/',  # English library catalogs
    '/catalogue/',  # French library catalogs
    '/katalog/',  # German/Czech library catalogs
    '/search-history',  # Library search history pages
    '/recover-password',  # Password recovery pages
    '/login',  # Login pages
    '/account/',  # Account pages
    '/my-account',  # Account pages
    '/mijn-account',  # Dutch account pages
    '/cart/',  # Shopping cart
    '/checkout/',  # Checkout
    '/winkelwagen',  # Dutch shopping cart
    # Book/item detail pages (often have "over" in Dutch book titles)
    'library-marc',  # Library MARC records
    'vlacc',  # Flemish library consortium records
    'library-marc-vlacc',  # Combined pattern
    # Media and file pages
    '/download/',  # Download pages
    '/uploads/',  # Upload directories
    '/files/',  # File directories
    '/media/',  # Media directories (unless part of museum name)
    '/assets/',  # Asset directories
    # Administrative and utility pages
    '/admin/',  # Admin pages
    '/wp-admin/',  # WordPress admin
    '/wp-content/',  # WordPress content
    '/wp-includes/',  # WordPress includes
    '/cgi-bin/',  # CGI scripts
    '/api/',  # API endpoints
    '/_next/',  # Next.js internal
    '/_nuxt/',  # Nuxt.js internal
    # Social and external
    '/share/',  # Share pages
    '/print/',  # Print pages
    '/email/',  # Email pages
    '/rss',  # RSS feeds
    '/feed',  # Feed pages
    # E-commerce patterns
    '/product/',  # Product pages
    '/products/',  # Product listings
    '/shop/',  # Shop pages
    '/store/',  # Store pages
    '/bestellen/',  # Dutch ordering
    '/reserveren/',  # Dutch reservation
    # Japanese specific exclusions
    '/search',  # Search pages
    '/result',  # Search result pages
    '/ebook/',  # E-book pages
    '/overdrive',  # Overdrive e-book service
]
def filter_mission_links(links: list[str], base_domain: str) -> list[str]:
    """
    Filter links to only those likely to contain mission/vision content.

    Args:
        links: List of URLs to filter
        base_domain: Domain of the custodian website (only keep same-domain links)

    Returns:
        List of URLs that likely contain mission content (deduplicated,
        in input order)
    """
    base = base_domain.lower()
    mission_urls: list[str] = []
    for url in links:
        try:
            parsed = urlparse(url)
        except ValueError:
            # Skip malformed URLs (e.g., invalid IPv6)
            continue
        # Same-domain check: exact host or a subdomain of it. FIX: the
        # previous substring test (`base_domain in parsed.netloc`) wrongly
        # accepted unrelated hosts such as "notmuseum.nl" for base
        # "museum.nl", and was case-sensitive on the link's netloc.
        # Relative links (empty netloc) are kept, as before.
        netloc = parsed.netloc.lower()
        if netloc and netloc != base and not netloc.endswith('.' + base):
            continue
        path_lower = parsed.path.lower()
        url_lower = url.lower()
        # EXCLUSION CHECK: skip known false positives (catalogs, logins,
        # feeds, shops, ...). The path is a substring of the full URL, so
        # matching against the full URL covers both.
        if any(pattern in url_lower for pattern in EXCLUDE_URL_PATTERNS):
            continue
        # Keep URLs whose path mentions a mission/about keyword.
        if any(keyword in path_lower for keyword in MISSION_LINK_KEYWORDS):
            if url not in mission_urls:
                mission_urls.append(url)
    return mission_urls
async def discover_mission_links_from_homepage(
    reader: Union['LinkupWebReader', 'ZAIWebReader', 'CompositeWebReader'],
    homepage_url: str,
    verbose: bool = False
) -> tuple[list[str], str, str]:
    """
    Fetch homepage and discover links to mission/vision pages.

    This is more reliable than guessing URL patterns because it finds
    the actual links used by the website.

    Args:
        reader: Web reader instance
        homepage_url: The custodian's homepage URL
        verbose: Whether to print progress

    Returns:
        Tuple of (discovered_urls, homepage_content, retrieved_on)
        Returns ([], '', '') if homepage fetch fails
    """
    # Fetch homepage
    result = await reader.read_webpage(homepage_url)
    # Use .get for consistency with the other reader-result consumers.
    if not result.get('success'):
        if verbose:
            # FIX: 'error' may be present but None; .get's default does not
            # cover that case and slicing None raised a TypeError.
            error_text = result.get('error') or 'Unknown'
            print(f" Homepage fetch failed: {error_text[:50]}")
        return [], '', ''
    content = result.get('content', '')
    retrieved_on = result.get('retrieved_on', datetime.now(timezone.utc).isoformat())
    if not content:
        return [], content, retrieved_on
    # Extract base domain for filtering (lowercased to match link netlocs)
    parsed = urlparse(homepage_url)
    base_domain = parsed.netloc.lower()
    # Extract all links from homepage
    all_links = extract_links_from_markdown(content, homepage_url)
    if verbose:
        print(f" Found {len(all_links)} links on homepage")
    # Filter to mission-related links
    mission_links = filter_mission_links(all_links, base_domain)
    if verbose and mission_links:
        print(f" Found {len(mission_links)} mission-related links:")
        for link in mission_links[:5]:  # Show first 5
            print(f" - {link}")
    return mission_links, content, retrieved_on
def extract_statements_from_content(
    content: str,
    source_url: str,
    retrieved_on: str,
    ghcid: str,
) -> list[dict]:
    """
    Extract mission, vision, and goal statements from webpage content.

    This uses keyword matching and section detection. For production,
    consider using an LLM for more intelligent extraction.

    Args:
        content: The webpage text content
        source_url: URL of the source page
        retrieved_on: ISO timestamp when page was retrieved
        ghcid: GHCID of the custodian

    Returns:
        List of mission statement dictionaries
    """
    statements = []
    content_lower = content.lower()
    # Skip error pages (404, 500, etc.)
    # NOTE(review): 'error' and '404' are broad substrings; a legitimate page
    # mentioning them in its first 500 chars is skipped entirely.
    error_indicators = [
        'pagina niet gevonden', 'page not found', '404',
        'niet gevonden', 'not found', 'error', 'fout',
        'deze pagina bestaat niet', 'this page does not exist'
    ]
    # Check title and first 500 chars for error indicators
    if any(indicator in content_lower[:500] for indicator in error_indicators):
        return []
    # Also check if content looks like raw JSON (Z.AI sometimes returns this)
    if content.strip().startswith('{"') or content.strip().startswith('"{'):
        return []
    # Check if this page has mission-related content anywhere in the text;
    # bail out early if none of the keyword groups match.
    has_mission_content = any(
        keyword in content_lower
        for keywords in MISSION_KEYWORDS.values()
        for keyword in keywords
    )
    if not has_mission_content:
        return []
    # Split content into sections (by blank lines, markdown headings, or
    # bold-only lines, which typically delimit homepage blocks).
    sections = re.split(r'\n\s*\n|\n#+\s+|\n\*\*[^*]+\*\*\n', content)
    for section in sections:
        section = section.strip()
        if len(section) < 20:  # Skip very short sections
            continue
        section_lower = section.lower()
        # Detect statement type based on keywords; a keyword near the start
        # of the section (likely a heading) earns higher confidence.
        statement_type = None
        confidence = 0.7
        for stype, keywords in MISSION_KEYWORDS.items():
            for keyword in keywords:
                if keyword in section_lower[:200]:  # Check beginning of section
                    statement_type = stype
                    confidence = 0.85 if keyword in section_lower[:50] else 0.75
                    break
            if statement_type:
                break
        if not statement_type:
            continue
        # Clean up the section text
        # Remove markdown formatting: bold, italics, headings, and links
        # (keeping link text, dropping URLs).
        clean_text = re.sub(r'\*\*([^*]+)\*\*', r'\1', section)
        clean_text = re.sub(r'\*([^*]+)\*', r'\1', clean_text)
        clean_text = re.sub(r'#+\s*', '', clean_text)
        clean_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', clean_text)
        clean_text = clean_text.strip()
        if len(clean_text) < 20:
            continue
        # Skip boilerplate/footer content (Dutch and English variants).
        boilerplate_indicators = [
            '©', 'copyright', 'all rights reserved', 'alle rechten voorbehouden',
            'privacybeleid', 'privacy policy', 'cookie', 'algemene voorwaarden',
            'terms and conditions', 'nieuwsbrief', 'newsletter', 'subscribe',
            'volg ons', 'follow us', 'social media', 'facebook', 'instagram',
            'twitter', 'linkedin', 'youtube', 'contact', 'openingstijden',
            'opening hours', 'bereikbaarheid', 'route', 'adres:', 'address:',
        ]
        clean_lower = clean_text.lower()
        boilerplate_count = sum(1 for ind in boilerplate_indicators if ind in clean_lower)
        # If more than 2 boilerplate indicators in a short text, skip it
        if boilerplate_count >= 2 and len(clean_text) < 200:
            continue
        # If the text is primarily copyright/footer (starts with ©)
        if clean_text.strip().startswith('©'):
            continue
        # Skip navigation/intro text (too short to be actual mission content)
        # Actual mission statements are usually at least 50 characters
        if len(clean_text) < 50:
            continue
        # Skip text that looks like a link/intro (e.g., "Lees alles over...")
        skip_patterns = [
            r'^lees\s+(alles\s+)?over',
            r'^klik\s+hier',
            r'^meer\s+(info|informatie)',
            r'^bekijk\s+',
            r'^ga\s+naar',
            r'^read\s+(more\s+)?about',
            r'^click\s+here',
            r'^view\s+',
        ]
        if any(re.match(pattern, clean_lower) for pattern in skip_patterns):
            continue
        # Generate a deterministic statement ID keyed on custodian, type, year.
        year = datetime.now().year
        statement_id = f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{statement_type}-{year}"
        # Compute content hash (used downstream to detect text changes).
        content_hash = compute_content_hash(clean_text)
        # Current timestamp for statement creation (Rule 35: Dual Timestamps)
        statement_created_at = datetime.now(timezone.utc).isoformat()
        statement = {
            'statement_id': statement_id,
            'statement_type': statement_type,
            'statement_text': clean_text,
            'statement_language': get_language_from_ghcid(ghcid),  # Detect from GHCID country
            'extracted_verbatim': True,  # Rule 36: Confirms no translation occurred
            'source_url': source_url,
            'content_hash': content_hash,
            # Rule 35: Dual Timestamp Provenance
            'provenance': {
                'statement_created_at': statement_created_at,  # When this claim was extracted
                'source_archived_at': retrieved_on,  # When the webpage was fetched
                'retrieval_agent': 'linkup-api',
                'extraction_agent': 'keyword-matching/batch',
                'extraction_confidence': confidence,
                # W3C PROV-O compatible fields
                'prov:wasDerivedFrom': source_url,
                'prov:generatedAtTime': statement_created_at,
            }
        }
        statements.append(statement)
    return statements
async def extract_statements_with_llm(
    llm_extractor: GLMMissionExtractor,
    content: str,
    source_url: str,
    retrieved_on: str,
    ghcid: str,
) -> list[dict]:
    """
    Extract mission, vision, and goal statements using LLM (Z.AI GLM).

    This provides much better quality extraction than keyword matching
    by using semantic understanding of the content.

    Args:
        llm_extractor: GLMMissionExtractor instance
        content: The webpage text content
        source_url: URL of the source page
        retrieved_on: ISO timestamp when page was retrieved
        ghcid: GHCID of the custodian

    Returns:
        List of mission statement dictionaries
    """
    # Quick pre-filter: skip obvious error pages
    content_lower = content.lower()
    error_indicators = [
        'pagina niet gevonden', 'page not found', '404',
        'niet gevonden', 'not found', 'deze pagina bestaat niet',
        'oeps', 'error', 'no routes match', 'routing error'
    ]
    if any(indicator in content_lower[:500] for indicator in error_indicators):
        return []
    # Skip raw JSON responses
    if content.strip().startswith('{"') or content.strip().startswith('"{'):
        return []
    # Skip very short content (likely empty page)
    if len(content.strip()) < 200:
        return []
    # Expected language from GHCID country code (Rule 36: Original Language
    # Preservation). Computed once and reused both for the extraction prompt
    # and for the language-mismatch check below (the original code called
    # get_language_from_ghcid twice for the same value).
    expected_language = get_language_from_ghcid(ghcid)
    # Call LLM for extraction with language-specific prompt
    result = await llm_extractor.extract_mission_from_content(
        content=content,
        source_url=source_url,
        language=expected_language
    )
    if not result['success']:
        return []
    statements = []
    year = datetime.now().year
    # Use detected language from LLM if available, else fall back to expected
    detected_language = result.get('detected_language') or expected_language
    # Process each statement type returned by the LLM
    for statement_type in ['mission', 'vision', 'goals']:
        text = result.get(statement_type)
        # The LLM may return None, the literal string 'null', or trivially
        # short snippets; all are treated as "not found".
        if not text or text == 'null' or len(str(text).strip()) < 20:
            continue
        # Map 'goals' to 'goal' for consistency with schema
        schema_type = 'goal' if statement_type == 'goals' else statement_type
        # Generate a deterministic statement ID keyed on custodian, type, year
        statement_id = f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{schema_type}-{year}"
        # Compute content hash (used downstream to detect text changes)
        content_hash = compute_content_hash(str(text))
        # Current timestamp for statement creation (Rule 35: Dual Timestamps)
        statement_created_at = datetime.now(timezone.utc).isoformat()
        statement = {
            'statement_id': statement_id,
            'statement_type': schema_type,
            'statement_text': str(text).strip(),
            'statement_language': detected_language,  # Use LLM-detected language
            'extracted_verbatim': True,  # Rule 36: Confirms no translation occurred
            'source_url': source_url,
            'content_hash': content_hash,
            # Rule 35: Dual Timestamp Provenance
            'provenance': {
                'statement_created_at': statement_created_at,  # When this claim was extracted
                'source_archived_at': retrieved_on,  # When the webpage was fetched
                'retrieval_agent': 'linkup-api',
                'extraction_agent': f'zai-glm/{result.get("model", ZAI_GLM_MODEL)}',
                'extraction_confidence': result.get('confidence', 0.0),
                # W3C PROV-O compatible fields
                'prov:wasDerivedFrom': source_url,
                'prov:generatedAtTime': statement_created_at,
            }
        }
        # Add source section if available
        if result.get('source_section'):
            statement['source_section'] = result['source_section']
        # Document language mismatch if detected language differs from expected (Rule 36)
        if detected_language != expected_language:
            statement['language_note'] = f"Content in {detected_language}, expected {expected_language} based on GHCID country code"
        statements.append(statement)
    return statements
def update_custodian_yaml(
    yaml_path: Path,
    custodian_data: dict,
    statements: list[dict],
    dry_run: bool = False
) -> bool:
    """
    Merge extracted mission statements into a custodian YAML file.

    Statements whose ``statement_id`` already exists in the file are
    skipped; everything else is appended to ``mission_statement``.

    Args:
        yaml_path: Path to the custodian YAML file
        custodian_data: Current custodian data (mutated in place)
        statements: List of extracted statements
        dry_run: If True, don't write changes

    Returns:
        True if at least one statement was added (and, unless dry_run,
        the file was written successfully).
    """
    if not statements:
        return False

    # Ensure the target list exists, then index already-present IDs.
    existing = custodian_data.setdefault('mission_statement', [])
    seen_ids = {
        entry.get('statement_id') for entry in existing
        if isinstance(entry, dict)
    }

    new_entries = [s for s in statements if s['statement_id'] not in seen_ids]
    existing.extend(new_entries)
    if not new_entries:
        return False

    if dry_run:
        print(f" Would add {len(new_entries)} statements to {yaml_path.name}")
        return True

    # Persist the merged document back to disk.
    try:
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(
                custodian_data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120
            )
    except Exception as e:
        print(f" Error writing {yaml_path.name}: {e}", file=sys.stderr)
        return False
    print(f" Added {len(new_entries)} statements to {yaml_path.name}")
    return True
async def process_custodian(
    reader: Union[LinkupWebReader, ZAIWebReader, CompositeWebReader],
    yaml_path: Path,
    custodian_data: dict,
    website: str,
    dry_run: bool = False,
    verbose: bool = False,
    llm_extractor: Optional[GLMMissionExtractor] = None,
) -> dict:
    """
    Process a single custodian: discover pages, fetch content, extract statements.

    IMPROVED: Now uses two-phase discovery:
    1. First fetch homepage and extract actual mission page links from navigation
    2. Fall back to URL pattern guessing only if no links found

    Args:
        reader: Web reader instance (Linkup, ZAI, or Composite with fallback)
        yaml_path: Path to custodian YAML file
        custodian_data: Current custodian data
        website: Website URL to process
        dry_run: If True, don't write changes
        verbose: If True, show detailed progress
        llm_extractor: Optional LLM extractor for intelligent extraction

    Returns:
        dict with processing results (counts, discovery method, errors)
    """
    ghcid = yaml_path.stem.split('-')[0:5]  # Extract base GHCID from filename
    # NOTE(review): ghcid is already at most 5 segments here, so the extra
    # [:5] slice below is redundant (harmless).
    ghcid = '-'.join(ghcid[:5]) if len(ghcid) >= 5 else yaml_path.stem
    # Get name for display: prefer the emic (self-declared) name, then a
    # plain 'name' field, then the GHCID itself.
    name = custodian_data.get('custodian_name', {}).get('emic_name')
    if not name:
        name = custodian_data.get('name', ghcid)
    # Accumulator for per-custodian processing statistics.
    result = {
        'ghcid': ghcid,
        'name': name,
        'website': website,
        'pages_checked': 0,
        'pages_with_content': 0,
        'statements_found': 0,
        'statements_added': 0,
        'discovery_method': 'none',
        'errors': [],
    }
    if verbose:
        print(f"\nProcessing {ghcid}: {name}")
        print(f" Website: {website}")
    all_statements = []
    homepage_content = None
    homepage_retrieved_on = None
    # PHASE 1: Discover mission pages from homepage links (preferred method)
    if verbose:
        print(f" Phase 1: Discovering mission pages from homepage...")
    discovered_links, homepage_content, homepage_retrieved_on = await discover_mission_links_from_homepage(
        reader, website, verbose
    )
    result['pages_checked'] += 1  # Homepage was fetched
    if discovered_links:
        result['discovery_method'] = 'homepage_links'
        candidate_urls = discovered_links[:5]  # Limit to 5 discovered links
        if verbose:
            print(f" Using {len(candidate_urls)} discovered mission links")
    else:
        # PHASE 2: Fall back to URL pattern guessing
        result['discovery_method'] = 'pattern_guessing'
        if verbose:
            print(f" Phase 2: No mission links found, falling back to URL patterns...")
        # Use language-specific patterns based on GHCID country code
        language = get_language_from_ghcid(ghcid)
        candidate_urls = discover_mission_page_urls(website, language=language)[:5]
    # First, try to extract from homepage content (if we have it).
    # The 200-char floor filters out empty/near-empty fetch results.
    if homepage_content and len(homepage_content) > 200:
        result['pages_with_content'] += 1
        if llm_extractor:
            statements = await extract_statements_with_llm(
                llm_extractor, homepage_content, website,
                homepage_retrieved_on or datetime.now(timezone.utc).isoformat(), ghcid
            )
            if verbose and statements:
                print(f" [LLM] Found {len(statements)} statements on homepage")
        else:
            statements = extract_statements_from_content(
                homepage_content, website,
                homepage_retrieved_on or datetime.now(timezone.utc).isoformat(), ghcid
            )
            if verbose and statements:
                print(f" [Keyword] Found {len(statements)} statements on homepage")
        if statements:
            all_statements.extend(statements)
            # If we found a mission statement on homepage with high confidence, skip dedicated pages
            # (unless using keyword extraction which has lower accuracy)
            # Helper to get confidence (handles nested provenance structure)
            def get_stmt_confidence(s):
                if 'provenance' in s and 'extraction_confidence' in s['provenance']:
                    return s['provenance']['extraction_confidence']
                return s.get('extraction_confidence', 0)
            if llm_extractor and any(s['statement_type'] == 'mission' and get_stmt_confidence(s) > 0.7 for s in statements):
                if verbose:
                    print(f" Found high-confidence mission on homepage, skipping dedicated pages")
                result['discovery_method'] = 'homepage_content'
                result['statements_found'] = len(all_statements)
                # Deduplicate and return early: keep one statement per type,
                # preferring the higher extraction confidence.
                unique_statements = {}
                for stmt in all_statements:
                    stype = stmt['statement_type']
                    if stype not in unique_statements or get_stmt_confidence(stmt) > get_stmt_confidence(unique_statements[stype]):
                        unique_statements[stype] = stmt
                final_statements = list(unique_statements.values())
                if final_statements:
                    if update_custodian_yaml(yaml_path, custodian_data, final_statements, dry_run):
                        result['statements_added'] = len(final_statements)
                return result
    # Check candidate mission page URLs - PARALLEL FETCHING for speed
    # Filter out homepage (already processed)
    urls_to_check = [url for url in candidate_urls if url.rstrip('/') != website.rstrip('/')]
    if urls_to_check:
        # Fetch all URLs in parallel (up to 5 concurrent)
        async def fetch_url(url):
            """Fetch a single URL and return result with URL."""
            page_result = await reader.read_webpage(url)
            return url, page_result
        # Create tasks for parallel fetching; return_exceptions=True keeps
        # one failed fetch from cancelling the others.
        fetch_tasks = [fetch_url(url) for url in urls_to_check]
        fetch_results = await asyncio.gather(*fetch_tasks, return_exceptions=True)
        # Process results
        for fetch_result in fetch_results:
            if isinstance(fetch_result, Exception):
                # Fetch raised; silently skip this URL (not counted as checked).
                continue
            url, page_result = fetch_result
            result['pages_checked'] += 1
            if verbose:
                print(f" Checking: {url}")
            if not page_result['success']:
                if verbose:
                    print(f" Failed: {page_result.get('error', 'Unknown error')[:50]}")
                result['errors'].append(f"{url}: {page_result.get('error', 'Unknown')[:50]}")
                continue
            content = page_result.get('content', '')
            if not content or len(content) < 100:
                if verbose:
                    print(f" No content")
                continue
            result['pages_with_content'] += 1
            # Extract statements from content
            retrieved_on = page_result.get('retrieved_on', datetime.now(timezone.utc).isoformat())
            # Use LLM extraction if available, otherwise fall back to keyword-based
            if llm_extractor:
                statements = await extract_statements_with_llm(
                    llm_extractor, content, url, retrieved_on, ghcid
                )
                if verbose and statements:
                    print(f" [LLM] Found {len(statements)} statements")
            else:
                statements = extract_statements_from_content(content, url, retrieved_on, ghcid)
                if verbose and statements:
                    print(f" [Keyword] Found {len(statements)} statements")
            if statements:
                all_statements.extend(statements)
    result['statements_found'] = len(all_statements)
    # Helper function to get confidence from statement (handles nested provenance)
    # NOTE(review): duplicates get_stmt_confidence defined in the homepage
    # branch above; candidate for extraction into a module-level helper.
    def get_confidence(stmt):
        if 'provenance' in stmt and 'extraction_confidence' in stmt['provenance']:
            return stmt['provenance']['extraction_confidence']
        return stmt.get('extraction_confidence', 0)
    # Deduplicate statements by type (keep highest confidence)
    unique_statements = {}
    for stmt in all_statements:
        stype = stmt['statement_type']
        if stype not in unique_statements or get_confidence(stmt) > get_confidence(unique_statements[stype]):
            unique_statements[stype] = stmt
    final_statements = list(unique_statements.values())
    # Update YAML file
    if final_statements:
        if update_custodian_yaml(yaml_path, custodian_data, final_statements, dry_run):
            result['statements_added'] = len(final_statements)
    return result
async def main():
    """
    CLI entry point: parse arguments, build the web reader (and optional
    LLM extractor), select custodians, process them concurrently, and
    print a summary report.
    """
    parser = argparse.ArgumentParser(
        description='Batch extract mission statements from heritage custodian websites'
    )
    parser.add_argument(
        '--test', type=int, metavar='N',
        help='Test mode: process only N custodians'
    )
    parser.add_argument(
        '--province', type=str, metavar='PREFIX',
        help='Process custodians matching GHCID prefix (e.g., NL-NH for Noord-Holland)'
    )
    parser.add_argument(
        '--ghcid', type=str,
        help='Process a single custodian by GHCID'
    )
    parser.add_argument(
        '--all', action='store_true',
        help='Process all Dutch custodians with websites'
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Show detailed progress'
    )
    parser.add_argument(
        '--concurrency', type=int, default=3,
        help='Number of concurrent requests (default: 3)'
    )
    parser.add_argument(
        '--llm', action='store_true',
        help='Use LLM (Z.AI GLM) for intelligent extraction instead of keyword matching'
    )
    parser.add_argument(
        '--prefer-zai', action='store_true',
        help='Use Z.AI Web Reader as primary fetcher instead of Linkup (better for JS-heavy sites)'
    )
    parser.add_argument(
        '--skip-existing', action='store_true',
        help='Skip custodians that already have mission statements'
    )
    args = parser.parse_args()
    # Require at least one selection flag; otherwise show usage and exit.
    if not any([args.test, args.province, args.ghcid, args.all]):
        parser.print_help()
        print("\nExample usage:")
        print(" python scripts/batch_extract_mission_statements.py --test 5 --verbose")
        print(" python scripts/batch_extract_mission_statements.py --test 5 --llm --verbose # With LLM extraction")
        print(" python scripts/batch_extract_mission_statements.py --province NL --llm --prefer-zai # Use Z.AI Web Reader")
        print(" python scripts/batch_extract_mission_statements.py --province NL-NH --llm")
        print(" python scripts/batch_extract_mission_statements.py --ghcid NL-ZH-ZUI-M-LMT --llm")
        sys.exit(1)
    # Get API tokens (from environment; raises ValueError if none found)
    try:
        tokens = get_api_tokens()
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    # Initialize composite web reader with fallback capability
    # This tries Linkup first, then falls back to Z.AI Web Reader for failed requests
    # Significantly improves success rate for Japanese sites and JS-heavy pages
    linkup_key = tokens.get('linkup')
    zai_key = tokens.get('zai')
    if linkup_key and zai_key:
        prefer_zai = getattr(args, 'prefer_zai', False)
        reader = CompositeWebReader(linkup_key=linkup_key, zai_key=zai_key, prefer_zai=prefer_zai)
        if prefer_zai:
            print("Using Z.AI Web Reader as PRIMARY with Linkup fallback for web fetching")
        else:
            print("Using Linkup API with Z.AI Web Reader fallback for web fetching")
    elif linkup_key:
        reader = CompositeWebReader(linkup_key=linkup_key)
        print("Using Linkup API for web fetching (no fallback available)")
    elif zai_key:
        reader = CompositeWebReader(zai_key=zai_key, prefer_zai=True)
        print("Using Z.AI Web Reader API for web fetching")
    else:
        print("Error: No API token available (need LINKUP_API_KEY or ZAI_API_TOKEN)", file=sys.stderr)
        sys.exit(1)
    # Initialize LLM extractor if requested (requires the Z.AI token)
    llm_extractor = None
    if args.llm:
        if 'zai' not in tokens:
            print("Error: --llm requires ZAI_API_TOKEN for LLM extraction", file=sys.stderr)
            sys.exit(1)
        llm_extractor = GLMMissionExtractor(tokens['zai'])
        print(f"Using Z.AI GLM ({ZAI_GLM_MODEL}) for LLM-based extraction")
    # Find custodians to process
    if args.ghcid:
        # Single custodian mode
        custodian_dir = PROJECT_ROOT / "data" / "custodian"
        yaml_files = list(custodian_dir.glob(f"{args.ghcid}*.yaml"))
        if not yaml_files:
            print(f"Error: No custodian file found for GHCID {args.ghcid}", file=sys.stderr)
            sys.exit(1)
        yaml_path = yaml_files[0]
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # Find website using same logic as find_custodians_with_websites:
        # an ordered cascade over the known enrichment sources, first hit wins.
        website = None
        # 1. Direct website field
        if 'website' in data and data['website']:
            website = data['website']
        # 2. Original entry webadres_organisatie
        if not website and 'original_entry' in data:
            oe = data['original_entry']
            if isinstance(oe, dict) and oe.get('webadres_organisatie'):
                website = oe['webadres_organisatie']
        # 3. Museum register enrichment website_url
        if not website and 'museum_register_enrichment' in data:
            mre = data['museum_register_enrichment']
            if isinstance(mre, dict) and mre.get('website_url'):
                website = mre['website_url']
        # 4. Wikidata enrichment official_website
        if not website and 'wikidata_enrichment' in data:
            we = data['wikidata_enrichment']
            if isinstance(we, dict) and we.get('official_website'):
                website = we['official_website']
        # 5. Google Maps enrichment website
        if not website and 'google_maps_enrichment' in data:
            gm = data['google_maps_enrichment']
            if isinstance(gm, dict) and gm.get('website'):
                website = gm['website']
        # 6. Location object website
        if not website and 'location' in data:
            loc = data['location']
            if isinstance(loc, dict) and loc.get('website'):
                website = loc['website']
        # 7. Original entry identifiers (Website scheme)
        if not website and 'original_entry' in data:
            oe = data['original_entry']
            if isinstance(oe, dict) and 'identifiers' in oe:
                for ident in oe.get('identifiers', []):
                    if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
                        website = ident.get('identifier_value') or ident.get('identifier_url')
                        if website:
                            break
        # 8. Top-level identifiers array (Website scheme)
        if not website and 'identifiers' in data:
            for ident in data.get('identifiers', []):
                if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
                    website = ident.get('identifier_value') or ident.get('identifier_url')
                    if website:
                        break
        if not website or not website.startswith('http'):
            print(f"Error: No website found for {args.ghcid}", file=sys.stderr)
            sys.exit(1)
        custodians = [(yaml_path, data, website)]
    else:
        # Batch mode
        limit = args.test if args.test else None
        prefix = args.province if args.province else None
        print(f"Finding custodians with websites...")
        skip_existing = getattr(args, 'skip_existing', False)
        custodians = find_custodians_with_websites(prefix=prefix, limit=limit, skip_existing=skip_existing)
        if skip_existing:
            print(f" (skipping custodians with existing mission statements)")
        print(f"Found {len(custodians)} custodians with websites")
    if args.dry_run:
        print("\n[DRY RUN MODE - No changes will be made]\n")
    # Process custodians concurrently, bounded by --concurrency.
    results = []
    semaphore = asyncio.Semaphore(args.concurrency)
    async def process_with_semaphore(custodian_tuple):
        # Wrapper that limits how many custodians are processed at once.
        async with semaphore:
            yaml_path, data, website = custodian_tuple
            return await process_custodian(
                reader, yaml_path, data, website,
                dry_run=args.dry_run, verbose=args.verbose,
                llm_extractor=llm_extractor
            )
    # Process in batches
    tasks = [process_with_semaphore(c) for c in custodians]
    print(f"\nProcessing {len(tasks)} custodians...")
    for i, coro in enumerate(asyncio.as_completed(tasks), 1):
        result = await coro
        results.append(result)
        if not args.verbose:
            # Progress indicator
            if result['statements_added'] > 0:
                print(f"[{i}/{len(tasks)}] {result['ghcid']}: Added {result['statements_added']} statements")
            elif i % 10 == 0:
                print(f"[{i}/{len(tasks)}] Processing...")
    # Summary statistics
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    total_checked = sum(r['pages_checked'] for r in results)
    total_with_content = sum(r['pages_with_content'] for r in results)
    total_found = sum(r['statements_found'] for r in results)
    total_added = sum(r['statements_added'] for r in results)
    total_errors = sum(len(r['errors']) for r in results)
    custodians_with_statements = sum(1 for r in results if r['statements_added'] > 0)
    print(f"Custodians processed: {len(results)}")
    print(f"Pages checked: {total_checked}")
    print(f"Pages with content: {total_with_content}")
    print(f"Statements found: {total_found}")
    print(f"Statements added: {total_added}")
    print(f"Custodians updated: {custodians_with_statements}")
    print(f"Errors encountered: {total_errors}")
    # Show web reader statistics (if using CompositeWebReader)
    if hasattr(reader, 'get_stats'):
        stats = reader.get_stats()
        print(f"\nWeb Reader Statistics:")
        print(f" Total requests: {stats['total_requests']}")
        print(f" Linkup success: {stats['linkup_success']} ({stats.get('linkup_rate', 'N/A')})")
        print(f" Linkup failures (triggering fallback): {stats['linkup_fail']}")
        print(f" Z.AI fallback success: {stats['zai_fallback_success']}")
        print(f" Z.AI fallback failures: {stats['zai_fallback_fail']}")
        print(f" Overall success rate: {stats.get('overall_success_rate', 'N/A')}")
    # Show custodians that got statements
    if custodians_with_statements > 0:
        print(f"\nCustodians with new mission statements:")
        for r in results:
            if r['statements_added'] > 0:
                print(f" - {r['ghcid']}: {r['name']} ({r['statements_added']} statements)")
# Script entry point: run the async batch pipeline.
if __name__ == '__main__':
    asyncio.run(main())