2533 lines
100 KiB
Python
Executable file
2533 lines
100 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Batch extract mission statements from heritage custodian websites.
|
||
|
||
This script:
|
||
1. Finds Dutch custodians with websites
|
||
2. Discovers mission/vision/about pages
|
||
3. Uses Linkup API (primary) or Z.AI Web Reader (fallback) to fetch content
|
||
4. Creates LinkML-compliant mission_statement entries with full provenance
|
||
5. Updates custodian YAML files with extracted statements
|
||
|
||
Usage:
|
||
python scripts/batch_extract_mission_statements.py --test 5 # Test with 5 custodians
|
||
python scripts/batch_extract_mission_statements.py --province NL-NH # Noord-Holland only
|
||
python scripts/batch_extract_mission_statements.py --all # All Dutch custodians
|
||
python scripts/batch_extract_mission_statements.py --ghcid NL-ZH-ZUI-M-LMT # Single custodian
|
||
|
||
Requirements:
|
||
- httpx (pip install httpx)
|
||
- pyyaml
|
||
- LINKUP_API_KEY environment variable (primary)
|
||
- ZAI_API_TOKEN environment variable (fallback)
|
||
|
||
API Documentation:
|
||
- Linkup: https://docs.linkup.so/
|
||
- Z.AI: https://docs.z.ai/devpack/mcp/reader-mcp-server
|
||
"""
|
||
|
||
import argparse
|
||
import asyncio
|
||
import base64
|
||
import hashlib
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import uuid
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Optional, Any, Union
|
||
from urllib.parse import urljoin, urlparse, quote
|
||
|
||
import httpx
|
||
import yaml
|
||
|
||
# Playwright is an optional dependency used as a rendering fallback for
# JavaScript-heavy sites; degrade gracefully when it is not installed.
PLAYWRIGHT_AVAILABLE = False
try:
    from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
except ImportError:
    print("Note: Playwright not available. Install with 'pip install playwright && playwright install chromium' for better JP site support.", file=sys.stderr)
else:
    PLAYWRIGHT_AVAILABLE = True
|
||
|
||
# Z.AI GLM API configuration (per Rule 11 in AGENTS.md)
ZAI_GLM_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_GLM_MODEL = "glm-4.7"  # Latest model with best quality

# Add project root to path so sibling project modules can be imported when
# this script is run directly from scripts/.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# API configurations
LINKUP_API_URL = "https://api.linkup.so/v1/fetch"  # Linkup fetch endpoint (primary reader)
ZAI_MCP_URL = "https://api.z.ai/api/mcp/web_reader/mcp"  # Z.AI Web Reader MCP endpoint (fallback)
|
||
|
||
# Common mission page URL patterns for Dutch heritage institutions
# Ordered by likelihood of success (most common patterns first)
DUTCH_MISSION_PATTERNS = [
    "/over-ons",             # Most common Dutch pattern
    "/missie",               # Direct mission page
    "/over",                 # Short version
    "/missie-en-visie",      # Combined mission/vision
    "/organisatie",          # Organization page often has mission
    "/about",                # English fallback
    "/visie",                # Vision page
    "/over-ons/missie",      # Nested mission page
    "/onze-missie",          # "Our mission"
    "/over/missie",
    "/organisatie/missie",
    "/het-museum/missie",
    "/het-museum/missie-en-visie",
    "/museum/missie",
    "/about/mission",
    "/wie-zijn-wij",
    "/about-us",
]

# Extended patterns for Dutch museum websites (discovered through testing)
DUTCH_MISSION_EXTENDED_PATTERNS = [
    # NOTE(review): "muzeeum" spelling below looks intentional (matches a
    # discovered site path) — confirm before "fixing".
    "/het-muzeeum-organisatie/missie-visie",
    "/het-museum-organisatie/missie-visie",
    "/organisatie/missie-visie",
    "/over-het-museum/missie",
    "/over-het-museum/missie-en-visie",
    "/info/missie",
    "/info/over-ons",
    "/stichting/missie",
    "/museum/over-ons",
    "/museum/organisatie",
]

# Spanish mission page patterns (for Latin America)
SPANISH_MISSION_PATTERNS = [
    "/sobre-nosotros",       # About us
    "/quienes-somos",        # Who we are
    "/mision",               # Mission
    "/mision-y-vision",      # Mission and vision
    "/institucional",        # Institutional
    "/historia",             # History often contains mission
    "/el-museo",             # The museum
    "/acerca-de",            # About
    "/nuestra-mision",       # Our mission
    "/conocenos",            # Get to know us
    "/institucion",          # Institution
    "/nosotros",             # Us
    "/about",                # English fallback
    "/about-us",
]

# Portuguese mission page patterns (for Brazil, Portugal)
PORTUGUESE_MISSION_PATTERNS = [
    "/sobre",                # About
    "/sobre-nos",            # About us
    "/quem-somos",           # Who we are
    "/missao",               # Mission
    "/missao-e-visao",       # Mission and vision
    "/institucional",        # Institutional
    "/historia",             # History
    "/o-museu",              # The museum
    "/a-biblioteca",         # The library
    "/conheca",              # Get to know
    "/nossa-missao",         # Our mission
    "/about",                # English fallback
]

# German mission page patterns
GERMAN_MISSION_PATTERNS = [
    "/ueber-uns",            # About us
    "/uber-uns",             # Without umlaut
    "/leitbild",             # Mission statement
    "/mission",              # Mission
    "/das-museum",           # The museum
    "/institution",          # Institution
    "/wir-ueber-uns",        # We about us
    "/about",                # English fallback
]

# French mission page patterns
FRENCH_MISSION_PATTERNS = [
    "/a-propos",             # About
    "/qui-sommes-nous",      # Who are we
    "/mission",              # Mission
    "/notre-mission",        # Our mission
    "/le-musee",             # The museum
    "/presentation",         # Presentation
    "/historique",           # Historical
    "/about",                # English fallback
]

# English mission page patterns (international fallback)
ENGLISH_MISSION_PATTERNS = [
    "/about",
    "/about-us",
    "/mission",
    "/our-mission",
    "/mission-vision",
    "/mission-and-vision",
    "/who-we-are",
    "/the-museum",
    "/the-library",
    "/the-archive",
    "/history",
    "/institutional",
]

# Japanese mission page patterns (for JP custodians - 12,096 files)
JAPANESE_MISSION_PATTERNS = [
    "/about",                # English pattern (commonly used in Japan)
    "/about-us",             # English pattern
    "/introduction",         # Introduction
    "/outline",              # Organization outline (概要)
    "/gaiyo",                # 概要 (romanized)
    "/gaiyou",               # Alternative romanization
    "/rinen",                # 理念 (philosophy)
    "/mission",              # Mission (English)
    "/message",              # Message from director
    "/greeting",             # Greeting (挨拶)
    "/aisatsu",              # 挨拶 (romanized)
    "/history",              # History
    "/enkaku",               # 沿革 (history, romanized)
    "/profile",              # Profile
    "/info",                 # Information
    "/organization",         # Organization
]

# Czech mission page patterns (for CZ custodians - 8,432 files)
CZECH_MISSION_PATTERNS = [
    "/o-nas",                # About us
    "/o-knihovne",           # About the library
    "/o-muzeu",              # About the museum
    "/o-archivu",            # About the archive
    "/poslani",              # Mission
    "/poslani-a-vize",       # Mission and vision
    "/historie",             # History
    "/informace",            # Information
    "/zakladni-informace",   # Basic information
    "/profil",               # Profile
    "/about",                # English fallback
]

# Italian mission page patterns
ITALIAN_MISSION_PATTERNS = [
    "/chi-siamo",            # Who we are
    "/la-missione",          # The mission
    "/missione",             # Mission
    "/storia",               # History
    "/il-museo",             # The museum
    "/la-biblioteca",        # The library
    "/presentazione",        # Presentation
    "/about",                # English fallback
]

# Combined patterns - use all languages for maximum coverage.
# NOTE: concatenation keeps duplicates (e.g. "/about" appears in several
# lists); consumers probe URLs in order, so earlier languages take priority.
ALL_MISSION_PATTERNS = (
    DUTCH_MISSION_PATTERNS +
    SPANISH_MISSION_PATTERNS +
    PORTUGUESE_MISSION_PATTERNS +
    GERMAN_MISSION_PATTERNS +
    FRENCH_MISSION_PATTERNS +
    ENGLISH_MISSION_PATTERNS +
    JAPANESE_MISSION_PATTERNS +
    CZECH_MISSION_PATTERNS +
    ITALIAN_MISSION_PATTERNS
)

# Language-specific URL patterns mapping (ISO 639-1 code -> ordered patterns)
LANGUAGE_URL_PATTERNS = {
    'nl': DUTCH_MISSION_PATTERNS + DUTCH_MISSION_EXTENDED_PATTERNS,
    'es': SPANISH_MISSION_PATTERNS,
    'pt': PORTUGUESE_MISSION_PATTERNS,
    'de': GERMAN_MISSION_PATTERNS,
    'fr': FRENCH_MISSION_PATTERNS,
    'en': ENGLISH_MISSION_PATTERNS,
    'ja': JAPANESE_MISSION_PATTERNS,
    'cs': CZECH_MISSION_PATTERNS,
    'it': ITALIAN_MISSION_PATTERNS,
}

# Keywords indicating mission/vision content (multilingual, lowercase)
MISSION_KEYWORDS = {
    'mission': ['missie', 'mission', 'opdracht', 'kerntaak', 'misión', 'missão', 'leitbild'],
    'vision': ['visie', 'vision', 'toekomst', 'ambitie', 'visión', 'visão'],
    'goal': ['doelstelling', 'doel', 'doelen', 'goal', 'objective', 'objectives', 'ambitie',
             'objetivo', 'objetivos', 'ziel', 'ziele'],
    'value': ['waarde', 'waarden', 'kernwaarden', 'value', 'values', 'principle',
              'valor', 'valores', 'wert', 'werte'],
    'motto': ['motto', 'slogan', 'slagzin', 'lema'],
}
|
||
|
||
# ISO 3166-1 alpha-2 country code -> ISO 639-1 language code.
# Maps each country to its primary/official language; unknown countries
# fall back to English in get_language_from_ghcid().
COUNTRY_TO_LANGUAGE = {
    # Dutch-speaking
    'NL': 'nl', 'BE': 'nl', 'SR': 'nl', 'AW': 'nl', 'CW': 'nl', 'SX': 'nl',
    # Spanish-speaking
    'AR': 'es', 'BO': 'es', 'CL': 'es', 'CO': 'es', 'CR': 'es', 'CU': 'es',
    'DO': 'es', 'EC': 'es', 'SV': 'es', 'GT': 'es', 'HN': 'es', 'MX': 'es',
    'NI': 'es', 'PA': 'es', 'PY': 'es', 'PE': 'es', 'PR': 'es', 'ES': 'es',
    'UY': 'es', 'VE': 'es', 'GQ': 'es',
    # Portuguese-speaking
    'BR': 'pt', 'PT': 'pt', 'AO': 'pt', 'MZ': 'pt', 'CV': 'pt', 'GW': 'pt',
    'ST': 'pt', 'TL': 'pt',
    # German-speaking
    'DE': 'de', 'AT': 'de', 'CH': 'de', 'LI': 'de', 'LU': 'de',
    # French-speaking
    'FR': 'fr', 'MC': 'fr', 'SN': 'fr', 'CI': 'fr', 'ML': 'fr', 'BF': 'fr',
    'NE': 'fr', 'TG': 'fr', 'BJ': 'fr', 'GA': 'fr', 'CG': 'fr', 'CD': 'fr',
    'MG': 'fr', 'HT': 'fr', 'RE': 'fr', 'MQ': 'fr', 'GP': 'fr', 'GF': 'fr',
    'NC': 'fr', 'PF': 'fr',
    # Italian-speaking
    'IT': 'it', 'SM': 'it', 'VA': 'it',
    # English-speaking (default)
    'US': 'en', 'GB': 'en', 'AU': 'en', 'NZ': 'en', 'CA': 'en', 'IE': 'en',
    'ZA': 'en', 'JM': 'en', 'TT': 'en', 'BB': 'en', 'GH': 'en', 'NG': 'en',
    'KE': 'en', 'UG': 'en', 'TZ': 'en', 'ZW': 'en', 'BW': 'en', 'MW': 'en',
    'ZM': 'en', 'PH': 'en', 'SG': 'en', 'MY': 'en', 'IN': 'en', 'PK': 'en',
    # Japanese
    'JP': 'ja',
    # Chinese
    'CN': 'zh', 'TW': 'zh', 'HK': 'zh', 'MO': 'zh',
    # Korean
    'KR': 'ko', 'KP': 'ko',
    # Russian
    'RU': 'ru', 'BY': 'ru', 'KZ': 'ru', 'KG': 'ru', 'TJ': 'ru',
    # Arabic
    'SA': 'ar', 'AE': 'ar', 'QA': 'ar', 'KW': 'ar', 'BH': 'ar', 'OM': 'ar',
    'YE': 'ar', 'JO': 'ar', 'SY': 'ar', 'LB': 'ar', 'IQ': 'ar', 'EG': 'ar',
    'LY': 'ar', 'TN': 'ar', 'DZ': 'ar', 'MA': 'ar', 'SD': 'ar', 'MR': 'ar',
    # Other
    'CZ': 'cs', 'SK': 'sk', 'PL': 'pl', 'HU': 'hu', 'RO': 'ro', 'BG': 'bg',
    'HR': 'hr', 'RS': 'sr', 'SI': 'sl', 'GR': 'el', 'TR': 'tr', 'IL': 'he',
    'TH': 'th', 'VN': 'vi', 'ID': 'id', 'SE': 'sv', 'NO': 'no', 'DK': 'da',
    'FI': 'fi', 'IS': 'is', 'EE': 'et', 'LV': 'lv', 'LT': 'lt', 'UA': 'uk',
}


def get_language_from_ghcid(ghcid: str) -> str:
    """Map a GHCID's two-letter country prefix to its ISO 639-1 language code.

    Args:
        ghcid: GHCID string (e.g., "AR-C-BUE-M-MAD").

    Returns:
        ISO 639-1 language code (e.g., "es" for Argentina). Falls back to
        "en" when the prefix is missing, too short, or not in the table.
    """
    prefix = (ghcid or "")[:2].upper()
    if len(prefix) < 2:
        return 'en'
    return COUNTRY_TO_LANGUAGE.get(prefix, 'en')
|
||
|
||
|
||
def compute_content_hash(text: str) -> str:
    """Return the SHA-256 hash of *text* in SRI format ("sha256-<base64>")."""
    digest = hashlib.sha256(text.encode('utf-8')).digest()
    return "sha256-" + base64.b64encode(digest).decode('ascii')
|
||
|
||
|
||
def get_api_tokens() -> dict:
    """Get API tokens from the environment or the project's .env file.

    Lookup order per token: the process environment wins; otherwise the
    value is read from a ``KEY=value`` line in PROJECT_ROOT/.env (with
    surrounding single/double quotes stripped). Empty values count as
    missing.

    Returns:
        dict with 'linkup' and/or 'zai' keys containing API tokens.

    Raises:
        ValueError: if neither token can be found.
    """
    # Try environment variables first
    linkup_token = os.environ.get('LINKUP_API_KEY')
    zai_token = os.environ.get('ZAI_API_TOKEN')

    # Try loading from .env file if not in environment
    env_path = PROJECT_ROOT / '.env'
    if env_path.exists():
        # Explicit UTF-8: .env files may contain non-ASCII characters and
        # the platform default encoding is unreliable (e.g. cp1252 on Windows).
        with open(env_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('LINKUP_API_KEY=') and not linkup_token:
                    linkup_token = line.split('=', 1)[1].strip().strip('"\'')
                elif line.startswith('ZAI_API_TOKEN=') and not zai_token:
                    zai_token = line.split('=', 1)[1].strip().strip('"\'')

    tokens = {}
    if linkup_token:
        tokens['linkup'] = linkup_token
    if zai_token:
        tokens['zai'] = zai_token

    if not tokens:
        raise ValueError(
            "No API tokens found. Set LINKUP_API_KEY or ZAI_API_TOKEN environment variable."
        )

    return tokens
|
||
|
||
|
||
class LinkupWebReader:
    """
    Client for the Linkup API - simple and reliable web fetching.

    Reference: https://docs.linkup.so/
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """
        Read webpage content using Linkup API.

        Returns:
            dict with keys: content, success, error, url, retrieved_on
        """

        def failed(reason: str) -> dict:
            # Uniform error envelope shared by all failure paths.
            return {"success": False, "url": url, "error": reason}

        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                response = await client.post(
                    LINKUP_API_URL,
                    headers=self.headers,
                    json={"url": url},
                )

                if response.status_code != 200:
                    return failed(f"HTTP {response.status_code}: {response.text[:200]}")

                payload = response.json()

                # Linkup returns markdown content directly
                content = payload.get("markdown", payload.get("content", ""))
                if not content:
                    return failed("No content returned")

                return {
                    "success": True,
                    "url": url,
                    "content": content,
                    "retrieved_on": datetime.now(timezone.utc).isoformat(),
                }

            except httpx.TimeoutException:
                return failed("Request timed out")
            except Exception as e:
                return failed(str(e))
|
||
|
||
|
||
class ZAIWebReader:
    """
    Client for Z.AI Web Reader MCP API using Streamable HTTP transport.

    The MCP protocol requires:
    1. Initialize session
    2. Send notifications/initialized
    3. Call tools

    Reference: https://docs.z.ai/devpack/mcp/reader-mcp-server
    """

    def __init__(self, api_token: str):
        # Bearer token for the Z.AI API.
        self.api_token = api_token
        # MCP session id; populated from the server's "mcp-session-id"
        # response header on the first request (see _send_request) and
        # echoed back on all subsequent requests.
        self.session_id: Optional[str] = None
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream",  # Required for MCP Streamable HTTP
        }

    def _parse_sse_response(self, text: str) -> dict:
        """Parse Server-Sent Events (SSE) response format from MCP API.

        SSE format:
        id:1
        event:message
        data:{"jsonrpc":"2.0",...}

        Returns the parsed JSON from the last parseable 'data:' line, or an
        empty dict if no line parses (non-JSON data lines are skipped).
        """
        result = {}
        for line in text.strip().split('\n'):
            if line.startswith('data:'):
                data_content = line[5:].strip()
                if data_content:
                    try:
                        result = json.loads(data_content)
                    except json.JSONDecodeError:
                        # Ignore non-JSON data lines (e.g. keep-alives).
                        pass
        return result

    async def _send_request(self, client: httpx.AsyncClient, method: str, params: Optional[dict] = None, request_id: int = 1) -> dict:
        """Send a JSON-RPC request to the MCP server and parse SSE response.

        Args:
            client: open httpx client to reuse across the session.
            method: JSON-RPC method name (e.g. "initialize", "tools/call").
            params: optional JSON-RPC params object.
            request_id: JSON-RPC request id.

        Returns dict with keys:
        - success: bool
        - status_code: int
        - data: parsed JSON-RPC result (if success)
        - error: error message (if not success)
        """
        request_body = {
            "jsonrpc": "2.0",
            "method": method,
            "id": request_id
        }
        if params:
            request_body["params"] = params

        # Add session header if we have one
        headers = self.headers.copy()
        if self.session_id:
            headers["mcp-session-id"] = self.session_id

        response = await client.post(ZAI_MCP_URL, headers=headers, json=request_body)

        # Check for session ID in response headers; the server assigns it on
        # the first request and expects it echoed back afterwards.
        if "mcp-session-id" in response.headers:
            self.session_id = response.headers["mcp-session-id"]

        if response.status_code != 200:
            return {
                "success": False,
                "status_code": response.status_code,
                "error": f"HTTP {response.status_code}: {response.text[:200]}"
            }

        # Parse SSE response
        parsed = self._parse_sse_response(response.text)
        if not parsed:
            return {
                "success": False,
                "status_code": response.status_code,
                "error": f"Failed to parse SSE response: {response.text[:200]}"
            }

        return {
            "success": True,
            "status_code": response.status_code,
            "data": parsed
        }

    async def initialize(self, client: httpx.AsyncClient) -> bool:
        """Initialize the MCP session; returns True on success.

        Performs the MCP handshake: an "initialize" request followed by a
        "notifications/initialized" message. Errors are logged to stderr and
        reported as False rather than raised.
        """
        try:
            response = await self._send_request(
                client,
                "initialize",
                {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {
                        "name": "glam-mission-extractor",
                        "version": "1.0.0"
                    }
                },
                request_id=1
            )

            if response.get("success"):
                # Send initialized notification to complete the handshake.
                await self._send_request(client, "notifications/initialized", {}, request_id=2)
                return True
            return False
        except Exception as e:
            print(f"Initialize error: {e}", file=sys.stderr)
            return False

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """
        Read webpage content using Z.AI Web Reader.

        Initializes the MCP session on first use, then invokes the
        "webReader" tool and normalizes the several response shapes the
        server may return into a flat result dict.

        Returns:
            dict with keys: title, content, metadata, links, success, error
        """
        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                # Initialize session first (only once per reader instance).
                if not self.session_id:
                    await self.initialize(client)

                # Call webReader tool
                response = await self._send_request(
                    client,
                    "tools/call",
                    {
                        "name": "webReader",
                        "arguments": {
                            "url": url
                        }
                    },
                    request_id=3
                )

                if not response.get("success"):
                    return {
                        "success": False,
                        "url": url,
                        "error": response.get("error", "Unknown error"),
                    }

                result = response.get("data", {})

                # Parse MCP response
                if "result" in result:
                    content_data = result["result"]

                    # Extract content from MCP response format
                    if isinstance(content_data, dict):
                        # Check for content array (MCP tools/call response format):
                        # a list of typed blocks, from which we join the text ones.
                        if "content" in content_data and isinstance(content_data["content"], list):
                            text_parts = []
                            for item in content_data["content"]:
                                if isinstance(item, dict) and item.get("type") == "text":
                                    text_parts.append(item.get("text", ""))
                            content_text = "\n".join(text_parts)
                        else:
                            # Flat dict shape: content/text field holds the page text.
                            content_text = content_data.get("content", content_data.get("text", ""))

                        return {
                            "success": True,
                            "url": url,
                            "title": content_data.get("title", ""),
                            "content": content_text,
                            "metadata": content_data.get("metadata", {}),
                            "links": content_data.get("links", []),
                            "retrieved_on": datetime.now(timezone.utc).isoformat(),
                        }
                    elif isinstance(content_data, list) and len(content_data) > 0:
                        # Array of content blocks (dicts with text, or raw strings).
                        text_content = ""
                        for block in content_data:
                            if isinstance(block, dict):
                                if block.get("type") == "text":
                                    text_content += block.get("text", "") + "\n"
                                elif "text" in block:
                                    text_content += block["text"] + "\n"
                            elif isinstance(block, str):
                                text_content += block + "\n"
                        return {
                            "success": True,
                            "url": url,
                            "content": text_content.strip(),
                            "retrieved_on": datetime.now(timezone.utc).isoformat(),
                        }

                # Check for error in response (JSON-RPC error object).
                if "error" in result:
                    return {
                        "success": False,
                        "url": url,
                        "error": f"MCP error: {result['error']}",
                    }

                return {
                    "success": False,
                    "url": url,
                    "error": f"Unexpected response format: {str(result)[:200]}",
                }

            except httpx.HTTPStatusError as e:
                return {
                    "success": False,
                    "url": url,
                    "error": f"HTTP {e.response.status_code}: {e.response.text[:200]}",
                }
            except Exception as e:
                return {
                    "success": False,
                    "url": url,
                    "error": str(e),
                }
|
||
|
||
|
||
class CompositeWebReader:
    """
    Web reader that combines Linkup and Z.AI backends with automatic fallback.

    Default mode (prefer_zai=False):
    1. Try Linkup first (faster, more reliable for most sites)
    2. On a retriable failure (HTTP 400 / FETCH_ERROR / timeout / empty
       content), fall back to Z.AI Web Reader

    prefer_zai=True:
    1. Try Z.AI Web Reader first (headless browser rendering, better for
       JS-heavy sites)
    2. On a retriable failure, fall back to Linkup

    The fallback significantly improves success rates for Japanese (.jp)
    sites that block simple HTTP requests, sites with anti-bot protection,
    and JavaScript-heavy single-page applications.
    """

    def __init__(self, linkup_key: Optional[str] = None, zai_key: Optional[str] = None, prefer_zai: bool = False):
        self.linkup_reader = LinkupWebReader(linkup_key) if linkup_key else None
        self.zai_reader = ZAIWebReader(zai_key) if zai_key else None
        self.prefer_zai = prefer_zai
        # Per-backend counters, split by primary vs fallback role.
        self.stats = {
            'linkup_success': 0,
            'linkup_fail': 0,
            'zai_success': 0,
            'zai_fail': 0,
            'zai_fallback_success': 0,
            'zai_fallback_fail': 0,
            'linkup_fallback_success': 0,
            'linkup_fallback_fail': 0,
            'total_requests': 0,
        }

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """
        Read a webpage, falling back to the secondary backend on retriable errors.

        Returns:
            dict with keys: content, success, error, url, retrieved_on, reader_used
        """
        self.stats['total_requests'] += 1

        # --- Z.AI-first mode --------------------------------------------
        if self.prefer_zai and self.zai_reader:
            primary = await self.zai_reader.read_webpage(url, timeout=timeout)
            if primary.get('success'):
                self.stats['zai_success'] += 1
                primary['reader_used'] = 'zai-primary'
                return primary

            err = primary.get('error', '')
            lowered = err.lower()
            retriable = (
                'timed out' in lowered
                or 'timeout' in lowered
                or 'No content returned' in err
                or 'Failed' in err
            )

            if not (retriable and self.linkup_reader):
                # Non-retriable error, or no Linkup reader to fall back to.
                self.stats['zai_fail'] += 1
                primary['reader_used'] = 'zai-primary-failed'
                return primary

            self.stats['zai_fail'] += 1
            secondary = await self.linkup_reader.read_webpage(url, timeout=timeout)
            if secondary.get('success'):
                self.stats['linkup_fallback_success'] += 1
                secondary['reader_used'] = 'linkup-fallback'
            else:
                self.stats['linkup_fallback_fail'] += 1
                secondary['reader_used'] = 'linkup-fallback-failed'
            return secondary

        # --- Linkup-first mode (default) --------------------------------
        if self.linkup_reader:
            primary = await self.linkup_reader.read_webpage(url, timeout=timeout)
            if primary.get('success'):
                self.stats['linkup_success'] += 1
                primary['reader_used'] = 'linkup'
                return primary

            # Retriable errors: HTTP 400, FETCH_ERROR, timeouts, empty content.
            err = primary.get('error', '')
            lowered = err.lower()
            retriable = (
                'HTTP 400' in err
                or 'FETCH_ERROR' in err
                or 'timed out' in lowered
                or 'timeout' in lowered
                or 'No content returned' in err
            )

            if not (retriable and self.zai_reader):
                # Non-retriable error or no fallback available.
                self.stats['linkup_fail'] += 1
                primary['reader_used'] = 'linkup-failed'
                return primary

            self.stats['linkup_fail'] += 1
            secondary = await self.zai_reader.read_webpage(url, timeout=timeout)
            if secondary.get('success'):
                self.stats['zai_fallback_success'] += 1
                secondary['reader_used'] = 'zai-fallback'
            else:
                self.stats['zai_fallback_fail'] += 1
                # Return the Z.AI error (more informative for debugging).
                secondary['reader_used'] = 'zai-fallback-failed'
            return secondary

        # --- Z.AI only ---------------------------------------------------
        if self.zai_reader:
            outcome = await self.zai_reader.read_webpage(url, timeout=timeout)
            outcome['reader_used'] = 'zai'
            counter = 'zai_success' if outcome.get('success') else 'zai_fail'
            self.stats[counter] += 1
            return outcome

        # --- No backend configured ---------------------------------------
        return {
            'success': False,
            'url': url,
            'error': 'No web reader available',
            'reader_used': 'none',
        }

    def get_stats(self) -> dict:
        """Return a copy of the counters augmented with percentage rates."""
        report = self.stats.copy()
        total = report['total_requests']
        if total > 0:

            def pct(count: int) -> str:
                # Format a counter as a percentage of all requests.
                return f"{count / total * 100:.1f}%"

            succeeded = (
                report['linkup_success']
                + report['zai_success']
                + report['zai_fallback_success']
                + report['linkup_fallback_success']
            )

            if self.prefer_zai:
                report['primary_reader'] = 'Z.AI Web Reader'
                report['zai_rate'] = pct(report['zai_success'] + report['zai_fail'])
                report['fallback_rate'] = pct(
                    report['linkup_fallback_success'] + report['linkup_fallback_fail']
                )
            else:
                report['primary_reader'] = 'Linkup'
                report['linkup_rate'] = pct(report['linkup_success'] + report['linkup_fail'])
                report['fallback_rate'] = pct(
                    report['zai_fallback_success'] + report['zai_fallback_fail']
                )

            report['overall_success_rate'] = pct(succeeded)
        return report
|
||
|
||
|
||
class GLMMissionExtractor:
    """
    LLM-based mission statement extractor using Z.AI GLM API.

    This provides intelligent extraction of mission, vision, and goal statements
    from webpage content, replacing naive keyword matching with semantic understanding.

    Uses Z.AI Coding Plan endpoint per Rule 11 in AGENTS.md.
    Implements Rule 36: Original Language Preservation - NO TRANSLATION.
    """

    # Language-specific prompt templates (Rule 36: Original Language Preservation)
    # Each prompt explicitly instructs to NOT translate and preserve original language
||
EXTRACTION_PROMPT_NL = """Je bent een expert in het analyseren van websites van erfgoedinstellingen (musea, archieven, bibliotheken, etc.).
|
||
|
||
## KRITIEK - NIET VERTALEN:
|
||
Extraheer de tekst EXACT zoals deze op de webpagina staat.
|
||
VERTAAL NIET naar een andere taal. Behoud de originele tekst in de originele taal.
|
||
Als de bron in het Nederlands is, moet de output in het Nederlands zijn.
|
||
Als de bron in het Engels is, moet de output in het Engels zijn (niet vertalen naar Nederlands).
|
||
|
||
Analyseer de volgende webpagina-inhoud en extraheer de missie, visie en/of doelstellingen van de organisatie.
|
||
|
||
## Instructies:
|
||
1. Zoek naar expliciete missie- of visie-statements
|
||
2. Let op zinnen die beginnen met "Onze missie is...", "Wij streven naar...", "Het museum heeft als doel...", etc.
|
||
3. Negeer navigatie-elementen, footer-tekst, contactgegevens, openingstijden
|
||
4. Negeer advertenties, nieuwsberichten, en evenement-aankondigingen
|
||
5. Als er GEEN duidelijke missie/visie/doelstelling te vinden is, retourneer een leeg resultaat
|
||
6. KOPIEER de tekst letterlijk - NIET PARAFRASEREN of VERTALEN
|
||
|
||
## Output Format (JSON):
|
||
Retourneer ALLEEN een JSON object in dit exacte formaat:
|
||
```json
|
||
{{
|
||
"mission": "De originele missie-tekst hier (NIET VERTAALD), of null als niet gevonden",
|
||
"vision": "De originele visie-tekst hier (NIET VERTAALD), of null als niet gevonden",
|
||
"goals": "De originele doelstellingen hier (NIET VERTAALD), of null als niet gevonden",
|
||
"confidence": 0.85,
|
||
"source_section": "Naam van de sectie waar dit gevonden is (bijv. 'Over ons', 'Missie en Visie')",
|
||
"detected_language": "nl"
|
||
}}
|
||
```
|
||
|
||
## Webpagina inhoud:
|
||
{content}
|
||
|
||
## Let op:
|
||
- Retourneer ALLEEN het JSON object, geen andere tekst
|
||
- Confidence moet tussen 0.0 en 1.0 zijn
|
||
- NOOIT VERTALEN - behoud originele taal
|
||
- Als niets gevonden: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
||
"""
|
||
|
||
EXTRACTION_PROMPT_ES = """Eres un experto en analizar sitios web de instituciones patrimoniales (museos, archivos, bibliotecas, etc.).
|
||
|
||
## CRITICO - NO TRADUCIR:
|
||
Extrae el texto EXACTAMENTE como aparece en la pagina web.
|
||
NO TRADUZCAS a otro idioma. Preserva el texto original en su idioma original.
|
||
Si la fuente esta en espanol, la salida debe estar en espanol.
|
||
Si la fuente esta en ingles, la salida debe estar en ingles (no traducir al espanol).
|
||
|
||
Analiza el siguiente contenido de la pagina web y extrae la mision, vision y/o objetivos de la organizacion.
|
||
|
||
## Instrucciones:
|
||
1. Busca declaraciones explicitas de mision o vision
|
||
2. Presta atencion a frases como "Nuestra mision es...", "Tenemos como objetivo...", "El museo busca...", etc.
|
||
3. Ignora elementos de navegacion, texto de pie de pagina, informacion de contacto, horarios
|
||
4. Ignora anuncios, noticias y anuncios de eventos
|
||
5. Si NO hay una mision/vision/objetivo claro, devuelve un resultado vacio
|
||
6. COPIA el texto literalmente - NO PARAFRASEAR ni TRADUCIR
|
||
|
||
## Formato de salida (JSON):
|
||
Devuelve SOLO un objeto JSON en este formato exacto:
|
||
```json
|
||
{{
|
||
"mission": "El texto original de la mision aqui (SIN TRADUCIR), o null si no se encuentra",
|
||
"vision": "El texto original de la vision aqui (SIN TRADUCIR), o null si no se encuentra",
|
||
"goals": "Los objetivos originales aqui (SIN TRADUCIR), o null si no se encuentran",
|
||
"confidence": 0.85,
|
||
"source_section": "Nombre de la seccion donde se encontro (ej. 'Sobre nosotros', 'Mision y Vision')",
|
||
"detected_language": "es"
|
||
}}
|
||
```
|
||
|
||
## Contenido de la pagina web:
|
||
{content}
|
||
|
||
## Nota:
|
||
- Devuelve SOLO el objeto JSON, sin otro texto
|
||
- La confianza debe estar entre 0.0 y 1.0
|
||
- NUNCA TRADUCIR - preservar idioma original
|
||
- Si no se encuentra nada: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
||
"""
|
||
|
||
EXTRACTION_PROMPT_PT = """Voce e um especialista em analisar sites de instituicoes patrimoniais (museus, arquivos, bibliotecas, etc.).
|
||
|
||
## CRITICO - NAO TRADUZIR:
|
||
Extraia o texto EXATAMENTE como aparece na pagina web.
|
||
NAO TRADUZA para outro idioma. Preserve o texto original em seu idioma original.
|
||
Se a fonte esta em portugues, a saida deve estar em portugues.
|
||
Se a fonte esta em ingles, a saida deve estar em ingles (nao traduzir para portugues).
|
||
|
||
Analise o seguinte conteudo da pagina web e extraia a missao, visao e/ou objetivos da organizacao.
|
||
|
||
## Instrucoes:
|
||
1. Procure declaracoes explicitas de missao ou visao
|
||
2. Preste atencao a frases como "Nossa missao e...", "Temos como objetivo...", "O museu busca...", etc.
|
||
3. Ignore elementos de navegacao, texto de rodape, informacoes de contato, horarios
|
||
4. Ignore anuncios, noticias e anuncios de eventos
|
||
5. Se NAO houver uma missao/visao/objetivo claro, retorne um resultado vazio
|
||
6. COPIE o texto literalmente - NAO PARAFRASEAR nem TRADUZIR
|
||
|
||
## Formato de saida (JSON):
|
||
Retorne APENAS um objeto JSON neste formato exato:
|
||
```json
|
||
{{
|
||
"mission": "O texto original da missao aqui (SEM TRADUZIR), ou null se nao encontrado",
|
||
"vision": "O texto original da visao aqui (SEM TRADUZIR), ou null se nao encontrado",
|
||
"goals": "Os objetivos originais aqui (SEM TRADUZIR), ou null se nao encontrados",
|
||
"confidence": 0.85,
|
||
"source_section": "Nome da secao onde foi encontrado (ex. 'Sobre nos', 'Missao e Visao')",
|
||
"detected_language": "pt"
|
||
}}
|
||
```
|
||
|
||
## Conteudo da pagina web:
|
||
{content}
|
||
|
||
## Nota:
|
||
- Retorne APENAS o objeto JSON, sem outro texto
|
||
- A confianca deve estar entre 0.0 e 1.0
|
||
- NUNCA TRADUZIR - preservar idioma original
|
||
- Se nada encontrado: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
||
"""
|
||
|
||
EXTRACTION_PROMPT_DE = """Sie sind ein Experte fur die Analyse von Websites von Kulturerbe-Institutionen (Museen, Archive, Bibliotheken, etc.).
|
||
|
||
## KRITISCH - NICHT UBERSETZEN:
|
||
Extrahieren Sie den Text GENAU so, wie er auf der Webseite erscheint.
|
||
NICHT in eine andere Sprache UBERSETZEN. Bewahren Sie den Originaltext in seiner Originalsprache.
|
||
Wenn die Quelle auf Deutsch ist, muss die Ausgabe auf Deutsch sein.
|
||
Wenn die Quelle auf Englisch ist, muss die Ausgabe auf Englisch sein (nicht ins Deutsche ubersetzen).
|
||
|
||
Analysieren Sie den folgenden Webseiteninhalt und extrahieren Sie die Mission, Vision und/oder Ziele der Organisation.
|
||
|
||
## Anweisungen:
|
||
1. Suchen Sie nach expliziten Missions- oder Visionserklarungen
|
||
2. Achten Sie auf Satze wie "Unsere Mission ist...", "Wir streben an...", "Das Museum hat zum Ziel...", etc.
|
||
3. Ignorieren Sie Navigationselemente, Fusszeilen, Kontaktdaten, Offnungszeiten
|
||
4. Ignorieren Sie Werbung, Nachrichten und Veranstaltungsankundigungen
|
||
5. Wenn KEINE klare Mission/Vision/Ziel zu finden ist, geben Sie ein leeres Ergebnis zuruck
|
||
6. KOPIEREN Sie den Text wortlich - NICHT PARAPHRASIEREN oder UBERSETZEN
|
||
|
||
## Ausgabeformat (JSON):
|
||
Geben Sie NUR ein JSON-Objekt in diesem genauen Format zuruck:
|
||
```json
|
||
{{
|
||
"mission": "Der originale Missionstext hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
|
||
"vision": "Der originale Visionstext hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
|
||
"goals": "Die originalen Ziele hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
|
||
"confidence": 0.85,
|
||
"source_section": "Name des Abschnitts, in dem dies gefunden wurde (z.B. 'Uber uns', 'Mission und Vision')",
|
||
"detected_language": "de"
|
||
}}
|
||
```
|
||
|
||
## Webseiteninhalt:
|
||
{content}
|
||
|
||
## Hinweis:
|
||
- Geben Sie NUR das JSON-Objekt zuruck, keinen anderen Text
|
||
- Confidence muss zwischen 0.0 und 1.0 liegen
|
||
- NIEMALS UBERSETZEN - Originalsprache bewahren
|
||
- Wenn nichts gefunden: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
||
"""
|
||
|
||
EXTRACTION_PROMPT_FR = """Vous etes un expert dans l'analyse des sites web d'institutions patrimoniales (musees, archives, bibliotheques, etc.).
|
||
|
||
## CRITIQUE - NE PAS TRADUIRE:
|
||
Extrayez le texte EXACTEMENT tel qu'il apparait sur la page web.
|
||
NE TRADUISEZ PAS dans une autre langue. Preservez le texte original dans sa langue originale.
|
||
Si la source est en francais, la sortie doit etre en francais.
|
||
Si la source est en anglais, la sortie doit etre en anglais (ne pas traduire en francais).
|
||
|
||
Analysez le contenu de la page web suivante et extrayez la mission, la vision et/ou les objectifs de l'organisation.
|
||
|
||
## Instructions:
|
||
1. Recherchez des declarations explicites de mission ou de vision
|
||
2. Faites attention aux phrases comme "Notre mission est...", "Nous visons a...", "Le musee a pour but...", etc.
|
||
3. Ignorez les elements de navigation, le texte de pied de page, les coordonnees, les horaires
|
||
4. Ignorez les publicites, les actualites et les annonces d'evenements
|
||
5. S'il n'y a PAS de mission/vision/objectif clair, retournez un resultat vide
|
||
6. COPIEZ le texte litteralement - NE PAS PARAPHRASER ni TRADUIRE
|
||
|
||
## Format de sortie (JSON):
|
||
Retournez UNIQUEMENT un objet JSON dans ce format exact:
|
||
```json
|
||
{{
|
||
"mission": "Le texte original de la mission ici (NON TRADUIT), ou null si non trouve",
|
||
"vision": "Le texte original de la vision ici (NON TRADUIT), ou null si non trouve",
|
||
"goals": "Les objectifs originaux ici (NON TRADUITS), ou null si non trouves",
|
||
"confidence": 0.85,
|
||
"source_section": "Nom de la section ou cela a ete trouve (ex. 'A propos', 'Mission et Vision')",
|
||
"detected_language": "fr"
|
||
}}
|
||
```
|
||
|
||
## Contenu de la page web:
|
||
{content}
|
||
|
||
## Note:
|
||
- Retournez UNIQUEMENT l'objet JSON, pas d'autre texte
|
||
- La confiance doit etre entre 0.0 et 1.0
|
||
- JAMAIS TRADUIRE - preserver la langue originale
|
||
- Si rien trouve: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
||
"""
|
||
|
||
EXTRACTION_PROMPT_EN = """You are an expert in analyzing heritage institution websites (museums, archives, libraries, etc.).
|
||
|
||
## CRITICAL - DO NOT TRANSLATE:
|
||
Extract the text EXACTLY as it appears on the webpage.
|
||
DO NOT TRANSLATE to another language. Preserve the original text in its original language.
|
||
If the source is in English, the output must be in English.
|
||
If the source is in Dutch, the output must be in Dutch (do not translate to English).
|
||
If the source is in Spanish, the output must be in Spanish (do not translate to English).
|
||
If the source is in any other language, preserve that language.
|
||
|
||
Analyze the following webpage content and extract the mission, vision and/or goals of the organization.
|
||
|
||
## Instructions:
|
||
1. Look for explicit mission or vision statements
|
||
2. Pay attention to phrases like "Our mission is...", "We aim to...", "The museum seeks to...", etc.
|
||
3. Ignore navigation elements, footer text, contact information, opening hours
|
||
4. Ignore advertisements, news, and event announcements
|
||
5. If there is NO clear mission/vision/goal, return an empty result
|
||
6. COPY the text verbatim - DO NOT PARAPHRASE or TRANSLATE
|
||
|
||
## Output Format (JSON):
|
||
Return ONLY a JSON object in this exact format:
|
||
```json
|
||
{{
|
||
"mission": "The original mission text here (NOT TRANSLATED), or null if not found",
|
||
"vision": "The original vision text here (NOT TRANSLATED), or null if not found",
|
||
"goals": "The original goals here (NOT TRANSLATED), or null if not found",
|
||
"confidence": 0.85,
|
||
"source_section": "Name of the section where this was found (e.g., 'About us', 'Mission and Vision')",
|
||
"detected_language": "en"
|
||
}}
|
||
```
|
||
|
||
## Webpage content:
|
||
{content}
|
||
|
||
## Note:
|
||
- Return ONLY the JSON object, no other text
|
||
- Confidence must be between 0.0 and 1.0
|
||
- NEVER TRANSLATE - preserve original language
|
||
- If nothing found: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
||
"""
|
||
|
||
# Japanese prompt - for JP custodians (12,096 files)
EXTRACTION_PROMPT_JA = """あなたは文化遺産機関(博物館、図書館、文書館など)のウェブサイトを分析する専門家です。

## 重要 - 翻訳禁止:
テキストはウェブページに表示されているとおり、正確に抽出してください。
他の言語に翻訳しないでください。元のテキストを元の言語のまま保持してください。
日本語のソースは日本語で出力してください。
英語のソースは英語のまま出力してください(日本語に翻訳しないでください)。

以下のウェブページの内容を分析し、組織のミッション、ビジョン、および/または目標を抽出してください。

## 指示:
1. 明示的なミッションステートメントまたはビジョンステートメントを探してください
2. 「私たちの使命は...」「当館は...を目指しています」「○○博物館の理念」などの表現に注目してください
3. ナビゲーション要素、フッターテキスト、連絡先情報、営業時間は無視してください
4. 広告、ニュース、イベント告知は無視してください
5. 明確なミッション/ビジョン/目標がない場合は、空の結果を返してください
6. テキストをそのままコピーしてください - 言い換えや翻訳は禁止です

## 出力形式 (JSON):
以下の形式のJSONオブジェクトのみを返してください:
```json
{{
"mission": "元のミッションテキストをここに(翻訳なし)、見つからない場合はnull",
"vision": "元のビジョンテキストをここに(翻訳なし)、見つからない場合はnull",
"goals": "元の目標をここに(翻訳なし)、見つからない場合はnull",
"confidence": 0.85,
"source_section": "見つかったセクション名(例:「ご挨拶」「基本理念」「館長メッセージ」)",
"detected_language": "ja"
}}
```

## ウェブページの内容:
{content}

## 注意:
- JSONオブジェクトのみを返し、他のテキストは含めないでください
- confidenceは0.0から1.0の間でなければなりません
- 絶対に翻訳しないでください - 元の言語を保持してください
- 何も見つからない場合: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
|
||
|
||
# Czech prompt - for CZ custodians (8,432 files)
# NOTE(review): diacritics are inconsistent in the original string (mixed
# accented/unaccented words); preserved verbatim since this is runtime content.
EXTRACTION_PROMPT_CS = """Jste odbornik na analyzu webovych stranek pamatkových institucí (muzea, archivy, knihovny atd.).

## KRITICKÉ - NEPŘEKLÁDEJTE:
Extrahujte text PŘESNĚ tak, jak se objevuje na webove strance.
NEPŘEKLÁDEJTE do jineho jazyka. Zachovejte původní text v jeho původním jazyce.
Pokud je zdroj v češtině, výstup musí být v češtině.
Pokud je zdroj v angličtině, výstup musí být v angličtině (nepřekládejte do češtiny).

Analyzujte nasledující obsah webove stranky a extrahujte poslání, vizi a/nebo cíle organizace.

## Pokyny:
1. Hledejte explicitní prohlášení o poslání nebo vizi
2. Věnujte pozornost frazím jako "Naším posláním je...", "Usilujeme o...", "Muzeum si klade za cíl...", atd.
3. Ignorujte navigační prvky, text zapati, kontaktní údaje, otevírací dobu
4. Ignorujte reklamy, novinky a oznámení o akcích
5. Pokud NENÍ žádné jasné poslání/vize/cíl, vraťte prazdny vysledek
6. ZKOPÍRUJTE text doslovně - NEPARAFRÁZUJTE ani NEPŘEKLÁDEJTE

## Formát výstupu (JSON):
Vraťte POUZE objekt JSON v tomto přesném formátu:
```json
{{
"mission": "Původní text poslání zde (NEPŘELOŽENO), nebo null pokud nenalezeno",
"vision": "Původní text vize zde (NEPŘELOŽENO), nebo null pokud nenalezeno",
"goals": "Původní cíle zde (NEPŘELOŽENO), nebo null pokud nenalezeno",
"confidence": 0.85,
"source_section": "Název sekce, kde bylo nalezeno (např. 'O nas', 'Poslání a vize')",
"detected_language": "cs"
}}
```

## Obsah webové stránky:
{content}

## Poznámka:
- Vraťte POUZE objekt JSON, žádný jiný text
- Confidence musí být mezi 0.0 a 1.0
- NIKDY NEPŘEKLÁDEJTE - zachovejte původní jazyk
- Pokud nic nenalezeno: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
|
||
|
||
# Italian prompt - for IT custodians
EXTRACTION_PROMPT_IT = """Sei un esperto nell'analisi di siti web di istituzioni del patrimonio culturale (musei, archivi, biblioteche, ecc.).

## CRITICO - NON TRADURRE:
Estrai il testo ESATTAMENTE come appare sulla pagina web.
NON TRADURRE in un'altra lingua. Preserva il testo originale nella sua lingua originale.
Se la fonte e in italiano, l'output deve essere in italiano.
Se la fonte e in inglese, l'output deve essere in inglese (non tradurre in italiano).

Analizza il seguente contenuto della pagina web ed estrai la missione, la visione e/o gli obiettivi dell'organizzazione.

## Istruzioni:
1. Cerca dichiarazioni esplicite di missione o visione
2. Presta attenzione a frasi come "La nostra missione e...", "Miriamo a...", "Il museo si propone di...", ecc.
3. Ignora elementi di navigazione, testo a pie di pagina, informazioni di contatto, orari di apertura
4. Ignora pubblicita, notizie e annunci di eventi
5. Se NON c'e una chiara missione/visione/obiettivo, restituisci un risultato vuoto
6. COPIA il testo letteralmente - NON PARAFRASARE ne TRADURRE

## Formato di output (JSON):
Restituisci SOLO un oggetto JSON in questo formato esatto:
```json
{{
"mission": "Il testo originale della missione qui (NON TRADOTTO), o null se non trovato",
"vision": "Il testo originale della visione qui (NON TRADOTTO), o null se non trovato",
"goals": "Gli obiettivi originali qui (NON TRADOTTI), o null se non trovati",
"confidence": 0.85,
"source_section": "Nome della sezione dove e stato trovato (es. 'Chi siamo', 'Missione e Visione')",
"detected_language": "it"
}}
```

## Contenuto della pagina web:
{content}

## Nota:
- Restituisci SOLO l'oggetto JSON, nessun altro testo
- La confidence deve essere tra 0.0 e 1.0
- MAI TRADURRE - preservare la lingua originale
- Se nulla trovato: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
"""
|
||
|
||
# Map language codes to prompts
# Keys are ISO 639-1 codes; used by extract_mission_from_content() to pick
# the template matching the custodian's expected language.
LANGUAGE_PROMPTS = {
    'nl': EXTRACTION_PROMPT_NL,
    'es': EXTRACTION_PROMPT_ES,
    'pt': EXTRACTION_PROMPT_PT,
    'de': EXTRACTION_PROMPT_DE,
    'fr': EXTRACTION_PROMPT_FR,
    'en': EXTRACTION_PROMPT_EN,
    'ja': EXTRACTION_PROMPT_JA,
    'cs': EXTRACTION_PROMPT_CS,
    'it': EXTRACTION_PROMPT_IT,
}
|
||
|
||
# Default prompt for languages without specific template
EXTRACTION_PROMPT = EXTRACTION_PROMPT_EN  # Fallback to English prompt
|
||
|
||
def __init__(self, api_token: str, model: str = ZAI_GLM_MODEL):
    """Store credentials and prepare the HTTP headers for the Z.AI GLM API.

    Args:
        api_token: Bearer token for the Z.AI API.
        model: GLM model identifier; defaults to the module-level
            ZAI_GLM_MODEL constant defined earlier in this file.
    """
    self.api_token = api_token
    self.model = model
    # Reused on every chat-completions request in extract_mission_from_content().
    self.headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
|
||
|
||
# Language-specific system messages (Rule 36: preserve original language)
# One per supported ISO 639-1 code; 'en' doubles as the fallback in
# extract_mission_from_content().
SYSTEM_MESSAGES = {
    'nl': "Je bent een assistent die JSON-gestructureerde data extraheert uit webpagina's. Antwoord ALLEEN met valid JSON. KRITIEK: Vertaal NOOIT de geëxtraheerde tekst - behoud de originele taal.",
    'es': "Eres un asistente que extrae datos estructurados en JSON de paginas web. Responde SOLO con JSON valido. CRITICO: NUNCA traduzcas el texto extraido - preserva el idioma original.",
    'pt': "Voce e um assistente que extrai dados estruturados em JSON de paginas web. Responda APENAS com JSON valido. CRITICO: NUNCA traduza o texto extraido - preserve o idioma original.",
    'de': "Sie sind ein Assistent, der JSON-strukturierte Daten aus Webseiten extrahiert. Antworten Sie NUR mit validem JSON. KRITISCH: Übersetzen Sie NIEMALS den extrahierten Text - bewahren Sie die Originalsprache.",
    'fr': "Vous etes un assistant qui extrait des donnees structurees JSON des pages web. Repondez UNIQUEMENT avec du JSON valide. CRITIQUE: Ne traduisez JAMAIS le texte extrait - preservez la langue originale.",
    'en': "You are an assistant that extracts JSON-structured data from webpages. Respond ONLY with valid JSON. CRITICAL: NEVER translate the extracted text - preserve the original language.",
    'ja': "あなたはウェブページからJSON形式のデータを抽出するアシスタントです。有効なJSONのみで応答してください。重要: 抽出したテキストは絶対に翻訳しないでください - 元の言語を保持してください。",
    'cs': "Jste asistent, ktery extrahuje JSON strukturovana data z webovych stranek. Odpovezte POUZE validnim JSON. KRITICKE: NIKDY neprekladejte extrahovany text - zachovejte puvodni jazyk.",
    'it': "Sei un assistente che estrae dati strutturati JSON dalle pagine web. Rispondi SOLO con JSON valido. CRITICO: NON tradurre MAI il testo estratto - preserva la lingua originale.",
}
|
||
|
||
async def extract_mission_from_content(
    self,
    content: str,
    source_url: str,
    language: str = 'en',
    timeout: float = 60.0
) -> dict:
    """
    Use LLM to extract mission statement from webpage content.

    Implements Rule 36: Original Language Preservation - NO TRANSLATION.

    Args:
        content: The webpage text content (markdown or plain text)
        source_url: URL of the source page (for context; currently not sent
            to the model — kept for interface stability and future logging)
        language: ISO 639-1 language code (e.g., 'nl', 'es', 'de') for prompt selection
        timeout: Request timeout in seconds

    Returns:
        dict with keys: success, mission, vision, goals, confidence,
        source_section, detected_language, model — or success=False plus error.
    """
    # Truncate content if too long (GLM has context limits)
    max_chars = 12000
    if len(content) > max_chars:
        content = content[:max_chars] + "\n\n[... content truncated ...]"

    # Select language-appropriate prompt (Rule 36: Original Language Preservation)
    prompt_template = self.LANGUAGE_PROMPTS.get(language, self.EXTRACTION_PROMPT)
    prompt = prompt_template.format(content=content)

    # Select language-appropriate system message (English is the fallback)
    system_message = self.SYSTEM_MESSAGES.get(language, self.SYSTEM_MESSAGES['en'])

    request_body = {
        "model": self.model,
        "messages": [
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.1,  # Low temperature for consistent extraction
        "max_tokens": 2048,
    }

    async with httpx.AsyncClient(timeout=timeout) as client:
        try:
            response = await client.post(
                ZAI_GLM_API_URL,
                headers=self.headers,
                json=request_body
            )

            if response.status_code != 200:
                return {
                    "success": False,
                    "error": f"API error {response.status_code}: {response.text[:200]}",
                }

            result = response.json()

            # Extract the assistant's response
            if "choices" not in result or len(result["choices"]) == 0:
                return {
                    "success": False,
                    "error": "No response from API",
                }

            assistant_message = result["choices"][0]["message"]["content"]

            # Parse JSON from response.
            # Handle markdown code blocks if present.
            json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', assistant_message)
            if json_match:
                json_str = json_match.group(1)
            else:
                json_str = assistant_message.strip()

            try:
                extracted = json.loads(json_str)
            except json.JSONDecodeError as e:
                return {
                    "success": False,
                    "error": f"Failed to parse JSON response: {e}",
                    "raw_response": assistant_message[:500],
                }

            # Validate and return.
            # FIX: include detected_language — the prompts request it and the
            # docstring promises it, but it was previously dropped here.
            return {
                "success": True,
                "mission": extracted.get("mission"),
                "vision": extracted.get("vision"),
                "goals": extracted.get("goals"),
                "confidence": extracted.get("confidence", 0.0),
                "source_section": extracted.get("source_section"),
                "detected_language": extracted.get("detected_language"),
                "model": self.model,
            }

        except httpx.TimeoutException:
            return {
                "success": False,
                "error": "Request timed out",
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
            }
|
||
|
||
|
||
def _extract_website(data: dict) -> Optional[str]:
    """Return the first website URL found in a custodian record.

    Checks the same locations as before, in the same priority order:
    1. top-level 'website', 2. original_entry.webadres_organisatie,
    3. museum_register_enrichment.website_url, 4. wikidata_enrichment.official_website,
    5. google_maps_enrichment.website, 6. location.website,
    7. original_entry identifiers with scheme 'Website', 8. top-level identifiers.
    """
    if data.get('website'):
        return data['website']

    oe = data.get('original_entry')
    if isinstance(oe, dict) and oe.get('webadres_organisatie'):
        return oe['webadres_organisatie']

    mre = data.get('museum_register_enrichment')
    if isinstance(mre, dict) and mre.get('website_url'):
        return mre['website_url']

    we = data.get('wikidata_enrichment')
    if isinstance(we, dict) and we.get('official_website'):
        return we['official_website']

    gm = data.get('google_maps_enrichment')
    if isinstance(gm, dict) and gm.get('website'):
        return gm['website']

    loc = data.get('location')
    if isinstance(loc, dict) and loc.get('website'):
        return loc['website']

    # Identifier arrays: original_entry first, then top level.
    # 'or []' guards YAML records where 'identifiers:' is present but null
    # (the previous code raised TypeError iterating None in that case).
    if isinstance(oe, dict):
        for ident in oe.get('identifiers', []) or []:
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
                found = ident.get('identifier_value') or ident.get('identifier_url')
                if found:
                    return found

    for ident in data.get('identifiers', []) or []:
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
            found = ident.get('identifier_value') or ident.get('identifier_url')
            if found:
                return found

    return None


def find_custodians_with_websites(
    prefix: Optional[str] = None,
    limit: Optional[int] = None,
    skip_existing: bool = False
) -> list[tuple[Path, dict, str]]:
    """
    Find custodian YAML files that have website URLs.

    Args:
        prefix: Filter by GHCID prefix (e.g., "NL-NH" for Noord-Holland)
        limit: Maximum number of custodians to return
        skip_existing: If True, skip custodians that already have mission_statement

    Returns:
        List of (path, custodian_data, website_url) tuples
    """
    custodian_dir = PROJECT_ROOT / "data" / "custodian"
    results = []

    # Without a prefix, default to Dutch custodians only.
    pattern = f"{prefix}*.yaml" if prefix else "NL-*.yaml"

    for yaml_path in custodian_dir.glob(pattern):
        if limit and len(results) >= limit:
            break

        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if not data:
                continue

            # Skip if already has mission statements and skip_existing is True
            if skip_existing and data.get('mission_statement'):
                continue

            website = _extract_website(data)

            # Only keep absolute http(s) URLs.
            if website and website.startswith('http'):
                results.append((yaml_path, data, website))

        except Exception as e:
            print(f"Warning: Failed to parse {yaml_path}: {e}", file=sys.stderr)

    return results
|
||
|
||
|
||
def discover_mission_page_urls(base_url: str, language: str = 'en') -> list[str]:
    """
    Generate candidate URLs for mission/vision pages.

    Prioritizes language-specific patterns for better results, then appends
    English patterns as a fallback, then the homepage itself.

    Args:
        base_url: The custodian's main website URL
        language: ISO 639-1 language code (e.g., 'nl', 'ja', 'cs')

    Returns:
        De-duplicated list of URLs to check for mission content, in
        priority order.
    """
    # Normalize base URL - prefer https
    parsed = urlparse(base_url)
    scheme = 'https' if parsed.scheme == 'http' else parsed.scheme
    base = f"{scheme}://{parsed.netloc}"

    candidates: list[str] = []

    # First, add language-specific patterns (prioritized)
    for pattern in LANGUAGE_URL_PATTERNS.get(language, []):
        candidates.append(urljoin(base, pattern))

    # Then add English patterns as fallback (many international sites use English)
    if language != 'en':
        for pattern in ENGLISH_MISSION_PATTERNS:
            candidates.append(urljoin(base, pattern))

    # Also add the homepage as it might contain mission info
    candidates.append(base_url)

    # FIX: de-duplicate across ALL sources while preserving priority order.
    # Previously only the English patterns were checked against the language
    # ones, so overlapping language patterns produced duplicate fetches.
    return list(dict.fromkeys(candidates))
|
||
|
||
|
||
# Keywords to look for in links when discovering mission pages (multilingual)
# Matched against URL slugs, hence the hyphenated/lowercase forms.
MISSION_LINK_KEYWORDS = [
    # Dutch
    'missie', 'visie', 'over-ons', 'over', 'organisatie', 'doelstelling',
    'wie-zijn-wij', 'wie-we-zijn', 'onze-missie', 'het-museum', 'het-archief',
    'de-bibliotheek', 'stichting', 'vereniging', 'kernwaarden', 'ambitie',
    # Spanish
    'mision', 'vision', 'sobre-nosotros', 'quienes-somos', 'institucional',
    'historia', 'el-museo', 'la-biblioteca', 'el-archivo', 'acerca-de',
    'nuestra-mision', 'conocenos', 'nosotros',
    # Portuguese
    'missao', 'visao', 'sobre', 'sobre-nos', 'quem-somos', 'o-museu',
    'a-biblioteca', 'o-arquivo', 'nossa-missao', 'conheca',
    # German
    'leitbild', 'ueber-uns', 'uber-uns', 'das-museum', 'wir-ueber-uns',
    # French
    'a-propos', 'qui-sommes-nous', 'notre-mission', 'le-musee', 'presentation',
    # English
    'about', 'about-us', 'mission', 'vision', 'organization', 'who-we-are',
    # Japanese (romanized)
    'gaiyo', 'gaiyou', 'rinen', 'aisatsu', 'enkaku', 'greeting', 'message',
    'introduction', 'outline', 'profile', 'history',
    # Czech
    'o-nas', 'o-knihovne', 'o-muzeu', 'o-archivu', 'poslani', 'historie',
    'zakladni-informace', 'profil',
    # Italian
    'chi-siamo', 'la-missione', 'missione', 'storia', 'il-museo', 'presentazione',
]
|
||
|
||
|
||
def extract_links_from_markdown(content: str, base_url: str) -> list[str]:
    """
    Extract all unique links from markdown content.

    Handles both markdown-style links ``[text](url)`` and bare
    ``http(s)://`` URLs appearing in plain text. Relative URLs found in
    markdown links are resolved against *base_url*; non-navigational
    schemes (anchors, mailto:, tel:, javascript:) are skipped.

    Args:
        content: Markdown text content
        base_url: Base URL for resolving relative links

    Returns:
        List of absolute URLs found in the content, in first-seen order,
        without duplicates
    """
    links: list[str] = []

    # Match markdown links: [text](url)
    md_link_pattern = r'\[([^\]]*)\]\(([^)]+)\)'
    for match in re.finditer(md_link_pattern, content):
        url = match.group(2).strip()
        if not url:
            continue
        # Skip anchors, mailto, tel, javascript, etc. - not fetchable pages
        if url.startswith(('#', 'mailto:', 'tel:', 'javascript:')):
            continue
        # Resolve relative URLs against the page they appeared on
        if not url.startswith('http'):
            url = urljoin(base_url, url)
        # Deduplicate while preserving first-seen order (the original code
        # only deduplicated the plain-URL pass, so repeated nav links were
        # returned multiple times)
        if url not in links:
            links.append(url)

    # Also match plain URLs in text
    url_pattern = r'https?://[^\s<>\)\]"\']+'
    for match in re.finditer(url_pattern, content):
        url = match.group(0).rstrip('.,;:')  # Trim trailing punctuation
        if url not in links:
            links.append(url)

    return links
|
||
|
||
|
||
# URL path patterns to EXCLUDE from mission link discovery
# These are false positives - URLs that contain mission keywords but aren't about pages.
# Each entry is matched as a case-insensitive substring of the URL path (and of the
# full URL) in filter_mission_links(); matching any one pattern rejects the link.
EXCLUDE_URL_PATTERNS: list[str] = [
    # Library catalogs (Belgian bibliotheek.be, Dutch, etc.)
    '/catalogus/',  # Dutch/Belgian library catalogs
    '/catalog/',  # English library catalogs
    '/catalogue/',  # French library catalogs
    '/katalog/',  # German/Czech library catalogs
    '/search-history',  # Library search history pages
    '/recover-password',  # Password recovery pages
    '/login',  # Login pages
    '/account/',  # Account pages
    '/my-account',  # Account pages
    '/mijn-account',  # Dutch account pages
    '/cart/',  # Shopping cart
    '/checkout/',  # Checkout
    '/winkelwagen',  # Dutch shopping cart

    # Book/item detail pages (often have "over" in Dutch book titles)
    'library-marc',  # Library MARC records
    'vlacc',  # Flemish library consortium records
    'library-marc-vlacc',  # Combined pattern

    # Media and file pages
    '/download/',  # Download pages
    '/uploads/',  # Upload directories
    '/files/',  # File directories
    '/media/',  # Media directories (unless part of museum name)
    '/assets/',  # Asset directories

    # Administrative and utility pages
    '/admin/',  # Admin pages
    '/wp-admin/',  # WordPress admin
    '/wp-content/',  # WordPress content
    '/wp-includes/',  # WordPress includes
    '/cgi-bin/',  # CGI scripts
    '/api/',  # API endpoints
    '/_next/',  # Next.js internal
    '/_nuxt/',  # Nuxt.js internal

    # Social and external
    '/share/',  # Share pages
    '/print/',  # Print pages
    '/email/',  # Email pages
    '/rss',  # RSS feeds
    '/feed',  # Feed pages

    # E-commerce patterns
    '/product/',  # Product pages
    '/products/',  # Product listings
    '/shop/',  # Shop pages
    '/store/',  # Store pages
    '/bestellen/',  # Dutch ordering
    '/reserveren/',  # Dutch reservation

    # Japanese specific exclusions
    '/search',  # Search pages
    '/result',  # Search result pages
    '/ebook/',  # E-book pages
    '/overdrive',  # Overdrive e-book service
]
|
||
|
||
|
||
def filter_mission_links(links: list[str], base_domain: str) -> list[str]:
    """
    Filter links to only those likely to contain mission/vision content.

    A link is kept when it is on the custodian's own domain (or a
    subdomain of it), does not match any EXCLUDE_URL_PATTERNS entry,
    and its path contains at least one MISSION_LINK_KEYWORDS keyword.

    Args:
        links: List of URLs to filter
        base_domain: Domain of the custodian website (lowercase);
            only same-domain/subdomain links are kept

    Returns:
        List of URLs that likely contain mission content
        (deduplicated, first-seen order)
    """
    mission_urls: list[str] = []

    for url in links:
        try:
            parsed = urlparse(url)
        except ValueError:
            # Skip malformed URLs (e.g., invalid IPv6)
            continue

        # Only keep links from the same domain or its subdomains.
        # FIX: the previous substring check (`base_domain in netloc`)
        # accepted unrelated hosts such as 'evil-example.com' for base
        # domain 'example.com' and was case-sensitive; use an exact or
        # '.'-suffix match on the lowercased host instead. Relative URLs
        # (empty netloc) are still kept, as before.
        host = parsed.netloc.lower()
        if host and host != base_domain and not host.endswith('.' + base_domain):
            continue

        path_lower = parsed.path.lower()

        # EXCLUSION CHECK: Skip URLs that match exclusion patterns
        # (checked against both the path and the full URL, as before)
        if any(pattern in path_lower or pattern in url.lower()
               for pattern in EXCLUDE_URL_PATTERNS):
            continue

        # Check if path contains mission-related keywords
        for keyword in MISSION_LINK_KEYWORDS:
            if keyword in path_lower:
                if url not in mission_urls:
                    mission_urls.append(url)
                break

    return mission_urls
|
||
|
||
|
||
async def discover_mission_links_from_homepage(
    reader: Union['LinkupWebReader', 'ZAIWebReader', 'CompositeWebReader'],
    homepage_url: str,
    verbose: bool = False
) -> tuple[list[str], str, str]:
    """
    Fetch a custodian homepage and collect links to mission/vision pages.

    Inspecting the navigation links the site actually exposes is more
    reliable than guessing URL patterns.

    Args:
        reader: Web reader instance
        homepage_url: The custodian's homepage URL
        verbose: Whether to print progress

    Returns:
        Tuple of (discovered_urls, homepage_content, retrieved_on)
        Returns ([], '', '') if homepage fetch fails
    """
    fetch_result = await reader.read_webpage(homepage_url)

    if not fetch_result['success']:
        if verbose:
            print(f" Homepage fetch failed: {fetch_result.get('error', 'Unknown')[:50]}")
        return [], '', ''

    page_text = fetch_result.get('content', '')
    fetched_at = fetch_result.get('retrieved_on', datetime.now(timezone.utc).isoformat())

    if not page_text:
        # Fetch succeeded but the page was empty; still return the timestamp
        return [], page_text, fetched_at

    # Restrict link discovery to the custodian's own domain
    own_domain = urlparse(homepage_url).netloc.lower()

    # Pull every link off the homepage, then narrow to mission-like ones
    candidate_links = extract_links_from_markdown(page_text, homepage_url)
    if verbose:
        print(f" Found {len(candidate_links)} links on homepage")

    relevant_links = filter_mission_links(candidate_links, own_domain)
    if verbose and relevant_links:
        print(f" Found {len(relevant_links)} mission-related links:")
        for candidate in relevant_links[:5]:  # Show first 5
            print(f" - {candidate}")

    return relevant_links, page_text, fetched_at
|
||
|
||
|
||
def extract_statements_from_content(
    content: str,
    source_url: str,
    retrieved_on: str,
    ghcid: str,
) -> list[dict]:
    """
    Extract mission, vision, and goal statements from webpage content.

    This uses keyword matching and section detection. For production,
    consider using an LLM for more intelligent extraction.

    The pipeline is: reject error/JSON pages, require at least one
    MISSION_KEYWORDS hit anywhere in the page, split into sections,
    classify each section by the first keyword found near its start,
    strip markdown, and drop boilerplate/navigation fragments. One
    statement dict is produced per surviving section.

    Args:
        content: The webpage text content
        source_url: URL of the source page
        retrieved_on: ISO timestamp when page was retrieved
        ghcid: GHCID of the custodian

    Returns:
        List of mission statement dictionaries
    """
    statements = []
    content_lower = content.lower()

    # Skip error pages (404, 500, etc.)
    error_indicators = [
        'pagina niet gevonden', 'page not found', '404',
        'niet gevonden', 'not found', 'error', 'fout',
        'deze pagina bestaat niet', 'this page does not exist'
    ]
    # Check title and first 500 chars for error indicators
    # NOTE(review): broad terms like 'error' or '404' in the first 500 chars
    # could reject legitimate pages - verify against real-world false positives
    if any(indicator in content_lower[:500] for indicator in error_indicators):
        return []

    # Also check if content looks like raw JSON (Z.AI sometimes returns this)
    if content.strip().startswith('{"') or content.strip().startswith('"{'):
        return []

    # Check if this page has mission-related content
    # (any keyword of any statement type, anywhere on the page)
    has_mission_content = any(
        keyword in content_lower
        for keywords in MISSION_KEYWORDS.values()
        for keyword in keywords
    )

    if not has_mission_content:
        return []

    # Split content into sections (by headings or blank lines)
    sections = re.split(r'\n\s*\n|\n#+\s+|\n\*\*[^*]+\*\*\n', content)

    for section in sections:
        section = section.strip()
        if len(section) < 20:  # Skip very short sections
            continue

        section_lower = section.lower()

        # Detect statement type based on keywords
        statement_type = None
        confidence = 0.7

        # First matching keyword wins (MISSION_KEYWORDS dict order matters);
        # a hit in the first 50 chars scores higher than one further in
        for stype, keywords in MISSION_KEYWORDS.items():
            for keyword in keywords:
                if keyword in section_lower[:200]:  # Check beginning of section
                    statement_type = stype
                    confidence = 0.85 if keyword in section_lower[:50] else 0.75
                    break
            if statement_type:
                break

        if not statement_type:
            continue

        # Clean up the section text
        # Remove markdown formatting (bold, italics, headings, link syntax)
        clean_text = re.sub(r'\*\*([^*]+)\*\*', r'\1', section)
        clean_text = re.sub(r'\*([^*]+)\*', r'\1', clean_text)
        clean_text = re.sub(r'#+\s*', '', clean_text)
        clean_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', clean_text)
        clean_text = clean_text.strip()

        if len(clean_text) < 20:
            continue

        # Skip boilerplate/footer content
        boilerplate_indicators = [
            '©', 'copyright', 'all rights reserved', 'alle rechten voorbehouden',
            'privacybeleid', 'privacy policy', 'cookie', 'algemene voorwaarden',
            'terms and conditions', 'nieuwsbrief', 'newsletter', 'subscribe',
            'volg ons', 'follow us', 'social media', 'facebook', 'instagram',
            'twitter', 'linkedin', 'youtube', 'contact', 'openingstijden',
            'opening hours', 'bereikbaarheid', 'route', 'adres:', 'address:',
        ]
        clean_lower = clean_text.lower()
        boilerplate_count = sum(1 for ind in boilerplate_indicators if ind in clean_lower)
        # If more than 2 boilerplate indicators in a short text, skip it
        if boilerplate_count >= 2 and len(clean_text) < 200:
            continue
        # If the text is primarily copyright/footer (starts with ©)
        if clean_text.strip().startswith('©'):
            continue

        # Skip navigation/intro text (too short to be actual mission content)
        # Actual mission statements are usually at least 50 characters
        if len(clean_text) < 50:
            continue

        # Skip text that looks like a link/intro (e.g., "Lees alles over...")
        skip_patterns = [
            r'^lees\s+(alles\s+)?over',
            r'^klik\s+hier',
            r'^meer\s+(info|informatie)',
            r'^bekijk\s+',
            r'^ga\s+naar',
            r'^read\s+(more\s+)?about',
            r'^click\s+here',
            r'^view\s+',
        ]
        if any(re.match(pattern, clean_lower) for pattern in skip_patterns):
            continue

        # Generate statement ID
        # NOTE: the ID is keyed on (ghcid, type, year), so re-running within
        # the same year produces the same ID and downstream dedup applies
        year = datetime.now().year
        statement_id = f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{statement_type}-{year}"

        # Compute content hash
        content_hash = compute_content_hash(clean_text)

        # Current timestamp for statement creation (Rule 35: Dual Timestamps)
        statement_created_at = datetime.now(timezone.utc).isoformat()

        statement = {
            'statement_id': statement_id,
            'statement_type': statement_type,
            'statement_text': clean_text,
            'statement_language': get_language_from_ghcid(ghcid),  # Detect from GHCID country
            'extracted_verbatim': True,  # Rule 36: Confirms no translation occurred
            'source_url': source_url,
            'content_hash': content_hash,
            # Rule 35: Dual Timestamp Provenance
            'provenance': {
                'statement_created_at': statement_created_at,  # When this claim was extracted
                'source_archived_at': retrieved_on,  # When the webpage was fetched
                'retrieval_agent': 'linkup-api',
                'extraction_agent': 'keyword-matching/batch',
                'extraction_confidence': confidence,
                # W3C PROV-O compatible fields
                'prov:wasDerivedFrom': source_url,
                'prov:generatedAtTime': statement_created_at,
            }
        }

        statements.append(statement)

    return statements
|
||
|
||
|
||
async def extract_statements_with_llm(
    llm_extractor: GLMMissionExtractor,
    content: str,
    source_url: str,
    retrieved_on: str,
    ghcid: str,
) -> list[dict]:
    """
    Extract mission, vision, and goal statements using LLM (Z.AI GLM).

    This provides much better quality extraction than keyword matching
    by using semantic understanding of the content. Obvious error pages,
    raw JSON responses, and near-empty pages are rejected before the
    (expensive) LLM call.

    Args:
        llm_extractor: GLMMissionExtractor instance
        content: The webpage text content
        source_url: URL of the source page
        retrieved_on: ISO timestamp when page was retrieved
        ghcid: GHCID of the custodian

    Returns:
        List of mission statement dictionaries (at most one per type:
        mission, vision, goal)
    """
    # Quick pre-filter: skip obvious error pages
    content_lower = content.lower()
    error_indicators = [
        'pagina niet gevonden', 'page not found', '404',
        'niet gevonden', 'not found', 'deze pagina bestaat niet',
        'oeps', 'error', 'no routes match', 'routing error'
    ]
    if any(indicator in content_lower[:500] for indicator in error_indicators):
        return []

    # Skip raw JSON responses
    if content.strip().startswith('{"') or content.strip().startswith('"{'):
        return []

    # Skip very short content (likely empty page)
    if len(content.strip()) < 200:
        return []

    # Expected language from GHCID country code (Rule 36: Original Language
    # Preservation). Computed once and reused both for the extraction prompt
    # and for the language-mismatch note below (previously derived twice).
    expected_language = get_language_from_ghcid(ghcid)

    # Call LLM for extraction with language-specific prompt
    result = await llm_extractor.extract_mission_from_content(
        content=content,
        source_url=source_url,
        language=expected_language
    )

    if not result['success']:
        return []

    statements = []
    year = datetime.now().year

    # Use detected language from LLM if available, else fall back to expected
    detected_language = result.get('detected_language') or expected_language

    # Process each statement type
    for statement_type in ['mission', 'vision', 'goals']:
        text = result.get(statement_type)
        # 'null' guards against the LLM returning the literal string
        if not text or text == 'null' or len(str(text).strip()) < 20:
            continue

        # Map 'goals' to 'goal' for consistency with schema
        schema_type = 'goal' if statement_type == 'goals' else statement_type

        # Generate statement ID (stable per ghcid/type/year)
        statement_id = f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{schema_type}-{year}"

        # Compute content hash
        content_hash = compute_content_hash(str(text))

        # Current timestamp for statement creation (Rule 35: Dual Timestamps)
        statement_created_at = datetime.now(timezone.utc).isoformat()

        statement = {
            'statement_id': statement_id,
            'statement_type': schema_type,
            'statement_text': str(text).strip(),
            'statement_language': detected_language,  # Use LLM-detected language
            'extracted_verbatim': True,  # Rule 36: Confirms no translation occurred
            'source_url': source_url,
            'content_hash': content_hash,
            # Rule 35: Dual Timestamp Provenance
            'provenance': {
                'statement_created_at': statement_created_at,  # When this claim was extracted
                'source_archived_at': retrieved_on,  # When the webpage was fetched
                'retrieval_agent': 'linkup-api',
                'extraction_agent': f'zai-glm/{result.get("model", ZAI_GLM_MODEL)}',
                'extraction_confidence': result.get('confidence', 0.0),
                # W3C PROV-O compatible fields
                'prov:wasDerivedFrom': source_url,
                'prov:generatedAtTime': statement_created_at,
            }
        }

        # Add source section if available
        if result.get('source_section'):
            statement['source_section'] = result['source_section']

        # Document language mismatch if detected language differs from expected (Rule 36)
        if detected_language != expected_language:
            statement['language_note'] = f"Content in {detected_language}, expected {expected_language} based on GHCID country code"

        statements.append(statement)

    return statements
|
||
|
||
|
||
def update_custodian_yaml(
    yaml_path: Path,
    custodian_data: dict,
    statements: list[dict],
    dry_run: bool = False
) -> bool:
    """
    Merge newly extracted mission statements into a custodian YAML file.

    Statements whose statement_id is already present are skipped; the
    file is rewritten only when at least one new statement was appended.

    Args:
        yaml_path: Path to the custodian YAML file
        custodian_data: Current custodian data
        statements: List of extracted statements
        dry_run: If True, don't write changes

    Returns:
        True if updated successfully
    """
    if not statements:
        return False

    # Make sure the target list exists before appending
    existing = custodian_data.setdefault('mission_statement', [])

    known_ids = {
        entry.get('statement_id')
        for entry in existing
        if isinstance(entry, dict)
    }

    # Append only statements whose ID is not already recorded
    fresh = [s for s in statements if s['statement_id'] not in known_ids]
    existing.extend(fresh)
    added = len(fresh)

    if added == 0:
        return False

    if dry_run:
        print(f" Would add {added} statements to {yaml_path.name}")
        return True

    # Persist the updated custodian record
    try:
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(
                custodian_data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120
            )
        print(f" Added {added} statements to {yaml_path.name}")
        return True
    except Exception as e:
        print(f" Error writing {yaml_path.name}: {e}", file=sys.stderr)
        return False
|
||
|
||
|
||
async def process_custodian(
    reader: Union[LinkupWebReader, ZAIWebReader, CompositeWebReader],
    yaml_path: Path,
    custodian_data: dict,
    website: str,
    dry_run: bool = False,
    verbose: bool = False,
    llm_extractor: Optional[GLMMissionExtractor] = None,
) -> dict:
    """
    Process a single custodian: discover pages, fetch content, extract statements.

    IMPROVED: Now uses two-phase discovery:
    1. First fetch homepage and extract actual mission page links from navigation
    2. Fall back to URL pattern guessing only if no links found

    Statements found on the homepage itself are extracted first; if the LLM
    finds a high-confidence mission there, the dedicated candidate pages are
    skipped entirely (early return). Otherwise up to 5 candidate URLs are
    fetched in parallel, statements are deduplicated per type (keeping the
    highest extraction confidence), and the custodian YAML is updated.

    Args:
        reader: Web reader instance (Linkup, ZAI, or Composite with fallback)
        yaml_path: Path to custodian YAML file
        custodian_data: Current custodian data
        website: Website URL to process
        dry_run: If True, don't write changes
        verbose: If True, show detailed progress
        llm_extractor: Optional LLM extractor for intelligent extraction

    Returns:
        dict with processing results (keys: ghcid, name, website,
        pages_checked, pages_with_content, statements_found,
        statements_added, discovery_method, errors)
    """
    # Rebuild the base GHCID from the first five '-'-separated filename parts
    ghcid = yaml_path.stem.split('-')[0:5]  # Extract base GHCID from filename
    ghcid = '-'.join(ghcid[:5]) if len(ghcid) >= 5 else yaml_path.stem

    # Get name for display (prefer emic name, then flat 'name', then GHCID)
    name = custodian_data.get('custodian_name', {}).get('emic_name')
    if not name:
        name = custodian_data.get('name', ghcid)

    result = {
        'ghcid': ghcid,
        'name': name,
        'website': website,
        'pages_checked': 0,
        'pages_with_content': 0,
        'statements_found': 0,
        'statements_added': 0,
        'discovery_method': 'none',
        'errors': [],
    }

    if verbose:
        print(f"\nProcessing {ghcid}: {name}")
        print(f" Website: {website}")

    all_statements = []
    homepage_content = None
    homepage_retrieved_on = None

    # PHASE 1: Discover mission pages from homepage links (preferred method)
    if verbose:
        print(f" Phase 1: Discovering mission pages from homepage...")

    discovered_links, homepage_content, homepage_retrieved_on = await discover_mission_links_from_homepage(
        reader, website, verbose
    )

    result['pages_checked'] += 1  # Homepage was fetched

    if discovered_links:
        result['discovery_method'] = 'homepage_links'
        candidate_urls = discovered_links[:5]  # Limit to 5 discovered links
        if verbose:
            print(f" Using {len(candidate_urls)} discovered mission links")
    else:
        # PHASE 2: Fall back to URL pattern guessing
        result['discovery_method'] = 'pattern_guessing'
        if verbose:
            print(f" Phase 2: No mission links found, falling back to URL patterns...")
        # Use language-specific patterns based on GHCID country code
        language = get_language_from_ghcid(ghcid)
        candidate_urls = discover_mission_page_urls(website, language=language)[:5]

    # First, try to extract from homepage content (if we have it)
    # (>200 chars filters out empty/near-empty fetch results)
    if homepage_content and len(homepage_content) > 200:
        result['pages_with_content'] += 1

        if llm_extractor:
            statements = await extract_statements_with_llm(
                llm_extractor, homepage_content, website,
                homepage_retrieved_on or datetime.now(timezone.utc).isoformat(), ghcid
            )
            if verbose and statements:
                print(f" [LLM] Found {len(statements)} statements on homepage")
        else:
            statements = extract_statements_from_content(
                homepage_content, website,
                homepage_retrieved_on or datetime.now(timezone.utc).isoformat(), ghcid
            )
            if verbose and statements:
                print(f" [Keyword] Found {len(statements)} statements on homepage")

        if statements:
            all_statements.extend(statements)
            # If we found a mission statement on homepage with high confidence, skip dedicated pages
            # (unless using keyword extraction which has lower accuracy)
            # Helper to get confidence (handles nested provenance structure)
            def get_stmt_confidence(s):
                if 'provenance' in s and 'extraction_confidence' in s['provenance']:
                    return s['provenance']['extraction_confidence']
                return s.get('extraction_confidence', 0)

            if llm_extractor and any(s['statement_type'] == 'mission' and get_stmt_confidence(s) > 0.7 for s in statements):
                if verbose:
                    print(f" Found high-confidence mission on homepage, skipping dedicated pages")
                result['discovery_method'] = 'homepage_content'
                result['statements_found'] = len(all_statements)
                # Deduplicate and return early
                # NOTE: this mirrors the type-dedup logic at the end of the
                # function (keep highest confidence per statement type)
                unique_statements = {}
                for stmt in all_statements:
                    stype = stmt['statement_type']
                    if stype not in unique_statements or get_stmt_confidence(stmt) > get_stmt_confidence(unique_statements[stype]):
                        unique_statements[stype] = stmt
                final_statements = list(unique_statements.values())
                if final_statements:
                    if update_custodian_yaml(yaml_path, custodian_data, final_statements, dry_run):
                        result['statements_added'] = len(final_statements)
                return result

    # Check candidate mission page URLs - PARALLEL FETCHING for speed
    # Filter out homepage (already processed)
    urls_to_check = [url for url in candidate_urls if url.rstrip('/') != website.rstrip('/')]

    if urls_to_check:
        # Fetch all URLs in parallel (up to 5 concurrent)
        async def fetch_url(url):
            """Fetch a single URL and return result with URL."""
            page_result = await reader.read_webpage(url)
            return url, page_result

        # Create tasks for parallel fetching; return_exceptions=True so one
        # failed fetch does not abort the whole batch
        fetch_tasks = [fetch_url(url) for url in urls_to_check]
        fetch_results = await asyncio.gather(*fetch_tasks, return_exceptions=True)

        # Process results
        for fetch_result in fetch_results:
            if isinstance(fetch_result, Exception):
                continue

            url, page_result = fetch_result
            result['pages_checked'] += 1

            if verbose:
                print(f" Checking: {url}")

            if not page_result['success']:
                if verbose:
                    print(f" Failed: {page_result.get('error', 'Unknown error')[:50]}")
                result['errors'].append(f"{url}: {page_result.get('error', 'Unknown')[:50]}")
                continue

            content = page_result.get('content', '')
            if not content or len(content) < 100:
                if verbose:
                    print(f" No content")
                continue

            result['pages_with_content'] += 1

            # Extract statements from content
            retrieved_on = page_result.get('retrieved_on', datetime.now(timezone.utc).isoformat())

            # Use LLM extraction if available, otherwise fall back to keyword-based
            if llm_extractor:
                statements = await extract_statements_with_llm(
                    llm_extractor, content, url, retrieved_on, ghcid
                )
                if verbose and statements:
                    print(f" [LLM] Found {len(statements)} statements")
            else:
                statements = extract_statements_from_content(content, url, retrieved_on, ghcid)
                if verbose and statements:
                    print(f" [Keyword] Found {len(statements)} statements")

            if statements:
                all_statements.extend(statements)

    result['statements_found'] = len(all_statements)

    # Helper function to get confidence from statement (handles nested provenance)
    def get_confidence(stmt):
        if 'provenance' in stmt and 'extraction_confidence' in stmt['provenance']:
            return stmt['provenance']['extraction_confidence']
        return stmt.get('extraction_confidence', 0)

    # Deduplicate statements by type (keep highest confidence)
    unique_statements = {}
    for stmt in all_statements:
        stype = stmt['statement_type']
        if stype not in unique_statements or get_confidence(stmt) > get_confidence(unique_statements[stype]):
            unique_statements[stype] = stmt

    final_statements = list(unique_statements.values())

    # Update YAML file
    if final_statements:
        if update_custodian_yaml(yaml_path, custodian_data, final_statements, dry_run):
            result['statements_added'] = len(final_statements)

    return result
|
||
|
||
|
||
async def main():
|
||
parser = argparse.ArgumentParser(
|
||
description='Batch extract mission statements from heritage custodian websites'
|
||
)
|
||
parser.add_argument(
|
||
'--test', type=int, metavar='N',
|
||
help='Test mode: process only N custodians'
|
||
)
|
||
parser.add_argument(
|
||
'--province', type=str, metavar='PREFIX',
|
||
help='Process custodians matching GHCID prefix (e.g., NL-NH for Noord-Holland)'
|
||
)
|
||
parser.add_argument(
|
||
'--ghcid', type=str,
|
||
help='Process a single custodian by GHCID'
|
||
)
|
||
parser.add_argument(
|
||
'--all', action='store_true',
|
||
help='Process all Dutch custodians with websites'
|
||
)
|
||
parser.add_argument(
|
||
'--dry-run', action='store_true',
|
||
help='Show what would be done without making changes'
|
||
)
|
||
parser.add_argument(
|
||
'--verbose', '-v', action='store_true',
|
||
help='Show detailed progress'
|
||
)
|
||
parser.add_argument(
|
||
'--concurrency', type=int, default=3,
|
||
help='Number of concurrent requests (default: 3)'
|
||
)
|
||
parser.add_argument(
|
||
'--llm', action='store_true',
|
||
help='Use LLM (Z.AI GLM) for intelligent extraction instead of keyword matching'
|
||
)
|
||
parser.add_argument(
|
||
'--prefer-zai', action='store_true',
|
||
help='Use Z.AI Web Reader as primary fetcher instead of Linkup (better for JS-heavy sites)'
|
||
)
|
||
parser.add_argument(
|
||
'--skip-existing', action='store_true',
|
||
help='Skip custodians that already have mission statements'
|
||
)
|
||
|
||
args = parser.parse_args()
|
||
|
||
if not any([args.test, args.province, args.ghcid, args.all]):
|
||
parser.print_help()
|
||
print("\nExample usage:")
|
||
print(" python scripts/batch_extract_mission_statements.py --test 5 --verbose")
|
||
print(" python scripts/batch_extract_mission_statements.py --test 5 --llm --verbose # With LLM extraction")
|
||
print(" python scripts/batch_extract_mission_statements.py --province NL --llm --prefer-zai # Use Z.AI Web Reader")
|
||
print(" python scripts/batch_extract_mission_statements.py --province NL-NH --llm")
|
||
print(" python scripts/batch_extract_mission_statements.py --ghcid NL-ZH-ZUI-M-LMT --llm")
|
||
sys.exit(1)
|
||
|
||
# Get API tokens
|
||
try:
|
||
tokens = get_api_tokens()
|
||
except ValueError as e:
|
||
print(f"Error: {e}", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
# Initialize composite web reader with fallback capability
|
||
# This tries Linkup first, then falls back to Z.AI Web Reader for failed requests
|
||
# Significantly improves success rate for Japanese sites and JS-heavy pages
|
||
linkup_key = tokens.get('linkup')
|
||
zai_key = tokens.get('zai')
|
||
|
||
if linkup_key and zai_key:
|
||
prefer_zai = getattr(args, 'prefer_zai', False)
|
||
reader = CompositeWebReader(linkup_key=linkup_key, zai_key=zai_key, prefer_zai=prefer_zai)
|
||
if prefer_zai:
|
||
print("Using Z.AI Web Reader as PRIMARY with Linkup fallback for web fetching")
|
||
else:
|
||
print("Using Linkup API with Z.AI Web Reader fallback for web fetching")
|
||
elif linkup_key:
|
||
reader = CompositeWebReader(linkup_key=linkup_key)
|
||
print("Using Linkup API for web fetching (no fallback available)")
|
||
elif zai_key:
|
||
reader = CompositeWebReader(zai_key=zai_key, prefer_zai=True)
|
||
print("Using Z.AI Web Reader API for web fetching")
|
||
else:
|
||
print("Error: No API token available (need LINKUP_API_KEY or ZAI_API_TOKEN)", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
# Initialize LLM extractor if requested
|
||
llm_extractor = None
|
||
if args.llm:
|
||
if 'zai' not in tokens:
|
||
print("Error: --llm requires ZAI_API_TOKEN for LLM extraction", file=sys.stderr)
|
||
sys.exit(1)
|
||
llm_extractor = GLMMissionExtractor(tokens['zai'])
|
||
print(f"Using Z.AI GLM ({ZAI_GLM_MODEL}) for LLM-based extraction")
|
||
|
||
# Find custodians to process
|
||
if args.ghcid:
|
||
# Single custodian mode
|
||
custodian_dir = PROJECT_ROOT / "data" / "custodian"
|
||
yaml_files = list(custodian_dir.glob(f"{args.ghcid}*.yaml"))
|
||
|
||
if not yaml_files:
|
||
print(f"Error: No custodian file found for GHCID {args.ghcid}", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
yaml_path = yaml_files[0]
|
||
with open(yaml_path, 'r', encoding='utf-8') as f:
|
||
data = yaml.safe_load(f)
|
||
|
||
# Find website using same logic as find_custodians_with_websites
|
||
website = None
|
||
|
||
# 1. Direct website field
|
||
if 'website' in data and data['website']:
|
||
website = data['website']
|
||
|
||
# 2. Original entry webadres_organisatie
|
||
if not website and 'original_entry' in data:
|
||
oe = data['original_entry']
|
||
if isinstance(oe, dict) and oe.get('webadres_organisatie'):
|
||
website = oe['webadres_organisatie']
|
||
|
||
# 3. Museum register enrichment website_url
|
||
if not website and 'museum_register_enrichment' in data:
|
||
mre = data['museum_register_enrichment']
|
||
if isinstance(mre, dict) and mre.get('website_url'):
|
||
website = mre['website_url']
|
||
|
||
# 4. Wikidata enrichment official_website
|
||
if not website and 'wikidata_enrichment' in data:
|
||
we = data['wikidata_enrichment']
|
||
if isinstance(we, dict) and we.get('official_website'):
|
||
website = we['official_website']
|
||
|
||
# 5. Google Maps enrichment website
|
||
if not website and 'google_maps_enrichment' in data:
|
||
gm = data['google_maps_enrichment']
|
||
if isinstance(gm, dict) and gm.get('website'):
|
||
website = gm['website']
|
||
|
||
# 6. Location object website
|
||
if not website and 'location' in data:
|
||
loc = data['location']
|
||
if isinstance(loc, dict) and loc.get('website'):
|
||
website = loc['website']
|
||
|
||
# 7. Original entry identifiers (Website scheme)
|
||
if not website and 'original_entry' in data:
|
||
oe = data['original_entry']
|
||
if isinstance(oe, dict) and 'identifiers' in oe:
|
||
for ident in oe.get('identifiers', []):
|
||
if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
|
||
website = ident.get('identifier_value') or ident.get('identifier_url')
|
||
if website:
|
||
break
|
||
|
||
# 8. Top-level identifiers array (Website scheme)
|
||
if not website and 'identifiers' in data:
|
||
for ident in data.get('identifiers', []):
|
||
if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
|
||
website = ident.get('identifier_value') or ident.get('identifier_url')
|
||
if website:
|
||
break
|
||
|
||
if not website or not website.startswith('http'):
|
||
print(f"Error: No website found for {args.ghcid}", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
custodians = [(yaml_path, data, website)]
|
||
else:
|
||
# Batch mode
|
||
limit = args.test if args.test else None
|
||
prefix = args.province if args.province else None
|
||
|
||
print(f"Finding custodians with websites...")
|
||
skip_existing = getattr(args, 'skip_existing', False)
|
||
custodians = find_custodians_with_websites(prefix=prefix, limit=limit, skip_existing=skip_existing)
|
||
if skip_existing:
|
||
print(f" (skipping custodians with existing mission statements)")
|
||
|
||
print(f"Found {len(custodians)} custodians with websites")
|
||
|
||
if args.dry_run:
|
||
print("\n[DRY RUN MODE - No changes will be made]\n")
|
||
|
||
# Process custodians
|
||
results = []
|
||
semaphore = asyncio.Semaphore(args.concurrency)
|
||
|
||
async def process_with_semaphore(custodian_tuple):
|
||
async with semaphore:
|
||
yaml_path, data, website = custodian_tuple
|
||
return await process_custodian(
|
||
reader, yaml_path, data, website,
|
||
dry_run=args.dry_run, verbose=args.verbose,
|
||
llm_extractor=llm_extractor
|
||
)
|
||
|
||
# Process in batches
|
||
tasks = [process_with_semaphore(c) for c in custodians]
|
||
|
||
print(f"\nProcessing {len(tasks)} custodians...")
|
||
|
||
for i, coro in enumerate(asyncio.as_completed(tasks), 1):
|
||
result = await coro
|
||
results.append(result)
|
||
|
||
if not args.verbose:
|
||
# Progress indicator
|
||
if result['statements_added'] > 0:
|
||
print(f"[{i}/{len(tasks)}] {result['ghcid']}: Added {result['statements_added']} statements")
|
||
elif i % 10 == 0:
|
||
print(f"[{i}/{len(tasks)}] Processing...")
|
||
|
||
# Summary statistics
|
||
print("\n" + "="*60)
|
||
print("SUMMARY")
|
||
print("="*60)
|
||
|
||
total_checked = sum(r['pages_checked'] for r in results)
|
||
total_with_content = sum(r['pages_with_content'] for r in results)
|
||
total_found = sum(r['statements_found'] for r in results)
|
||
total_added = sum(r['statements_added'] for r in results)
|
||
total_errors = sum(len(r['errors']) for r in results)
|
||
custodians_with_statements = sum(1 for r in results if r['statements_added'] > 0)
|
||
|
||
print(f"Custodians processed: {len(results)}")
|
||
print(f"Pages checked: {total_checked}")
|
||
print(f"Pages with content: {total_with_content}")
|
||
print(f"Statements found: {total_found}")
|
||
print(f"Statements added: {total_added}")
|
||
print(f"Custodians updated: {custodians_with_statements}")
|
||
print(f"Errors encountered: {total_errors}")
|
||
|
||
# Show web reader statistics (if using CompositeWebReader)
|
||
if hasattr(reader, 'get_stats'):
|
||
stats = reader.get_stats()
|
||
print(f"\nWeb Reader Statistics:")
|
||
print(f" Total requests: {stats['total_requests']}")
|
||
print(f" Linkup success: {stats['linkup_success']} ({stats.get('linkup_rate', 'N/A')})")
|
||
print(f" Linkup failures (triggering fallback): {stats['linkup_fail']}")
|
||
print(f" Z.AI fallback success: {stats['zai_fallback_success']}")
|
||
print(f" Z.AI fallback failures: {stats['zai_fallback_fail']}")
|
||
print(f" Overall success rate: {stats.get('overall_success_rate', 'N/A')}")
|
||
|
||
# Show custodians that got statements
|
||
if custodians_with_statements > 0:
|
||
print(f"\nCustodians with new mission statements:")
|
||
for r in results:
|
||
if r['statements_added'] > 0:
|
||
print(f" - {r['ghcid']}: {r['name']} ({r['statements_added']} statements)")
|
||
|
||
|
||
if __name__ == '__main__':
    # Entry point: drive the whole batch run on the asyncio event loop.
    asyncio.run(main())