#!/usr/bin/env python3
"""
Batch extract mission statements from heritage custodian websites.

This script:
1. Finds Dutch custodians with websites
2. Discovers mission/vision/about pages
3. Uses Linkup API (primary) or Z.AI Web Reader (fallback) to fetch content
4. Creates LinkML-compliant mission_statement entries with full provenance
5. Updates custodian YAML files with extracted statements

Usage:
    python scripts/batch_extract_mission_statements.py --test 5              # Test with 5 custodians
    python scripts/batch_extract_mission_statements.py --province NL-NH      # Noord-Holland only
    python scripts/batch_extract_mission_statements.py --all                 # All Dutch custodians
    python scripts/batch_extract_mission_statements.py --ghcid NL-ZH-ZUI-M-LMT  # Single custodian

Requirements:
- httpx (pip install httpx)
- pyyaml
- LINKUP_API_KEY environment variable (primary)
- ZAI_API_TOKEN environment variable (fallback)

API Documentation:
- Linkup: https://docs.linkup.so/
- Z.AI: https://docs.z.ai/devpack/mcp/reader-mcp-server
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import base64
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Any, Union
|
|
from urllib.parse import urljoin, urlparse, quote
|
|
|
|
import httpx
|
|
import yaml
|
|
|
|
# Z.AI GLM chat-completions endpoint used for LLM-based extraction
# (per Rule 11 in AGENTS.md).
ZAI_GLM_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
ZAI_GLM_MODEL = "glm-4.7"  # Latest model with best quality

# Add project root to sys.path so sibling project modules are importable
# when this file is run as a script.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Web-fetching API endpoints: Linkup fetch (primary) and
# Z.AI Web Reader MCP server (fallback).
LINKUP_API_URL = "https://api.linkup.so/v1/fetch"
ZAI_MCP_URL = "https://api.z.ai/api/mcp/web_reader/mcp"
|
|
|
|
# Common mission page URL patterns for Dutch heritage institutions.
# Ordered by likelihood of success (most common patterns first).
DUTCH_MISSION_PATTERNS = [
    "/over-ons",               # Most common Dutch pattern ("About us")
    "/missie",                 # Direct mission page
    "/over",                   # Short version
    "/missie-en-visie",        # Combined mission/vision
    "/organisatie",            # Organization page often has mission
    "/about",                  # English fallback
    "/visie",                  # Vision page
    "/over-ons/missie",        # Nested mission page
    "/onze-missie",            # "Our mission"
    "/over/missie",
    "/organisatie/missie",
    "/het-museum/missie",
    "/het-museum/missie-en-visie",
    "/museum/missie",
    "/about/mission",
    "/wie-zijn-wij",
    "/about-us",
]

# Extended patterns for Dutch museum websites (discovered through testing).
# NOTE(review): "muzeeum" spelling is intentional for at least one site
# (e.g. the Vlissingen muZEEum) — confirm before "fixing" it.
DUTCH_MISSION_EXTENDED_PATTERNS = [
    "/het-muzeeum-organisatie/missie-visie",
    "/het-museum-organisatie/missie-visie",
    "/organisatie/missie-visie",
    "/over-het-museum/missie",
    "/over-het-museum/missie-en-visie",
    "/info/missie",
    "/info/over-ons",
    "/stichting/missie",
    "/museum/over-ons",
    "/museum/organisatie",
]

# Spanish mission page patterns (for Latin America and Spain).
SPANISH_MISSION_PATTERNS = [
    "/sobre-nosotros",         # About us
    "/quienes-somos",          # Who we are
    "/mision",                 # Mission
    "/mision-y-vision",        # Mission and vision
    "/institucional",          # Institutional
    "/historia",               # History often contains mission
    "/el-museo",               # The museum
    "/acerca-de",              # About
    "/nuestra-mision",         # Our mission
    "/conocenos",              # Get to know us
    "/institucion",            # Institution
    "/nosotros",               # Us
    "/about",                  # English fallback
    "/about-us",
]

# Portuguese mission page patterns (for Brazil, Portugal).
PORTUGUESE_MISSION_PATTERNS = [
    "/sobre",                  # About
    "/sobre-nos",              # About us
    "/quem-somos",             # Who we are
    "/missao",                 # Mission
    "/missao-e-visao",         # Mission and vision
    "/institucional",          # Institutional
    "/historia",               # History
    "/o-museu",                # The museum
    "/a-biblioteca",           # The library
    "/conheca",                # Get to know
    "/nossa-missao",           # Our mission
    "/about",                  # English fallback
]

# German mission page patterns.
GERMAN_MISSION_PATTERNS = [
    "/ueber-uns",              # About us
    "/uber-uns",               # Without umlaut transliteration
    "/leitbild",               # Mission statement
    "/mission",                # Mission
    "/das-museum",             # The museum
    "/institution",            # Institution
    "/wir-ueber-uns",          # We about us
    "/about",                  # English fallback
]

# French mission page patterns.
FRENCH_MISSION_PATTERNS = [
    "/a-propos",               # About
    "/qui-sommes-nous",        # Who are we
    "/mission",                # Mission
    "/notre-mission",          # Our mission
    "/le-musee",               # The museum
    "/presentation",           # Presentation
    "/historique",             # Historical
    "/about",                  # English fallback
]

# English mission page patterns (international fallback).
ENGLISH_MISSION_PATTERNS = [
    "/about",
    "/about-us",
    "/mission",
    "/our-mission",
    "/mission-vision",
    "/mission-and-vision",
    "/who-we-are",
    "/the-museum",
    "/the-library",
    "/the-archive",
    "/history",
    "/institutional",
]

# Combined patterns - use all languages for maximum coverage.
# NOTE: concatenation preserves per-language ordering and may contain
# duplicates (e.g. "/about" appears in several lists); callers should
# de-duplicate if the extra probes matter.
ALL_MISSION_PATTERNS = (
    DUTCH_MISSION_PATTERNS +
    SPANISH_MISSION_PATTERNS +
    PORTUGUESE_MISSION_PATTERNS +
    GERMAN_MISSION_PATTERNS +
    FRENCH_MISSION_PATTERNS +
    ENGLISH_MISSION_PATTERNS
)
|
|
|
|
# Keywords indicating mission/vision content (multilingual: Dutch, English,
# Spanish, Portuguese, German). Grouped by the kind of statement they signal.
MISSION_KEYWORDS = {
    'mission': ['missie', 'mission', 'opdracht', 'kerntaak', 'misión', 'missão', 'leitbild'],
    'vision': ['visie', 'vision', 'toekomst', 'ambitie', 'visión', 'visão'],
    'goal': ['doelstelling', 'doel', 'doelen', 'goal', 'objective', 'objectives', 'ambitie',
             'objetivo', 'objetivos', 'ziel', 'ziele'],
    'value': ['waarde', 'waarden', 'kernwaarden', 'value', 'values', 'principle',
              'valor', 'valores', 'wert', 'werte'],
    'motto': ['motto', 'slogan', 'slagzin', 'lema'],
}
|
|
|
|
# ISO 3166-1 alpha-2 country code -> ISO 639-1 language code mapping.
# Maps each country to ONE primary/official language used when selecting
# extraction prompts; multilingual countries (e.g. BE, CH, CA) are reduced
# to a single choice by design.
COUNTRY_TO_LANGUAGE = {
    # Dutch-speaking
    'NL': 'nl', 'BE': 'nl', 'SR': 'nl', 'AW': 'nl', 'CW': 'nl', 'SX': 'nl',
    # Spanish-speaking
    'AR': 'es', 'BO': 'es', 'CL': 'es', 'CO': 'es', 'CR': 'es', 'CU': 'es',
    'DO': 'es', 'EC': 'es', 'SV': 'es', 'GT': 'es', 'HN': 'es', 'MX': 'es',
    'NI': 'es', 'PA': 'es', 'PY': 'es', 'PE': 'es', 'PR': 'es', 'ES': 'es',
    'UY': 'es', 'VE': 'es', 'GQ': 'es',
    # Portuguese-speaking
    'BR': 'pt', 'PT': 'pt', 'AO': 'pt', 'MZ': 'pt', 'CV': 'pt', 'GW': 'pt',
    'ST': 'pt', 'TL': 'pt',
    # German-speaking
    'DE': 'de', 'AT': 'de', 'CH': 'de', 'LI': 'de', 'LU': 'de',
    # French-speaking
    'FR': 'fr', 'MC': 'fr', 'SN': 'fr', 'CI': 'fr', 'ML': 'fr', 'BF': 'fr',
    'NE': 'fr', 'TG': 'fr', 'BJ': 'fr', 'GA': 'fr', 'CG': 'fr', 'CD': 'fr',
    'MG': 'fr', 'HT': 'fr', 'RE': 'fr', 'MQ': 'fr', 'GP': 'fr', 'GF': 'fr',
    'NC': 'fr', 'PF': 'fr',
    # Italian-speaking
    'IT': 'it', 'SM': 'it', 'VA': 'it',
    # English-speaking (default)
    'US': 'en', 'GB': 'en', 'AU': 'en', 'NZ': 'en', 'CA': 'en', 'IE': 'en',
    'ZA': 'en', 'JM': 'en', 'TT': 'en', 'BB': 'en', 'GH': 'en', 'NG': 'en',
    'KE': 'en', 'UG': 'en', 'TZ': 'en', 'ZW': 'en', 'BW': 'en', 'MW': 'en',
    'ZM': 'en', 'PH': 'en', 'SG': 'en', 'MY': 'en', 'IN': 'en', 'PK': 'en',
    # Japanese
    'JP': 'ja',
    # Chinese
    'CN': 'zh', 'TW': 'zh', 'HK': 'zh', 'MO': 'zh',
    # Korean
    'KR': 'ko', 'KP': 'ko',
    # Russian
    'RU': 'ru', 'BY': 'ru', 'KZ': 'ru', 'KG': 'ru', 'TJ': 'ru',
    # Arabic
    'SA': 'ar', 'AE': 'ar', 'QA': 'ar', 'KW': 'ar', 'BH': 'ar', 'OM': 'ar',
    'YE': 'ar', 'JO': 'ar', 'SY': 'ar', 'LB': 'ar', 'IQ': 'ar', 'EG': 'ar',
    'LY': 'ar', 'TN': 'ar', 'DZ': 'ar', 'MA': 'ar', 'SD': 'ar', 'MR': 'ar',
    # Other
    'CZ': 'cs', 'SK': 'sk', 'PL': 'pl', 'HU': 'hu', 'RO': 'ro', 'BG': 'bg',
    'HR': 'hr', 'RS': 'sr', 'SI': 'sl', 'GR': 'el', 'TR': 'tr', 'IL': 'he',
    'TH': 'th', 'VN': 'vi', 'ID': 'id', 'SE': 'sv', 'NO': 'no', 'DK': 'da',
    'FI': 'fi', 'IS': 'is', 'EE': 'et', 'LV': 'lv', 'LT': 'lt', 'UA': 'uk',
}
|
|
|
|
|
|
def get_language_from_ghcid(ghcid: str) -> str:
    """Map a GHCID's two-letter country prefix to an ISO 639-1 language code.

    Args:
        ghcid: GHCID string (e.g. "AR-C-BUE-M-MAD").

    Returns:
        ISO 639-1 language code (e.g. "es" for Argentina); "en" when the
        GHCID is empty, too short, or has an unknown country prefix.
    """
    # Guard against empty/short identifiers before slicing the prefix.
    if not ghcid or len(ghcid) < 2:
        return 'en'
    return COUNTRY_TO_LANGUAGE.get(ghcid[:2].upper(), 'en')
|
|
|
|
|
|
def compute_content_hash(text: str) -> str:
|
|
"""Compute SHA-256 hash of text in SRI format."""
|
|
sha256_hash = hashlib.sha256(text.encode('utf-8')).digest()
|
|
b64_hash = base64.b64encode(sha256_hash).decode('ascii')
|
|
return f"sha256-{b64_hash}"
|
|
|
|
|
|
def get_api_tokens() -> dict:
    """Collect web-fetching API tokens from the environment or the .env file.

    Environment variables take precedence; the project-root ``.env`` file is
    only consulted for a token that is not already set.

    Returns:
        dict with 'linkup' and/or 'zai' keys containing API tokens.

    Raises:
        ValueError: when neither LINKUP_API_KEY nor ZAI_API_TOKEN is found.
    """
    linkup = os.environ.get('LINKUP_API_KEY')
    zai = os.environ.get('ZAI_API_TOKEN')

    # Fall back to the project .env file for any token still missing.
    env_file = PROJECT_ROOT / '.env'
    if env_file.exists():
        with open(env_file, 'r') as handle:
            for raw_line in handle:
                entry = raw_line.strip()
                if entry.startswith('LINKUP_API_KEY=') and not linkup:
                    linkup = entry.split('=', 1)[1].strip().strip('"\'')
                elif entry.startswith('ZAI_API_TOKEN=') and not zai:
                    zai = entry.split('=', 1)[1].strip().strip('"\'')

    tokens = {key: value for key, value in (('linkup', linkup), ('zai', zai)) if value}

    if not tokens:
        raise ValueError(
            "No API tokens found. Set LINKUP_API_KEY or ZAI_API_TOKEN environment variable."
        )

    return tokens
|
|
|
|
|
|
class LinkupWebReader:
    """Client for the Linkup API - simple and reliable web fetching.

    Reference: https://docs.linkup.so/
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """Fetch a webpage through the Linkup API.

        Returns:
            dict with keys: content, success, error, url, retrieved_on
            (error responses carry success/url/error only).
        """

        def failure(reason: str) -> dict:
            # All error paths share the same response shape.
            return {"success": False, "url": url, "error": reason}

        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                response = await client.post(
                    LINKUP_API_URL, headers=self.headers, json={"url": url}
                )
                if response.status_code != 200:
                    return failure(f"HTTP {response.status_code}: {response.text[:200]}")

                payload = response.json()
                # Linkup returns markdown content directly; fall back to a
                # plain "content" field if "markdown" is absent.
                page_text = payload.get("markdown", payload.get("content", ""))
                if not page_text:
                    return failure("No content returned")

                return {
                    "success": True,
                    "url": url,
                    "content": page_text,
                    "retrieved_on": datetime.now(timezone.utc).isoformat(),
                }
            except httpx.TimeoutException:
                return failure("Request timed out")
            except Exception as exc:
                return failure(str(exc))
|
|
|
|
|
|
class ZAIWebReader:
    """
    Client for Z.AI Web Reader MCP API using Streamable HTTP transport.

    The MCP protocol requires:
    1. Initialize session
    2. Send notifications/initialized
    3. Call tools

    Reference: https://docs.z.ai/devpack/mcp/reader-mcp-server
    """

    def __init__(self, api_token: str):
        self.api_token = api_token
        # MCP session id; captured from the "mcp-session-id" response header
        # by _send_request and echoed back on subsequent requests.
        self.session_id = None
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream",  # Required for MCP Streamable HTTP
        }

    def _parse_sse_response(self, text: str) -> dict:
        """Parse Server-Sent Events (SSE) response format from MCP API.

        SSE format:
            id:1
            event:message
            data:{"jsonrpc":"2.0",...}

        Returns the parsed JSON data from the 'data' field.  When several
        "data:" lines are present, the LAST parsable one wins; unparsable
        data lines are skipped silently.  Returns {} when no JSON payload
        could be extracted.
        """
        result = {}
        for line in text.strip().split('\n'):
            if line.startswith('data:'):
                data_content = line[5:].strip()
                if data_content:
                    try:
                        result = json.loads(data_content)
                    except json.JSONDecodeError:
                        # Deliberate best-effort: ignore non-JSON data lines.
                        pass
        return result

    async def _send_request(self, client: httpx.AsyncClient, method: str, params: Optional[dict] = None, request_id: int = 1) -> dict:
        """Send a JSON-RPC request to the MCP server and parse SSE response.

        Returns dict with keys:
        - success: bool
        - status_code: int
        - data: parsed JSON-RPC result (if success)
        - error: error message (if not success)
        """
        request_body = {
            "jsonrpc": "2.0",
            "method": method,
            "id": request_id
        }
        if params:
            request_body["params"] = params

        # Add session header if we have one (required after initialize).
        headers = self.headers.copy()
        if self.session_id:
            headers["mcp-session-id"] = self.session_id

        response = await client.post(ZAI_MCP_URL, headers=headers, json=request_body)

        # Capture/refresh the session ID from response headers as a side
        # effect, so later calls on this reader reuse the same session.
        if "mcp-session-id" in response.headers:
            self.session_id = response.headers["mcp-session-id"]

        if response.status_code != 200:
            return {
                "success": False,
                "status_code": response.status_code,
                "error": f"HTTP {response.status_code}: {response.text[:200]}"
            }

        # Parse the SSE body into the JSON-RPC payload.
        parsed = self._parse_sse_response(response.text)
        if not parsed:
            return {
                "success": False,
                "status_code": response.status_code,
                "error": f"Failed to parse SSE response: {response.text[:200]}"
            }

        return {
            "success": True,
            "status_code": response.status_code,
            "data": parsed
        }

    async def initialize(self, client: httpx.AsyncClient) -> bool:
        """Initialize MCP session; returns True on success, False otherwise."""
        try:
            response = await self._send_request(
                client,
                "initialize",
                {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {
                        "name": "glam-mission-extractor",
                        "version": "1.0.0"
                    }
                },
                request_id=1
            )

            if response.get("success"):
                # Step 2 of the MCP handshake: acknowledge initialization.
                # The notification's own result is intentionally ignored.
                await self._send_request(client, "notifications/initialized", {}, request_id=2)
                return True
            return False
        except Exception as e:
            print(f"Initialize error: {e}", file=sys.stderr)
            return False

    async def read_webpage(self, url: str, timeout: float = 30.0) -> dict:
        """
        Read webpage content using Z.AI Web Reader.

        Returns:
            dict with keys: title, content, metadata, links, success, error
            (exact keys vary by response shape; error responses carry
            success/url/error only).
        """
        async with httpx.AsyncClient(timeout=timeout) as client:
            try:
                # Initialize session on first use of this reader instance.
                # NOTE(review): a failed initialize() is not checked here;
                # the subsequent tools/call will surface the error instead.
                # Also assumes the server accepts the cached session id on a
                # fresh HTTP client — confirm against the MCP server docs.
                if not self.session_id:
                    await self.initialize(client)

                # Call the webReader tool.
                response = await self._send_request(
                    client,
                    "tools/call",
                    {
                        "name": "webReader",
                        "arguments": {
                            "url": url
                        }
                    },
                    request_id=3
                )

                if not response.get("success"):
                    return {
                        "success": False,
                        "url": url,
                        "error": response.get("error", "Unknown error"),
                    }

                result = response.get("data", {})

                # Parse the JSON-RPC envelope.
                if "result" in result:
                    content_data = result["result"]

                    # Extract content from MCP response format.
                    if isinstance(content_data, dict):
                        # Check for content array (MCP tools/call response format):
                        # join all "text"-typed blocks into a single string.
                        if "content" in content_data and isinstance(content_data["content"], list):
                            text_parts = []
                            for item in content_data["content"]:
                                if isinstance(item, dict) and item.get("type") == "text":
                                    text_parts.append(item.get("text", ""))
                            content_text = "\n".join(text_parts)
                        else:
                            # Flat dict: prefer "content", fall back to "text".
                            content_text = content_data.get("content", content_data.get("text", ""))

                        return {
                            "success": True,
                            "url": url,
                            "title": content_data.get("title", ""),
                            "content": content_text,
                            "metadata": content_data.get("metadata", {}),
                            "links": content_data.get("links", []),
                            "retrieved_on": datetime.now(timezone.utc).isoformat(),
                        }
                    elif isinstance(content_data, list) and len(content_data) > 0:
                        # Array of content blocks: accept dicts with "type": "text",
                        # dicts with a bare "text" key, or plain strings.
                        text_content = ""
                        for block in content_data:
                            if isinstance(block, dict):
                                if block.get("type") == "text":
                                    text_content += block.get("text", "") + "\n"
                                elif "text" in block:
                                    text_content += block["text"] + "\n"
                            elif isinstance(block, str):
                                text_content += block + "\n"
                        return {
                            "success": True,
                            "url": url,
                            "content": text_content.strip(),
                            "retrieved_on": datetime.now(timezone.utc).isoformat(),
                        }

                # No "result": check for a JSON-RPC error object instead.
                if "error" in result:
                    return {
                        "success": False,
                        "url": url,
                        "error": f"MCP error: {result['error']}",
                    }

                return {
                    "success": False,
                    "url": url,
                    "error": f"Unexpected response format: {str(result)[:200]}",
                }

            except httpx.HTTPStatusError as e:
                return {
                    "success": False,
                    "url": url,
                    "error": f"HTTP {e.response.status_code}: {e.response.text[:200]}",
                }
            except Exception as e:
                return {
                    "success": False,
                    "url": url,
                    "error": str(e),
                }
|
|
|
|
|
|
class GLMMissionExtractor:
|
|
"""
|
|
LLM-based mission statement extractor using Z.AI GLM API.
|
|
|
|
This provides intelligent extraction of mission, vision, and goal statements
|
|
from webpage content, replacing naive keyword matching with semantic understanding.
|
|
|
|
Uses Z.AI Coding Plan endpoint per Rule 11 in AGENTS.md.
|
|
Implements Rule 36: Original Language Preservation - NO TRANSLATION.
|
|
"""
|
|
|
|
# Language-specific prompt templates (Rule 36: Original Language Preservation)
|
|
# Each prompt explicitly instructs to NOT translate and preserve original language
|
|
|
|
EXTRACTION_PROMPT_NL = """Je bent een expert in het analyseren van websites van erfgoedinstellingen (musea, archieven, bibliotheken, etc.).
|
|
|
|
## KRITIEK - NIET VERTALEN:
|
|
Extraheer de tekst EXACT zoals deze op de webpagina staat.
|
|
VERTAAL NIET naar een andere taal. Behoud de originele tekst in de originele taal.
|
|
Als de bron in het Nederlands is, moet de output in het Nederlands zijn.
|
|
Als de bron in het Engels is, moet de output in het Engels zijn (niet vertalen naar Nederlands).
|
|
|
|
Analyseer de volgende webpagina-inhoud en extraheer de missie, visie en/of doelstellingen van de organisatie.
|
|
|
|
## Instructies:
|
|
1. Zoek naar expliciete missie- of visie-statements
|
|
2. Let op zinnen die beginnen met "Onze missie is...", "Wij streven naar...", "Het museum heeft als doel...", etc.
|
|
3. Negeer navigatie-elementen, footer-tekst, contactgegevens, openingstijden
|
|
4. Negeer advertenties, nieuwsberichten, en evenement-aankondigingen
|
|
5. Als er GEEN duidelijke missie/visie/doelstelling te vinden is, retourneer een leeg resultaat
|
|
6. KOPIEER de tekst letterlijk - NIET PARAFRASEREN of VERTALEN
|
|
|
|
## Output Format (JSON):
|
|
Retourneer ALLEEN een JSON object in dit exacte formaat:
|
|
```json
|
|
{{
|
|
"mission": "De originele missie-tekst hier (NIET VERTAALD), of null als niet gevonden",
|
|
"vision": "De originele visie-tekst hier (NIET VERTAALD), of null als niet gevonden",
|
|
"goals": "De originele doelstellingen hier (NIET VERTAALD), of null als niet gevonden",
|
|
"confidence": 0.85,
|
|
"source_section": "Naam van de sectie waar dit gevonden is (bijv. 'Over ons', 'Missie en Visie')",
|
|
"detected_language": "nl"
|
|
}}
|
|
```
|
|
|
|
## Webpagina inhoud:
|
|
{content}
|
|
|
|
## Let op:
|
|
- Retourneer ALLEEN het JSON object, geen andere tekst
|
|
- Confidence moet tussen 0.0 en 1.0 zijn
|
|
- NOOIT VERTALEN - behoud originele taal
|
|
- Als niets gevonden: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
|
"""
|
|
|
|
EXTRACTION_PROMPT_ES = """Eres un experto en analizar sitios web de instituciones patrimoniales (museos, archivos, bibliotecas, etc.).
|
|
|
|
## CRITICO - NO TRADUCIR:
|
|
Extrae el texto EXACTAMENTE como aparece en la pagina web.
|
|
NO TRADUZCAS a otro idioma. Preserva el texto original en su idioma original.
|
|
Si la fuente esta en espanol, la salida debe estar en espanol.
|
|
Si la fuente esta en ingles, la salida debe estar en ingles (no traducir al espanol).
|
|
|
|
Analiza el siguiente contenido de la pagina web y extrae la mision, vision y/o objetivos de la organizacion.
|
|
|
|
## Instrucciones:
|
|
1. Busca declaraciones explicitas de mision o vision
|
|
2. Presta atencion a frases como "Nuestra mision es...", "Tenemos como objetivo...", "El museo busca...", etc.
|
|
3. Ignora elementos de navegacion, texto de pie de pagina, informacion de contacto, horarios
|
|
4. Ignora anuncios, noticias y anuncios de eventos
|
|
5. Si NO hay una mision/vision/objetivo claro, devuelve un resultado vacio
|
|
6. COPIA el texto literalmente - NO PARAFRASEAR ni TRADUCIR
|
|
|
|
## Formato de salida (JSON):
|
|
Devuelve SOLO un objeto JSON en este formato exacto:
|
|
```json
|
|
{{
|
|
"mission": "El texto original de la mision aqui (SIN TRADUCIR), o null si no se encuentra",
|
|
"vision": "El texto original de la vision aqui (SIN TRADUCIR), o null si no se encuentra",
|
|
"goals": "Los objetivos originales aqui (SIN TRADUCIR), o null si no se encuentran",
|
|
"confidence": 0.85,
|
|
"source_section": "Nombre de la seccion donde se encontro (ej. 'Sobre nosotros', 'Mision y Vision')",
|
|
"detected_language": "es"
|
|
}}
|
|
```
|
|
|
|
## Contenido de la pagina web:
|
|
{content}
|
|
|
|
## Nota:
|
|
- Devuelve SOLO el objeto JSON, sin otro texto
|
|
- La confianza debe estar entre 0.0 y 1.0
|
|
- NUNCA TRADUCIR - preservar idioma original
|
|
- Si no se encuentra nada: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
|
"""
|
|
|
|
EXTRACTION_PROMPT_PT = """Voce e um especialista em analisar sites de instituicoes patrimoniais (museus, arquivos, bibliotecas, etc.).
|
|
|
|
## CRITICO - NAO TRADUZIR:
|
|
Extraia o texto EXATAMENTE como aparece na pagina web.
|
|
NAO TRADUZA para outro idioma. Preserve o texto original em seu idioma original.
|
|
Se a fonte esta em portugues, a saida deve estar em portugues.
|
|
Se a fonte esta em ingles, a saida deve estar em ingles (nao traduzir para portugues).
|
|
|
|
Analise o seguinte conteudo da pagina web e extraia a missao, visao e/ou objetivos da organizacao.
|
|
|
|
## Instrucoes:
|
|
1. Procure declaracoes explicitas de missao ou visao
|
|
2. Preste atencao a frases como "Nossa missao e...", "Temos como objetivo...", "O museu busca...", etc.
|
|
3. Ignore elementos de navegacao, texto de rodape, informacoes de contato, horarios
|
|
4. Ignore anuncios, noticias e anuncios de eventos
|
|
5. Se NAO houver uma missao/visao/objetivo claro, retorne um resultado vazio
|
|
6. COPIE o texto literalmente - NAO PARAFRASEAR nem TRADUZIR
|
|
|
|
## Formato de saida (JSON):
|
|
Retorne APENAS um objeto JSON neste formato exato:
|
|
```json
|
|
{{
|
|
"mission": "O texto original da missao aqui (SEM TRADUZIR), ou null se nao encontrado",
|
|
"vision": "O texto original da visao aqui (SEM TRADUZIR), ou null se nao encontrado",
|
|
"goals": "Os objetivos originais aqui (SEM TRADUZIR), ou null se nao encontrados",
|
|
"confidence": 0.85,
|
|
"source_section": "Nome da secao onde foi encontrado (ex. 'Sobre nos', 'Missao e Visao')",
|
|
"detected_language": "pt"
|
|
}}
|
|
```
|
|
|
|
## Conteudo da pagina web:
|
|
{content}
|
|
|
|
## Nota:
|
|
- Retorne APENAS o objeto JSON, sem outro texto
|
|
- A confianca deve estar entre 0.0 e 1.0
|
|
- NUNCA TRADUZIR - preservar idioma original
|
|
- Se nada encontrado: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
|
"""
|
|
|
|
EXTRACTION_PROMPT_DE = """Sie sind ein Experte fur die Analyse von Websites von Kulturerbe-Institutionen (Museen, Archive, Bibliotheken, etc.).
|
|
|
|
## KRITISCH - NICHT UBERSETZEN:
|
|
Extrahieren Sie den Text GENAU so, wie er auf der Webseite erscheint.
|
|
NICHT in eine andere Sprache UBERSETZEN. Bewahren Sie den Originaltext in seiner Originalsprache.
|
|
Wenn die Quelle auf Deutsch ist, muss die Ausgabe auf Deutsch sein.
|
|
Wenn die Quelle auf Englisch ist, muss die Ausgabe auf Englisch sein (nicht ins Deutsche ubersetzen).
|
|
|
|
Analysieren Sie den folgenden Webseiteninhalt und extrahieren Sie die Mission, Vision und/oder Ziele der Organisation.
|
|
|
|
## Anweisungen:
|
|
1. Suchen Sie nach expliziten Missions- oder Visionserklarungen
|
|
2. Achten Sie auf Satze wie "Unsere Mission ist...", "Wir streben an...", "Das Museum hat zum Ziel...", etc.
|
|
3. Ignorieren Sie Navigationselemente, Fusszeilen, Kontaktdaten, Offnungszeiten
|
|
4. Ignorieren Sie Werbung, Nachrichten und Veranstaltungsankundigungen
|
|
5. Wenn KEINE klare Mission/Vision/Ziel zu finden ist, geben Sie ein leeres Ergebnis zuruck
|
|
6. KOPIEREN Sie den Text wortlich - NICHT PARAPHRASIEREN oder UBERSETZEN
|
|
|
|
## Ausgabeformat (JSON):
|
|
Geben Sie NUR ein JSON-Objekt in diesem genauen Format zuruck:
|
|
```json
|
|
{{
|
|
"mission": "Der originale Missionstext hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
|
|
"vision": "Der originale Visionstext hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
|
|
"goals": "Die originalen Ziele hier (NICHT UBERSETZT), oder null wenn nicht gefunden",
|
|
"confidence": 0.85,
|
|
"source_section": "Name des Abschnitts, in dem dies gefunden wurde (z.B. 'Uber uns', 'Mission und Vision')",
|
|
"detected_language": "de"
|
|
}}
|
|
```
|
|
|
|
## Webseiteninhalt:
|
|
{content}
|
|
|
|
## Hinweis:
|
|
- Geben Sie NUR das JSON-Objekt zuruck, keinen anderen Text
|
|
- Confidence muss zwischen 0.0 und 1.0 liegen
|
|
- NIEMALS UBERSETZEN - Originalsprache bewahren
|
|
- Wenn nichts gefunden: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
|
"""
|
|
|
|
EXTRACTION_PROMPT_FR = """Vous etes un expert dans l'analyse des sites web d'institutions patrimoniales (musees, archives, bibliotheques, etc.).
|
|
|
|
## CRITIQUE - NE PAS TRADUIRE:
|
|
Extrayez le texte EXACTEMENT tel qu'il apparait sur la page web.
|
|
NE TRADUISEZ PAS dans une autre langue. Preservez le texte original dans sa langue originale.
|
|
Si la source est en francais, la sortie doit etre en francais.
|
|
Si la source est en anglais, la sortie doit etre en anglais (ne pas traduire en francais).
|
|
|
|
Analysez le contenu de la page web suivante et extrayez la mission, la vision et/ou les objectifs de l'organisation.
|
|
|
|
## Instructions:
|
|
1. Recherchez des declarations explicites de mission ou de vision
|
|
2. Faites attention aux phrases comme "Notre mission est...", "Nous visons a...", "Le musee a pour but...", etc.
|
|
3. Ignorez les elements de navigation, le texte de pied de page, les coordonnees, les horaires
|
|
4. Ignorez les publicites, les actualites et les annonces d'evenements
|
|
5. S'il n'y a PAS de mission/vision/objectif clair, retournez un resultat vide
|
|
6. COPIEZ le texte litteralement - NE PAS PARAPHRASER ni TRADUIRE
|
|
|
|
## Format de sortie (JSON):
|
|
Retournez UNIQUEMENT un objet JSON dans ce format exact:
|
|
```json
|
|
{{
|
|
"mission": "Le texte original de la mission ici (NON TRADUIT), ou null si non trouve",
|
|
"vision": "Le texte original de la vision ici (NON TRADUIT), ou null si non trouve",
|
|
"goals": "Les objectifs originaux ici (NON TRADUITS), ou null si non trouves",
|
|
"confidence": 0.85,
|
|
"source_section": "Nom de la section ou cela a ete trouve (ex. 'A propos', 'Mission et Vision')",
|
|
"detected_language": "fr"
|
|
}}
|
|
```
|
|
|
|
## Contenu de la page web:
|
|
{content}
|
|
|
|
## Note:
|
|
- Retournez UNIQUEMENT l'objet JSON, pas d'autre texte
|
|
- La confiance doit etre entre 0.0 et 1.0
|
|
- JAMAIS TRADUIRE - preserver la langue originale
|
|
- Si rien trouve: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
|
"""
|
|
|
|
EXTRACTION_PROMPT_EN = """You are an expert in analyzing heritage institution websites (museums, archives, libraries, etc.).
|
|
|
|
## CRITICAL - DO NOT TRANSLATE:
|
|
Extract the text EXACTLY as it appears on the webpage.
|
|
DO NOT TRANSLATE to another language. Preserve the original text in its original language.
|
|
If the source is in English, the output must be in English.
|
|
If the source is in Dutch, the output must be in Dutch (do not translate to English).
|
|
If the source is in Spanish, the output must be in Spanish (do not translate to English).
|
|
If the source is in any other language, preserve that language.
|
|
|
|
Analyze the following webpage content and extract the mission, vision and/or goals of the organization.
|
|
|
|
## Instructions:
|
|
1. Look for explicit mission or vision statements
|
|
2. Pay attention to phrases like "Our mission is...", "We aim to...", "The museum seeks to...", etc.
|
|
3. Ignore navigation elements, footer text, contact information, opening hours
|
|
4. Ignore advertisements, news, and event announcements
|
|
5. If there is NO clear mission/vision/goal, return an empty result
|
|
6. COPY the text verbatim - DO NOT PARAPHRASE or TRANSLATE
|
|
|
|
## Output Format (JSON):
|
|
Return ONLY a JSON object in this exact format:
|
|
```json
|
|
{{
|
|
"mission": "The original mission text here (NOT TRANSLATED), or null if not found",
|
|
"vision": "The original vision text here (NOT TRANSLATED), or null if not found",
|
|
"goals": "The original goals here (NOT TRANSLATED), or null if not found",
|
|
"confidence": 0.85,
|
|
"source_section": "Name of the section where this was found (e.g., 'About us', 'Mission and Vision')",
|
|
"detected_language": "en"
|
|
}}
|
|
```
|
|
|
|
## Webpage content:
|
|
{content}
|
|
|
|
## Note:
|
|
- Return ONLY the JSON object, no other text
|
|
- Confidence must be between 0.0 and 1.0
|
|
- NEVER TRANSLATE - preserve original language
|
|
- If nothing found: {{"mission": null, "vision": null, "goals": null, "confidence": 0.0, "source_section": null, "detected_language": null}}
|
|
"""
|
|
|
|
# Map language codes to prompts
|
|
LANGUAGE_PROMPTS = {
|
|
'nl': EXTRACTION_PROMPT_NL,
|
|
'es': EXTRACTION_PROMPT_ES,
|
|
'pt': EXTRACTION_PROMPT_PT,
|
|
'de': EXTRACTION_PROMPT_DE,
|
|
'fr': EXTRACTION_PROMPT_FR,
|
|
'en': EXTRACTION_PROMPT_EN,
|
|
}
|
|
|
|
# Default prompt for languages without specific template
|
|
EXTRACTION_PROMPT = EXTRACTION_PROMPT_EN # Fallback to English prompt
|
|
|
|
def __init__(self, api_token: str, model: str = ZAI_GLM_MODEL):
|
|
self.api_token = api_token
|
|
self.model = model
|
|
self.headers = {
|
|
"Authorization": f"Bearer {api_token}",
|
|
"Content-Type": "application/json",
|
|
}
|
|
|
|
# Language-specific system messages (Rule 36: preserve original language)
|
|
SYSTEM_MESSAGES = {
|
|
'nl': "Je bent een assistent die JSON-gestructureerde data extraheert uit webpagina's. Antwoord ALLEEN met valid JSON. KRITIEK: Vertaal NOOIT de geëxtraheerde tekst - behoud de originele taal.",
|
|
'es': "Eres un asistente que extrae datos estructurados en JSON de paginas web. Responde SOLO con JSON valido. CRITICO: NUNCA traduzcas el texto extraido - preserva el idioma original.",
|
|
'pt': "Voce e um assistente que extrai dados estruturados em JSON de paginas web. Responda APENAS com JSON valido. CRITICO: NUNCA traduza o texto extraido - preserve o idioma original.",
|
|
'de': "Sie sind ein Assistent, der JSON-strukturierte Daten aus Webseiten extrahiert. Antworten Sie NUR mit validem JSON. KRITISCH: Übersetzen Sie NIEMALS den extrahierten Text - bewahren Sie die Originalsprache.",
|
|
'fr': "Vous etes un assistant qui extrait des donnees structurees JSON des pages web. Repondez UNIQUEMENT avec du JSON valide. CRITIQUE: Ne traduisez JAMAIS le texte extrait - preservez la langue originale.",
|
|
'en': "You are an assistant that extracts JSON-structured data from webpages. Respond ONLY with valid JSON. CRITICAL: NEVER translate the extracted text - preserve the original language.",
|
|
}
|
|
|
|
async def extract_mission_from_content(
    self,
    content: str,
    source_url: str,
    language: str = 'en',
    timeout: float = 60.0
) -> dict:
    """
    Use LLM to extract mission statement from webpage content.

    Implements Rule 36: Original Language Preservation - NO TRANSLATION.

    Args:
        content: The webpage text content (markdown or plain text)
        source_url: URL of the source page (for context)
        language: ISO 639-1 language code (e.g., 'nl', 'es', 'de') for prompt selection
        timeout: Request timeout in seconds

    Returns:
        dict with keys: success, mission, vision, goals, confidence,
        source_section, detected_language, model (on success); on failure
        success=False plus an `error` message (and `raw_response` when the
        model output was not parseable JSON).
    """
    # Truncate content if too long (GLM has context limits)
    max_chars = 12000
    if len(content) > max_chars:
        content = content[:max_chars] + "\n\n[... content truncated ...]"

    # Select language-appropriate prompt (Rule 36: Original Language Preservation)
    prompt_template = self.LANGUAGE_PROMPTS.get(language, self.EXTRACTION_PROMPT)
    prompt = prompt_template.format(content=content)

    # Select language-appropriate system message
    system_message = self.SYSTEM_MESSAGES.get(language, self.SYSTEM_MESSAGES['en'])

    request_body = {
        "model": self.model,
        "messages": [
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.1,  # Low temperature for consistent extraction
        "max_tokens": 2048,
    }

    async with httpx.AsyncClient(timeout=timeout) as client:
        try:
            response = await client.post(
                ZAI_GLM_API_URL,
                headers=self.headers,
                json=request_body
            )

            if response.status_code != 200:
                return {
                    "success": False,
                    "error": f"API error {response.status_code}: {response.text[:200]}",
                }

            result = response.json()

            # Extract the assistant's response
            if "choices" not in result or len(result["choices"]) == 0:
                return {
                    "success": False,
                    "error": "No response from API",
                }

            assistant_message = result["choices"][0]["message"]["content"]

            # Parse JSON from response.
            # Handle markdown code blocks (```json ... ```) if present.
            json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', assistant_message)
            if json_match:
                json_str = json_match.group(1)
            else:
                json_str = assistant_message.strip()

            try:
                extracted = json.loads(json_str)
            except json.JSONDecodeError as e:
                return {
                    "success": False,
                    "error": f"Failed to parse JSON response: {e}",
                    "raw_response": assistant_message[:500],
                }

            # Validate and return
            return {
                "success": True,
                "mission": extracted.get("mission"),
                "vision": extracted.get("vision"),
                "goals": extracted.get("goals"),
                "confidence": extracted.get("confidence", 0.0),
                "source_section": extracted.get("source_section"),
                # BUGFIX: previously dropped. The extraction prompt asks the
                # model for `detected_language` and callers (e.g.
                # extract_statements_with_llm) read it from this dict.
                "detected_language": extracted.get("detected_language"),
                "model": self.model,
            }

        except httpx.TimeoutException:
            return {
                "success": False,
                "error": "Request timed out",
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
            }
|
|
|
|
|
|
def _extract_website_from_data(data: dict) -> Optional[str]:
    """
    Return the best website URL from a custodian record, or None.

    Checks fields in priority order:
      1. top-level `website`
      2. `original_entry.webadres_organisatie`
      3. `museum_register_enrichment.website_url`
      4. `wikidata_enrichment.official_website`
      5. `google_maps_enrichment.website`
      6. `location.website`
      7. `original_entry.identifiers[*]` with scheme 'Website'
      8. top-level `identifiers[*]` with scheme 'Website'
    """
    # 1. Direct website field
    if data.get('website'):
        return data['website']

    # 2. Original entry webadres_organisatie
    oe = data.get('original_entry')
    if isinstance(oe, dict) and oe.get('webadres_organisatie'):
        return oe['webadres_organisatie']

    # 3. Museum register enrichment website_url
    mre = data.get('museum_register_enrichment')
    if isinstance(mre, dict) and mre.get('website_url'):
        return mre['website_url']

    # 4. Wikidata enrichment official_website
    we = data.get('wikidata_enrichment')
    if isinstance(we, dict) and we.get('official_website'):
        return we['official_website']

    # 5. Google Maps enrichment website
    gm = data.get('google_maps_enrichment')
    if isinstance(gm, dict) and gm.get('website'):
        return gm['website']

    # 6. Location object website
    loc = data.get('location')
    if isinstance(loc, dict) and loc.get('website'):
        return loc['website']

    # 7. Original entry identifiers (Website scheme)
    if isinstance(oe, dict):
        for ident in oe.get('identifiers', []):
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
                website = ident.get('identifier_value') or ident.get('identifier_url')
                if website:
                    return website

    # 8. Top-level identifiers array (Website scheme)
    for ident in data.get('identifiers', []):
        if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
            website = ident.get('identifier_value') or ident.get('identifier_url')
            if website:
                return website

    return None


def find_custodians_with_websites(
    prefix: Optional[str] = None,
    limit: Optional[int] = None
) -> list[tuple[Path, dict, str]]:
    """
    Find custodian YAML files that have website URLs.

    Args:
        prefix: Filter by GHCID prefix (e.g., "NL-NH" for Noord-Holland)
        limit: Maximum number of custodians to return

    Returns:
        List of (path, custodian_data, website_url) tuples
    """
    custodian_dir = PROJECT_ROOT / "data" / "custodian"
    results = []

    pattern = f"{prefix}*.yaml" if prefix else "NL-*.yaml"

    # Sort so that --test/--limit runs are deterministic (glob order is
    # filesystem-dependent).
    for yaml_path in sorted(custodian_dir.glob(pattern)):
        if limit and len(results) >= limit:
            break

        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if not data:
                continue

            # Extract website URL from various possible locations (priority order)
            website = _extract_website_from_data(data)

            # Only accept absolute http(s) URLs
            if website and website.startswith('http'):
                results.append((yaml_path, data, website))

        except Exception as e:
            print(f"Warning: Failed to parse {yaml_path}: {e}", file=sys.stderr)

    return results
|
|
|
|
|
|
def discover_mission_page_urls(base_url: str) -> list[str]:
    """
    Generate candidate URLs for mission/vision pages.

    Args:
        base_url: The custodian's main website URL

    Returns:
        Deduplicated list of URLs to check for mission content, in pattern
        order, with the homepage itself appended last.
    """
    # Normalize base URL - prefer https over plain http
    parsed = urlparse(base_url)
    scheme = 'https' if parsed.scheme == 'http' else parsed.scheme
    base = f"{scheme}://{parsed.netloc}"

    # Use ALL_MISSION_PATTERNS for multilingual support.
    # Deduplicate while preserving order: different patterns can resolve to
    # the same URL after urljoin, and each candidate costs a network fetch.
    candidates: list[str] = []
    seen: set[str] = set()
    for pattern in ALL_MISSION_PATTERNS:
        url = urljoin(base, pattern)
        if url not in seen:
            seen.add(url)
            candidates.append(url)

    # Also add the homepage as it might contain mission info
    if base_url not in seen:
        candidates.append(base_url)

    return candidates
|
|
|
|
|
|
# Keywords to look for in links when discovering mission pages (multilingual)
|
|
# Keywords to look for in links when discovering mission pages (multilingual).
# These are matched as lowercase substrings of URL paths (see
# filter_mission_links), so short entries like 'over' or 'sobre' also match
# longer path segments containing them.
MISSION_LINK_KEYWORDS = [
    # Dutch
    'missie', 'visie', 'over-ons', 'over', 'organisatie', 'doelstelling',
    'wie-zijn-wij', 'wie-we-zijn', 'onze-missie', 'het-museum', 'het-archief',
    'de-bibliotheek', 'stichting', 'vereniging', 'kernwaarden', 'ambitie',
    # Spanish
    'mision', 'vision', 'sobre-nosotros', 'quienes-somos', 'institucional',
    'historia', 'el-museo', 'la-biblioteca', 'el-archivo', 'acerca-de',
    'nuestra-mision', 'conocenos', 'nosotros',
    # Portuguese
    'missao', 'visao', 'sobre', 'sobre-nos', 'quem-somos', 'o-museu',
    'a-biblioteca', 'o-arquivo', 'nossa-missao', 'conheca',
    # German
    'leitbild', 'ueber-uns', 'uber-uns', 'das-museum', 'wir-ueber-uns',
    # French
    'a-propos', 'qui-sommes-nous', 'notre-mission', 'le-musee', 'presentation',
    # English
    'about', 'about-us', 'mission', 'vision', 'organization', 'who-we-are',
]
|
|
|
|
|
|
def extract_links_from_markdown(content: str, base_url: str) -> list[str]:
    """
    Extract all links from markdown content.

    Args:
        content: Markdown text content
        base_url: Base URL for resolving relative links

    Returns:
        List of absolute URLs found in the content, deduplicated and in
        order of first appearance.
    """
    links: list[str] = []
    seen: set[str] = set()

    def _add(url: str) -> None:
        # Record a URL once, preserving first-seen order.
        # BUGFIX: the original only deduplicated the plain-URL pass, so a
        # link repeated in markdown (e.g. header + footer nav) appeared twice.
        if url not in seen:
            seen.add(url)
            links.append(url)

    # Match markdown links: [text](url)
    md_link_pattern = r'\[([^\]]*)\]\(([^)]+)\)'
    for match in re.finditer(md_link_pattern, content):
        url = match.group(2).strip()
        if not url:
            continue
        # BUGFIX: drop an optional markdown title — [t](http://x "Title")
        # previously leaked the quoted title into the URL.
        url = url.split()[0]
        # Skip anchors, mailto, tel, etc.
        if url.startswith('#') or url.startswith('mailto:') or url.startswith('tel:'):
            continue
        # Resolve relative URLs
        if not url.startswith('http'):
            url = urljoin(base_url, url)
        _add(url)

    # Also match plain URLs in text
    url_pattern = r'https?://[^\s<>\)\]"\']+'
    for match in re.finditer(url_pattern, content):
        # Strip trailing sentence punctuation that the regex swallows.
        _add(match.group(0).rstrip('.,;:'))

    return links
|
|
|
|
|
|
def filter_mission_links(links: list[str], base_domain: str) -> list[str]:
    """
    Filter links to only those likely to contain mission/vision content.

    Args:
        links: List of URLs to filter
        base_domain: Domain of the custodian website (only keep same-domain links)

    Returns:
        List of URLs that likely contain mission content
    """
    base = base_domain.lower()
    mission_urls: list[str] = []

    for url in links:
        parsed = urlparse(url)
        netloc = parsed.netloc.lower()

        # Only keep links from the same site: relative links (empty netloc),
        # the exact host, or a subdomain of it.
        # BUGFIX: the previous substring test (`base_domain not in netloc`)
        # wrongly accepted foreign hosts like 'fakeexample.com' for base
        # 'example.com' and was case-sensitive on the link side.
        if netloc and netloc != base and not netloc.endswith('.' + base):
            continue

        # Check if path contains mission-related keywords
        path_lower = parsed.path.lower()
        for keyword in MISSION_LINK_KEYWORDS:
            if keyword in path_lower:
                if url not in mission_urls:
                    mission_urls.append(url)
                break

    return mission_urls
|
|
|
|
|
|
async def discover_mission_links_from_homepage(
    reader: Union['LinkupWebReader', 'ZAIWebReader'],
    homepage_url: str,
    verbose: bool = False
) -> tuple[list[str], str, str]:
    """
    Fetch the homepage and harvest links that point at mission/vision pages.

    Reading the navigation links a site actually exposes is more reliable
    than guessing URL patterns.

    Args:
        reader: Web reader instance
        homepage_url: The custodian's homepage URL
        verbose: Whether to print progress

    Returns:
        Tuple of (discovered_urls, homepage_content, retrieved_on);
        ([], '', '') when the homepage fetch fails.
    """
    fetch = await reader.read_webpage(homepage_url)

    # Bail out early when the homepage could not be retrieved.
    if not fetch['success']:
        if verbose:
            print(f" Homepage fetch failed: {fetch.get('error', 'Unknown')[:50]}")
        return [], '', ''

    content = fetch.get('content', '')
    retrieved_on = fetch.get('retrieved_on', datetime.now(timezone.utc).isoformat())

    if not content:
        return [], content, retrieved_on

    # Restrict link discovery to the custodian's own domain.
    base_domain = urlparse(homepage_url).netloc.lower()

    # Collect every link on the homepage, then narrow to mission candidates.
    all_links = extract_links_from_markdown(content, homepage_url)
    if verbose:
        print(f" Found {len(all_links)} links on homepage")

    mission_links = filter_mission_links(all_links, base_domain)
    if verbose and mission_links:
        print(f" Found {len(mission_links)} mission-related links:")
        for link in mission_links[:5]:  # Show first 5
            print(f" - {link}")

    return mission_links, content, retrieved_on
|
|
|
|
|
|
def extract_statements_from_content(
    content: str,
    source_url: str,
    retrieved_on: str,
    ghcid: str,
) -> list[dict]:
    """
    Extract mission, vision, and goal statements from webpage content.

    This uses keyword matching and section detection (no LLM). For
    production, consider using an LLM for more intelligent extraction.

    Pipeline: reject error/JSON pages, require at least one mission keyword,
    split into sections, classify each section by keyword, strip markdown,
    drop boilerplate/navigation text, then emit LinkML-style statement dicts
    with dual-timestamp provenance.

    Args:
        content: The webpage text content
        source_url: URL of the source page
        retrieved_on: ISO timestamp when page was retrieved
        ghcid: GHCID of the custodian

    Returns:
        List of mission statement dictionaries (possibly empty)
    """
    statements = []
    content_lower = content.lower()

    # Skip error pages (404, 500, etc.)
    # NOTE(review): '404' and 'error' are broad substrings — a legitimate page
    # mentioning them in its first 500 chars would be skipped entirely.
    error_indicators = [
        'pagina niet gevonden', 'page not found', '404',
        'niet gevonden', 'not found', 'error', 'fout',
        'deze pagina bestaat niet', 'this page does not exist'
    ]
    # Check title and first 500 chars for error indicators
    if any(indicator in content_lower[:500] for indicator in error_indicators):
        return []

    # Also check if content looks like raw JSON (Z.AI sometimes returns this)
    if content.strip().startswith('{"') or content.strip().startswith('"{'):
        return []

    # Cheap gate: require at least one mission keyword anywhere on the page
    # before doing any section-level work.
    has_mission_content = any(
        keyword in content_lower
        for keywords in MISSION_KEYWORDS.values()
        for keyword in keywords
    )

    if not has_mission_content:
        return []

    # Split content into sections: blank lines, markdown headings (#...),
    # or a bold line (**...**) acting as a heading.
    sections = re.split(r'\n\s*\n|\n#+\s+|\n\*\*[^*]+\*\*\n', content)

    for section in sections:
        section = section.strip()
        if len(section) < 20:  # Skip very short sections
            continue

        section_lower = section.lower()

        # Detect statement type based on keywords
        statement_type = None
        confidence = 0.7

        # First matching keyword wins; a keyword near the section start
        # (first 50 chars, i.e. likely its heading) scores higher.
        for stype, keywords in MISSION_KEYWORDS.items():
            for keyword in keywords:
                if keyword in section_lower[:200]:  # Check beginning of section
                    statement_type = stype
                    confidence = 0.85 if keyword in section_lower[:50] else 0.75
                    break
            if statement_type:
                break

        if not statement_type:
            continue

        # Clean up the section text:
        # strip bold, italics, heading markers, and markdown links (keep text).
        clean_text = re.sub(r'\*\*([^*]+)\*\*', r'\1', section)
        clean_text = re.sub(r'\*([^*]+)\*', r'\1', clean_text)
        clean_text = re.sub(r'#+\s*', '', clean_text)
        clean_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', clean_text)
        clean_text = clean_text.strip()

        if len(clean_text) < 20:
            continue

        # Skip boilerplate/footer content (copyright, cookies, socials, ...)
        boilerplate_indicators = [
            '©', 'copyright', 'all rights reserved', 'alle rechten voorbehouden',
            'privacybeleid', 'privacy policy', 'cookie', 'algemene voorwaarden',
            'terms and conditions', 'nieuwsbrief', 'newsletter', 'subscribe',
            'volg ons', 'follow us', 'social media', 'facebook', 'instagram',
            'twitter', 'linkedin', 'youtube', 'contact', 'openingstijden',
            'opening hours', 'bereikbaarheid', 'route', 'adres:', 'address:',
        ]
        clean_lower = clean_text.lower()
        boilerplate_count = sum(1 for ind in boilerplate_indicators if ind in clean_lower)
        # If more than 2 boilerplate indicators in a short text, skip it
        if boilerplate_count >= 2 and len(clean_text) < 200:
            continue
        # If the text is primarily copyright/footer (starts with ©)
        if clean_text.strip().startswith('©'):
            continue

        # Skip navigation/intro text (too short to be actual mission content)
        # Actual mission statements are usually at least 50 characters
        if len(clean_text) < 50:
            continue

        # Skip text that looks like a link/intro (e.g., "Lees alles over...")
        skip_patterns = [
            r'^lees\s+(alles\s+)?over',
            r'^klik\s+hier',
            r'^meer\s+(info|informatie)',
            r'^bekijk\s+',
            r'^ga\s+naar',
            r'^read\s+(more\s+)?about',
            r'^click\s+here',
            r'^view\s+',
        ]
        if any(re.match(pattern, clean_lower) for pattern in skip_patterns):
            continue

        # Generate statement ID (stable per custodian/type/year; note that a
        # re-run in the same year reuses the ID, which dedupe relies on).
        year = datetime.now().year
        statement_id = f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{statement_type}-{year}"

        # Compute content hash
        content_hash = compute_content_hash(clean_text)

        # Current timestamp for statement creation (Rule 35: Dual Timestamps)
        statement_created_at = datetime.now(timezone.utc).isoformat()

        statement = {
            'statement_id': statement_id,
            'statement_type': statement_type,
            'statement_text': clean_text,
            'statement_language': get_language_from_ghcid(ghcid),  # Detect from GHCID country
            'extracted_verbatim': True,  # Rule 36: Confirms no translation occurred
            'source_url': source_url,
            'content_hash': content_hash,
            # Rule 35: Dual Timestamp Provenance
            'provenance': {
                'statement_created_at': statement_created_at,  # When this claim was extracted
                'source_archived_at': retrieved_on,  # When the webpage was fetched
                'retrieval_agent': 'linkup-api',
                'extraction_agent': 'keyword-matching/batch',
                'extraction_confidence': confidence,
                # W3C PROV-O compatible fields
                'prov:wasDerivedFrom': source_url,
                'prov:generatedAtTime': statement_created_at,
            }
        }

        statements.append(statement)

    return statements
|
|
|
|
|
|
async def extract_statements_with_llm(
    llm_extractor: GLMMissionExtractor,
    content: str,
    source_url: str,
    retrieved_on: str,
    ghcid: str,
) -> list[dict]:
    """
    Extract mission, vision, and goal statements using LLM (Z.AI GLM).

    This provides much better quality extraction than keyword matching
    by using semantic understanding of the content.

    Args:
        llm_extractor: GLMMissionExtractor instance
        content: The webpage text content
        source_url: URL of the source page
        retrieved_on: ISO timestamp when page was retrieved
        ghcid: GHCID of the custodian

    Returns:
        List of mission statement dictionaries (possibly empty)
    """
    # Quick pre-filter: skip obvious error pages before spending an LLM call
    content_lower = content.lower()
    error_indicators = [
        'pagina niet gevonden', 'page not found', '404',
        'niet gevonden', 'not found', 'deze pagina bestaat niet',
        'oeps', 'error', 'no routes match', 'routing error'
    ]
    if any(indicator in content_lower[:500] for indicator in error_indicators):
        return []

    # Skip raw JSON responses
    if content.strip().startswith('{"') or content.strip().startswith('"{'):
        return []

    # Skip very short content (likely empty page)
    if len(content.strip()) < 200:
        return []

    # Expected language from GHCID country code (Rule 36: Original Language
    # Preservation); also selects the language-specific extraction prompt.
    expected_language = get_language_from_ghcid(ghcid)

    # Call LLM for extraction with language-specific prompt
    result = await llm_extractor.extract_mission_from_content(
        content=content,
        source_url=source_url,
        language=expected_language
    )

    if not result['success']:
        return []

    statements = []
    year = datetime.now().year

    # Prefer the language the LLM actually detected in the content, falling
    # back to the country-derived expectation.
    # (Previously get_language_from_ghcid was called a second time here.)
    detected_language = result.get('detected_language') or expected_language

    # Process each statement type
    for statement_type in ['mission', 'vision', 'goals']:
        text = result.get(statement_type)
        # The model may return the literal string 'null'; treat it as absent.
        if not text or text == 'null' or len(str(text).strip()) < 20:
            continue

        # Map 'goals' to 'goal' for consistency with schema
        schema_type = 'goal' if statement_type == 'goals' else statement_type

        # Generate statement ID
        statement_id = f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{schema_type}-{year}"

        # Compute content hash
        content_hash = compute_content_hash(str(text))

        # Current timestamp for statement creation (Rule 35: Dual Timestamps)
        statement_created_at = datetime.now(timezone.utc).isoformat()

        statement = {
            'statement_id': statement_id,
            'statement_type': schema_type,
            'statement_text': str(text).strip(),
            'statement_language': detected_language,  # Use LLM-detected language
            'extracted_verbatim': True,  # Rule 36: Confirms no translation occurred
            'source_url': source_url,
            'content_hash': content_hash,
            # Rule 35: Dual Timestamp Provenance
            'provenance': {
                'statement_created_at': statement_created_at,  # When this claim was extracted
                'source_archived_at': retrieved_on,  # When the webpage was fetched
                'retrieval_agent': 'linkup-api',
                'extraction_agent': f'zai-glm/{result.get("model", ZAI_GLM_MODEL)}',
                'extraction_confidence': result.get('confidence', 0.0),
                # W3C PROV-O compatible fields
                'prov:wasDerivedFrom': source_url,
                'prov:generatedAtTime': statement_created_at,
            }
        }

        # Add source section if available
        if result.get('source_section'):
            statement['source_section'] = result['source_section']

        # Document language mismatch if detected language differs from expected (Rule 36)
        if detected_language != expected_language:
            statement['language_note'] = f"Content in {detected_language}, expected {expected_language} based on GHCID country code"

        statements.append(statement)

    return statements
|
|
|
|
|
|
def update_custodian_yaml(
    yaml_path: Path,
    custodian_data: dict,
    statements: list[dict],
    dry_run: bool = False
) -> bool:
    """
    Update custodian YAML file with extracted mission statements.

    Statements whose `statement_id` already exists are not duplicated.
    `custodian_data` is mutated in place; the file is rewritten unless
    dry_run is set.

    Args:
        yaml_path: Path to the custodian YAML file
        custodian_data: Current custodian data
        statements: List of extracted statements
        dry_run: If True, don't write changes

    Returns:
        True if updated successfully (or would update, in dry-run mode)
    """
    if not statements:
        return False

    # Initialize or normalize the mission_statement field.
    # ROBUSTNESS FIX: hand-edited YAML can hold a single mapping or a bare
    # string here; the previous code then crashed on .append(). Wrap a lone
    # mapping in a list and replace any other non-list value with [].
    existing = custodian_data.get('mission_statement')
    if not isinstance(existing, list):
        custodian_data['mission_statement'] = [existing] if isinstance(existing, dict) else []

    existing_ids = {
        s.get('statement_id') for s in custodian_data['mission_statement']
        if isinstance(s, dict)
    }

    # Add new statements (skip IDs already present)
    added = 0
    for statement in statements:
        if statement['statement_id'] not in existing_ids:
            custodian_data['mission_statement'].append(statement)
            added += 1

    if added == 0:
        return False

    if dry_run:
        print(f" Would add {added} statements to {yaml_path.name}")
        return True

    # Write updated YAML
    try:
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(
                custodian_data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,  # preserve the original field order
                width=120
            )
        print(f" Added {added} statements to {yaml_path.name}")
        return True
    except Exception as e:
        print(f" Error writing {yaml_path.name}: {e}", file=sys.stderr)
        return False
|
|
|
|
|
|
async def process_custodian(
    reader: Union[LinkupWebReader, ZAIWebReader],
    yaml_path: Path,
    custodian_data: dict,
    website: str,
    dry_run: bool = False,
    verbose: bool = False,
    llm_extractor: Optional[GLMMissionExtractor] = None,
) -> dict:
    """
    Process a single custodian: discover pages, fetch content, extract statements.

    Two-phase discovery:
    1. Fetch the homepage and extract actual mission page links from its
       navigation (preferred).
    2. Fall back to URL pattern guessing only if no links were found.

    The homepage content itself is always tried first; if the LLM finds a
    high-confidence mission there, dedicated pages are skipped entirely.
    Per-type deduplication keeps the highest-confidence statement.

    Args:
        reader: Web reader instance (Linkup or ZAI)
        yaml_path: Path to custodian YAML file
        custodian_data: Current custodian data
        website: Website URL to process
        dry_run: If True, don't write changes
        verbose: If True, show detailed progress
        llm_extractor: Optional LLM extractor for intelligent extraction
            (keyword matching is used when None)

    Returns:
        dict with processing results (counters, discovery_method, errors)
    """
    # Extract base GHCID from filename: take the first five '-'-separated
    # parts of the stem; if the stem has fewer parts, use it unchanged.
    ghcid = yaml_path.stem.split('-')[0:5]  # Extract base GHCID from filename
    ghcid = '-'.join(ghcid[:5]) if len(ghcid) >= 5 else yaml_path.stem

    # Get name for display
    # NOTE(review): assumes custodian_name, when present, is a mapping —
    # verify against the YAML schema.
    name = custodian_data.get('custodian_name', {}).get('emic_name')
    if not name:
        name = custodian_data.get('name', ghcid)

    # Per-custodian result record returned to the caller for summary stats.
    result = {
        'ghcid': ghcid,
        'name': name,
        'website': website,
        'pages_checked': 0,
        'pages_with_content': 0,
        'statements_found': 0,
        'statements_added': 0,
        'discovery_method': 'none',
        'errors': [],
    }

    if verbose:
        print(f"\nProcessing {ghcid}: {name}")
        print(f" Website: {website}")

    all_statements = []
    homepage_content = None
    homepage_retrieved_on = None

    # PHASE 1: Discover mission pages from homepage links (preferred method)
    if verbose:
        print(f" Phase 1: Discovering mission pages from homepage...")

    discovered_links, homepage_content, homepage_retrieved_on = await discover_mission_links_from_homepage(
        reader, website, verbose
    )

    result['pages_checked'] += 1  # Homepage was fetched

    if discovered_links:
        result['discovery_method'] = 'homepage_links'
        candidate_urls = discovered_links[:5]  # Limit to 5 discovered links
        if verbose:
            print(f" Using {len(candidate_urls)} discovered mission links")
    else:
        # PHASE 2: Fall back to URL pattern guessing
        result['discovery_method'] = 'pattern_guessing'
        if verbose:
            print(f" Phase 2: No mission links found, falling back to URL patterns...")
        candidate_urls = discover_mission_page_urls(website)[:5]

    # First, try to extract from homepage content (if we have it)
    if homepage_content and len(homepage_content) > 200:
        result['pages_with_content'] += 1

        if llm_extractor:
            statements = await extract_statements_with_llm(
                llm_extractor, homepage_content, website,
                homepage_retrieved_on or datetime.now(timezone.utc).isoformat(), ghcid
            )
            if verbose and statements:
                print(f" [LLM] Found {len(statements)} statements on homepage")
        else:
            statements = extract_statements_from_content(
                homepage_content, website,
                homepage_retrieved_on or datetime.now(timezone.utc).isoformat(), ghcid
            )
            if verbose and statements:
                print(f" [Keyword] Found {len(statements)} statements on homepage")

        if statements:
            all_statements.extend(statements)
            # If we found a mission statement on homepage with high confidence, skip dedicated pages
            # (only for LLM extraction; keyword extraction has lower accuracy)

            # Helper to get confidence (handles nested provenance structure)
            def get_stmt_confidence(s):
                if 'provenance' in s and 'extraction_confidence' in s['provenance']:
                    return s['provenance']['extraction_confidence']
                return s.get('extraction_confidence', 0)

            if llm_extractor and any(s['statement_type'] == 'mission' and get_stmt_confidence(s) > 0.7 for s in statements):
                if verbose:
                    print(f" Found high-confidence mission on homepage, skipping dedicated pages")
                result['discovery_method'] = 'homepage_content'
                result['statements_found'] = len(all_statements)
                # Deduplicate per statement type (highest confidence wins)
                # and return early without fetching candidate pages.
                unique_statements = {}
                for stmt in all_statements:
                    stype = stmt['statement_type']
                    if stype not in unique_statements or get_stmt_confidence(stmt) > get_stmt_confidence(unique_statements[stype]):
                        unique_statements[stype] = stmt
                final_statements = list(unique_statements.values())
                if final_statements:
                    if update_custodian_yaml(yaml_path, custodian_data, final_statements, dry_run):
                        result['statements_added'] = len(final_statements)
                return result

    # Check candidate mission page URLs
    for url in candidate_urls:
        # Skip if this is the homepage (already processed)
        if url.rstrip('/') == website.rstrip('/'):
            continue

        result['pages_checked'] += 1

        if verbose:
            print(f" Checking: {url}")

        # Fetch page content
        page_result = await reader.read_webpage(url)

        if not page_result['success']:
            if verbose:
                print(f" Failed: {page_result.get('error', 'Unknown error')[:50]}")
            result['errors'].append(f"{url}: {page_result.get('error', 'Unknown')[:50]}")
            continue

        content = page_result.get('content', '')
        if not content or len(content) < 100:
            if verbose:
                print(f" No content")
            continue

        result['pages_with_content'] += 1

        # Extract statements from content
        retrieved_on = page_result.get('retrieved_on', datetime.now(timezone.utc).isoformat())

        # Use LLM extraction if available, otherwise fall back to keyword-based
        if llm_extractor:
            statements = await extract_statements_with_llm(
                llm_extractor, content, url, retrieved_on, ghcid
            )
            if verbose and statements:
                print(f" [LLM] Found {len(statements)} statements")
        else:
            statements = extract_statements_from_content(content, url, retrieved_on, ghcid)
            if verbose and statements:
                print(f" [Keyword] Found {len(statements)} statements")

        if statements:
            all_statements.extend(statements)

            # If we found mission content on a dedicated page, stop fetching
            # further candidates (dedicated pages are preferred over homepage).
            if any(s['statement_type'] == 'mission' for s in statements):
                break

    result['statements_found'] = len(all_statements)

    # Helper function to get confidence from statement (handles nested provenance)
    def get_confidence(stmt):
        if 'provenance' in stmt and 'extraction_confidence' in stmt['provenance']:
            return stmt['provenance']['extraction_confidence']
        return stmt.get('extraction_confidence', 0)

    # Deduplicate statements by type (keep highest confidence)
    unique_statements = {}
    for stmt in all_statements:
        stype = stmt['statement_type']
        if stype not in unique_statements or get_confidence(stmt) > get_confidence(unique_statements[stype]):
            unique_statements[stype] = stmt

    final_statements = list(unique_statements.values())

    # Update YAML file
    if final_statements:
        if update_custodian_yaml(yaml_path, custodian_data, final_statements, dry_run):
            result['statements_added'] = len(final_statements)

    return result
|
|
|
|
|
|
def resolve_custodian_website(data):
    """Return the first website URL found in a custodian record, or None.

    Mirrors the lookup order used by find_custodians_with_websites:
      1. top-level ``website``
      2. ``original_entry.webadres_organisatie``
      3. ``museum_register_enrichment.website_url``
      4. ``wikidata_enrichment.official_website``
      5. ``google_maps_enrichment.website``
      6. ``location.website``
      7. ``original_entry.identifiers`` entry with scheme 'Website'
      8. top-level ``identifiers`` entry with scheme 'Website'

    Args:
        data: Parsed custodian YAML record (dict).

    Returns:
        The first truthy URL string found, or None if no source has one.
    """
    # 1. Direct website field.
    if data.get('website'):
        return data['website']

    # 2-6. Nested single-field sources, in priority order.
    for section, field in (
        ('original_entry', 'webadres_organisatie'),
        ('museum_register_enrichment', 'website_url'),
        ('wikidata_enrichment', 'official_website'),
        ('google_maps_enrichment', 'website'),
        ('location', 'website'),
    ):
        obj = data.get(section)
        if isinstance(obj, dict) and obj.get(field):
            return obj[field]

    # 7-8. Identifier arrays carrying a 'Website' scheme entry. The
    # ``or []`` guards against a key that exists with a null YAML value.
    ident_sources = []
    oe = data.get('original_entry')
    if isinstance(oe, dict):
        ident_sources.append(oe.get('identifiers') or [])
    ident_sources.append(data.get('identifiers') or [])

    for idents in ident_sources:
        for ident in idents:
            if isinstance(ident, dict) and ident.get('identifier_scheme') == 'Website':
                website = ident.get('identifier_value') or ident.get('identifier_url')
                if website:
                    return website

    return None


async def main():
    """Command-line entry point for the batch mission-statement extractor.

    Parses CLI arguments, selects a web-reader backend (Linkup preferred,
    Z.AI fallback), optionally enables LLM-based extraction, gathers the
    target custodians (single GHCID or batch), processes them with bounded
    concurrency, and prints a summary report.

    Exits with status 1 on missing selection flags, missing API tokens, or
    an unresolvable custodian/website in single-GHCID mode.
    """
    parser = argparse.ArgumentParser(
        description='Batch extract mission statements from heritage custodian websites'
    )
    parser.add_argument(
        '--test', type=int, metavar='N',
        help='Test mode: process only N custodians'
    )
    parser.add_argument(
        '--province', type=str, metavar='PREFIX',
        help='Process custodians matching GHCID prefix (e.g., NL-NH for Noord-Holland)'
    )
    parser.add_argument(
        '--ghcid', type=str,
        help='Process a single custodian by GHCID'
    )
    parser.add_argument(
        '--all', action='store_true',
        help='Process all Dutch custodians with websites'
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Show detailed progress'
    )
    parser.add_argument(
        '--concurrency', type=int, default=3,
        help='Number of concurrent requests (default: 3)'
    )
    parser.add_argument(
        '--llm', action='store_true',
        help='Use LLM (Z.AI GLM) for intelligent extraction instead of keyword matching'
    )

    args = parser.parse_args()

    # Require at least one selection flag; otherwise show usage and exit.
    if not any([args.test, args.province, args.ghcid, args.all]):
        parser.print_help()
        print("\nExample usage:")
        print(" python scripts/batch_extract_mission_statements.py --test 5 --verbose")
        print(" python scripts/batch_extract_mission_statements.py --test 5 --llm --verbose # With LLM extraction")
        print(" python scripts/batch_extract_mission_statements.py --province NL-NH --llm")
        print(" python scripts/batch_extract_mission_statements.py --ghcid NL-ZH-ZUI-M-LMT --llm")
        sys.exit(1)

    # Get API tokens
    try:
        tokens = get_api_tokens()
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Initialize web reader - prefer Linkup (more reliable), fall back to Z.AI
    if 'linkup' in tokens:
        reader = LinkupWebReader(tokens['linkup'])
        print("Using Linkup API for web fetching")
    elif 'zai' in tokens:
        reader = ZAIWebReader(tokens['zai'])
        print("Using Z.AI Web Reader API for web fetching")
    else:
        print("Error: No API token available", file=sys.stderr)
        sys.exit(1)

    # Initialize LLM extractor if requested (requires the Z.AI token even
    # when Linkup is the fetch backend).
    llm_extractor = None
    if args.llm:
        if 'zai' not in tokens:
            print("Error: --llm requires ZAI_API_TOKEN for LLM extraction", file=sys.stderr)
            sys.exit(1)
        llm_extractor = GLMMissionExtractor(tokens['zai'])
        print(f"Using Z.AI GLM ({ZAI_GLM_MODEL}) for LLM-based extraction")

    # Find custodians to process
    if args.ghcid:
        # Single custodian mode: locate the YAML file by GHCID prefix.
        custodian_dir = PROJECT_ROOT / "data" / "custodian"
        yaml_files = list(custodian_dir.glob(f"{args.ghcid}*.yaml"))

        if not yaml_files:
            print(f"Error: No custodian file found for GHCID {args.ghcid}", file=sys.stderr)
            sys.exit(1)

        yaml_path = yaml_files[0]
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Same lookup order as find_custodians_with_websites.
        website = resolve_custodian_website(data)

        if not website or not website.startswith('http'):
            print(f"Error: No website found for {args.ghcid}", file=sys.stderr)
            sys.exit(1)

        custodians = [(yaml_path, data, website)]
    else:
        # Batch mode: optional count limit (--test N) and GHCID prefix filter.
        limit = args.test if args.test else None
        prefix = args.province if args.province else None

        print("Finding custodians with websites...")
        custodians = find_custodians_with_websites(prefix=prefix, limit=limit)

    print(f"Found {len(custodians)} custodians with websites")

    if args.dry_run:
        print("\n[DRY RUN MODE - No changes will be made]\n")

    # Process custodians with bounded concurrency.
    results = []
    semaphore = asyncio.Semaphore(args.concurrency)

    async def process_with_semaphore(custodian_tuple):
        # Cap the number of in-flight custodians at args.concurrency.
        async with semaphore:
            yaml_path, data, website = custodian_tuple
            return await process_custodian(
                reader, yaml_path, data, website,
                dry_run=args.dry_run, verbose=args.verbose,
                llm_extractor=llm_extractor
            )

    tasks = [process_with_semaphore(c) for c in custodians]

    print(f"\nProcessing {len(tasks)} custodians...")

    # Consume results in completion order for responsive progress output.
    for i, coro in enumerate(asyncio.as_completed(tasks), 1):
        result = await coro
        results.append(result)

        if not args.verbose:
            # Compact progress: report hits immediately, heartbeat every 10.
            if result['statements_added'] > 0:
                print(f"[{i}/{len(tasks)}] {result['ghcid']}: Added {result['statements_added']} statements")
            elif i % 10 == 0:
                print(f"[{i}/{len(tasks)}] Processing...")

    # Summary statistics
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)

    total_checked = sum(r['pages_checked'] for r in results)
    total_with_content = sum(r['pages_with_content'] for r in results)
    total_found = sum(r['statements_found'] for r in results)
    total_added = sum(r['statements_added'] for r in results)
    total_errors = sum(len(r['errors']) for r in results)
    custodians_with_statements = sum(1 for r in results if r['statements_added'] > 0)

    print(f"Custodians processed: {len(results)}")
    print(f"Pages checked: {total_checked}")
    print(f"Pages with content: {total_with_content}")
    print(f"Statements found: {total_found}")
    print(f"Statements added: {total_added}")
    print(f"Custodians updated: {custodians_with_statements}")
    print(f"Errors encountered: {total_errors}")

    # Show custodians that got statements
    if custodians_with_statements > 0:
        print("\nCustodians with new mission statements:")
        for r in results:
            if r['statements_added'] > 0:
                print(f" - {r['ghcid']}: {r['name']} ({r['statements_added']} statements)")
|
|
|
# Script entry point: run the async batch pipeline to completion.
if __name__ == '__main__':
    asyncio.run(main())
|