glam/scripts/extract_mission_statement.py
2025-12-30 03:43:31 +01:00

398 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Extract mission statements from heritage custodian websites.
This script:
1. Reads a custodian YAML file
2. Discovers mission/vision/about pages on the website
3. Extracts mission, vision, and goal statements
4. Saves the archived HTML and metadata
5. Updates the custodian YAML with mission_statement data
Usage:
python scripts/extract_mission_statement.py NL-ZH-ZUI-M-LMT
python scripts/extract_mission_statement.py --batch NL-NH # All Noord-Holland custodians
python scripts/extract_mission_statement.py --url https://example.org/about # Direct URL
Requirements:
- playwright (pip install playwright && playwright install chromium)
- pyyaml
- httpx
"""
import argparse
import hashlib
import base64
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import yaml
# Make the project root importable when this file is run directly as a
# script (python scripts/extract_mission_statement.py ...).
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Common Dutch mission page URL patterns, tried as path suffixes on a
# custodian's base URL (roughly most-specific first).
DUTCH_MISSION_PATTERNS = [
    "/missie",
    "/missie-en-visie",
    "/missie-visie",
    "/onze-missie",
    "/over-ons/missie",
    "/over/missie",
    "/visie",
    "/doelstellingen",
    "/about/mission",
    "/about-us/mission",
    "/about",
    "/over-ons",
    "/over",
    "/organisatie",
    "/wie-zijn-wij",
    "/het-museum/missie",
    "/het-museum/over",
    "/museum/missie",
]

# Keywords that indicate mission/vision content (Dutch).
# NOTE(review): not referenced in this chunk — presumably used by a later
# LLM-extraction step; confirm before removing.
MISSION_KEYWORDS_NL = [
    "missie", "visie", "doelstelling", "doelen", "ambitie",
    "waar we voor staan", "onze opdracht", "ons doel",
    "wat willen we", "wie zijn wij"
]

# English-language equivalents of the keywords above.
MISSION_KEYWORDS_EN = [
    "mission", "vision", "goals", "objectives", "purpose",
    "what we do", "our aim", "about us"
]
def compute_content_hash(text: str) -> str:
    """Return the SHA-256 hash of *text* in SRI (Subresource Integrity) form.

    The text is UTF-8 encoded, hashed, and the raw digest is base64
    encoded, yielding a string like ``sha256-<base64>``.
    """
    digest = hashlib.sha256(text.encode('utf-8')).digest()
    encoded = base64.b64encode(digest).decode('ascii')
    return "sha256-" + encoded
def load_custodian(ghcid: str) -> dict:
    """Load and parse the custodian YAML record for *ghcid*.

    Tries an exact ``<GHCID>.yaml`` filename first, then falls back to the
    first file whose name begins with the GHCID (slug-suffixed variants
    such as ``NL-GE-AAL-M-NOM-nationaal_onderduikmuseum.yaml``).

    Raises:
        FileNotFoundError: if no matching YAML file exists.
    """
    custodian_dir = PROJECT_ROOT / "data" / "custodian"
    exact_path = custodian_dir / f"{ghcid}.yaml"
    if exact_path.exists():
        candidates = [exact_path]
    else:
        candidates = custodian_dir.glob(f"{ghcid}*.yaml")
    for candidate in candidates:
        with open(candidate, 'r', encoding='utf-8') as fh:
            return yaml.safe_load(fh)
    raise FileNotFoundError(f"No custodian file found for GHCID: {ghcid}")
def get_custodian_website(custodian: dict) -> Optional[str]:
    """Return the primary website URL from a custodian record, or None.

    Several known locations are probed in priority order; the first value
    that looks like an absolute HTTP(S) URL (starts with 'http') wins.
    """
    locations = [
        lambda c: c.get('website'),
        lambda c: c.get('wikidata_enrichment', {}).get('official_website'),
        lambda c: c.get('google_maps_enrichment', {}).get('website'),
        lambda c: c.get('location', {}).get('website'),
        lambda c: c.get('original_entry', {}).get('website'),
    ]
    for getter in locations:
        try:
            url = getter(custodian)
        # BUG FIX: AttributeError added — a key explicitly set to null in
        # the YAML (e.g. `wikidata_enrichment: null`) makes `.get()` run
        # on None and raise AttributeError, which previously escaped.
        except (KeyError, TypeError, AttributeError):
            continue
        if url and url.startswith('http'):
            return url
    return None
def discover_mission_pages(base_url: str) -> list[str]:
    """
    Generate candidate mission/vision page URLs for a website.

    Returns the list of URLs to check: every pattern in
    DUTCH_MISSION_PATTERNS appended to the base URL, each both with and
    without a trailing slash.
    """
    # BUG FIX: the trailing slash must be stripped unconditionally. The
    # original only stripped when endswith('/') was False (a no-op), so a
    # base URL like 'https://example.org/' produced 'https://example.org//missie'.
    base_url = base_url.rstrip('/')
    candidates = []
    for pattern in DUTCH_MISSION_PATTERNS:
        candidates.append(f"{base_url}{pattern}")
        # Also try with trailing slash
        candidates.append(f"{base_url}{pattern}/")
    return candidates
def extract_mission_from_html(html: str, url: str) -> list[dict]:
    """
    Heuristically locate mission/vision/goal sections in HTML content.

    Scans <h2>/<h3> headings for Dutch and English mission keywords and
    returns one dict per matching heading:
    {'type': 'mission'|'vision'|'goal', 'heading': <lowercased text>,
     'needs_extraction': True}.

    This is a simplified pre-pass — the actual statement text and XPaths
    are meant to be extracted by an LLM afterwards (hence the
    'needs_extraction' flag). *url* is accepted for interface stability
    but not used by the heuristic.
    """
    statements = []
    # `re` is imported at module level; the redundant function-local
    # `import re` has been removed.
    heading_pattern = r'<h[23][^>]*>(.*?)</h[23]>'
    headings = re.findall(heading_pattern, html, re.IGNORECASE | re.DOTALL)
    for heading in headings:
        # Strip inner tags, then normalize for keyword matching.
        heading_text = re.sub(r'<[^>]+>', '', heading).strip().lower()
        statement_type = None
        # Order matters: 'missie'/'mission' wins over 'visie'/'vision' for
        # combined headings like "missie en visie".
        if any(kw in heading_text for kw in ('missie', 'mission')):
            statement_type = 'mission'
        elif any(kw in heading_text for kw in ('visie', 'vision')):
            statement_type = 'vision'
        elif any(kw in heading_text for kw in ('doel', 'goal', 'objective', 'ambitie')):
            statement_type = 'goal'
        if statement_type:
            statements.append({
                'type': statement_type,
                'heading': heading_text,
                'needs_extraction': True  # Flag for LLM extraction
            })
    return statements
async def fetch_and_archive_page(url: str, ghcid: str) -> dict:
    """
    Fetch *url* with headless Chromium (Playwright) and archive it.

    Saves the rendered HTML, a full-page screenshot, and a metadata YAML
    file under data/custodian/web/<ghcid>/<domain>/<path>/.

    Returns a dict with 'url', 'archive_dir', 'timestamp' and 'success';
    on success also 'html_file' and 'html_content', on exception an
    'error' message. Exits the process if playwright is not installed.
    """
    try:
        from playwright.async_api import async_playwright
    except ImportError:
        # Playwright is an optional heavy dependency; fail fast with an
        # install hint rather than a traceback.
        print("Error: playwright not installed. Run: pip install playwright && playwright install chromium")
        sys.exit(1)
    timestamp = datetime.now(timezone.utc)
    timestamp_str = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')
    # Parse URL for directory structure
    from urllib.parse import urlparse
    parsed = urlparse(url)
    domain = parsed.netloc
    path = parsed.path.strip('/') or 'index'  # homepage archives as 'index'
    # Create archive directory (idempotent: re-fetching overwrites files)
    archive_dir = PROJECT_ROOT / "data" / "custodian" / "web" / ghcid / domain / path
    archive_dir.mkdir(parents=True, exist_ok=True)
    result = {
        'url': url,
        'archive_dir': str(archive_dir.relative_to(PROJECT_ROOT)),
        'timestamp': timestamp_str,
        'success': False
    }
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            # 'networkidle' waits until the page stops loading resources so
            # JS-rendered content is captured; hard cap of 30s per page.
            response = await page.goto(url, wait_until='networkidle', timeout=30000)
            if response and response.ok:
                # Get rendered (post-JS) HTML, not the raw response body
                html_content = await page.content()
                # Save rendered HTML
                html_path = archive_dir / 'rendered.html'
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(html_content)
                # Take screenshot
                screenshot_path = archive_dir / 'screenshot.png'
                await page.screenshot(path=str(screenshot_path), full_page=True)
                # Save metadata alongside the archived files
                metadata = {
                    'url': url,
                    'retrieved_on': timestamp_str,
                    'status_code': response.status,
                    'content_type': response.headers.get('content-type', ''),
                    'files': {
                        'html': 'rendered.html',
                        'screenshot': 'screenshot.png'
                    }
                }
                metadata_path = archive_dir / 'metadata.yaml'
                with open(metadata_path, 'w', encoding='utf-8') as f:
                    yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True)
                result['success'] = True
                result['html_file'] = str(html_path.relative_to(PROJECT_ROOT))
                result['html_content'] = html_content
            # NOTE(review): a non-OK HTTP response leaves success=False but
            # records no 'error', so callers report 'Unknown error'.
        except Exception as e:
            # Broad catch is deliberate here: any navigation/IO failure is
            # recorded in the result rather than raised.
            result['error'] = str(e)
        # NOTE(review): close is not in a finally block; an exception
        # raised between launch and the try would leak the browser —
        # confirm whether that path is possible in practice.
        await browser.close()
    return result
def create_mission_statement_entry(
    statement_type: str,
    statement_text: str,
    ghcid: str,
    source_url: str,
    retrieved_on: str,
    xpath: str,
    html_file: str,
    page_section: Optional[str] = None,
    summary: Optional[str] = None,
    confidence: float = 0.90
) -> dict:
    """Build a mission_statement record conforming to the LinkML schema.

    Provenance (source URL, retrieval time, XPath, archived HTML file) and
    a SHA-256 content hash over the statement text are always included;
    'page_section' and 'statement_summary' are added only when provided.
    """
    # Statement IDs are minted under the NDE heritage-custodian namespace,
    # scoped by GHCID, statement type, and the current year.
    current_year = datetime.now().year
    statement_id = (
        f"https://nde.nl/ontology/hc/mission/"
        f"{ghcid.lower()}/{statement_type}-{current_year}"
    )
    entry = {
        'statement_id': statement_id,
        'statement_type': statement_type,
        'statement_text': statement_text,
        'statement_language': 'nl',  # Default to Dutch for NL custodians
        'source_url': source_url,
        'retrieved_on': retrieved_on,
        'xpath': xpath,
        'html_file': html_file,
        'extraction_agent': 'claude-opus-4',
        'extraction_timestamp': retrieved_on,
        'extraction_confidence': confidence,
        'content_hash': {
            'algorithm': 'sha256',
            'value': compute_content_hash(statement_text),
            'scope': 'statement_text'
        },
        'prov': {
            'wasDerivedFrom': source_url,
            'wasAttributedTo': 'unknown',  # To be filled with organization name
            'generatedAtTime': retrieved_on
        }
    }
    # Optional fields (truthiness check, matching the schema's behavior
    # for empty strings).
    if page_section:
        entry['page_section'] = page_section
    if summary:
        entry['statement_summary'] = summary
    return entry
def main():
    """Parse CLI arguments and dispatch.

    Dispatch order:
      1. GHCID + --url  -> fetch, archive, and scan the given page
      2. GHCID alone    -> show the website / candidate mission pages
      3. --batch        -> not implemented yet (exits with an error)
    """
    parser = argparse.ArgumentParser(
        description='Extract mission statements from heritage custodian websites'
    )
    parser.add_argument(
        'ghcid',
        nargs='?',
        help='GHCID of the custodian to process (e.g., NL-ZH-ZUI-M-LMT)'
    )
    parser.add_argument(
        '--url',
        help='Direct URL to process (skips website discovery)'
    )
    parser.add_argument(
        '--batch',
        help='Process all custodians matching prefix (e.g., NL-NH for Noord-Holland)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--discover-only',
        action='store_true',
        help='Only discover mission pages, do not extract'
    )
    args = parser.parse_args()
    if not args.ghcid and not args.batch and not args.url:
        parser.print_help()
        sys.exit(1)
    # BUG FIX: the fetch branch used to be `elif args.url and args.ghcid`
    # placed *after* `if args.ghcid`, so it was unreachable and the
    # documented `GHCID --url ...` mode never ran. Test it first.
    if args.url and args.ghcid:
        # Fetch and archive the specified URL
        import asyncio
        print(f"Fetching: {args.url}")
        result = asyncio.run(fetch_and_archive_page(args.url, args.ghcid))
        if result['success']:
            print(f"Archived to: {result['archive_dir']}")
            print(f"HTML file: {result['html_file']}")
            # Extract potential statements (simplified heuristic pass)
            statements = extract_mission_from_html(result['html_content'], args.url)
            if statements:
                print(f"\nFound {len(statements)} potential statement sections:")
                for s in statements:
                    print(f" - {s['type']}: {s['heading']}")
                print("\nNote: Use Claude to extract actual statement text and XPaths")
        else:
            print(f"Failed: {result.get('error', 'Unknown error')}")
    elif args.ghcid:
        try:
            custodian = load_custodian(args.ghcid)
            website = get_custodian_website(custodian)
            if not website:
                print(f"No website found for {args.ghcid}")
                sys.exit(1)
            print(f"Custodian: {args.ghcid}")
            print(f"Website: {website}")
            if args.discover_only:
                candidates = discover_mission_pages(website)
                print(f"\nPotential mission pages to check:")
                for url in candidates[:10]:
                    print(f" - {url}")
            else:
                print("\nTo extract mission statements, run with --url and specify the mission page URL")
                print("Example:")
                print(f" python {sys.argv[0]} {args.ghcid} --url {website}/missie-en-visie")
        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
    elif args.batch:
        # Previously accepted but silently ignored — fail loudly instead
        # of exiting 0 having done nothing.
        print("Error: --batch mode is not implemented yet")
        sys.exit(1)
if __name__ == '__main__':
main()