#!/usr/bin/env python3
"""
Extract mission statements from heritage custodian websites.

This script:
1. Reads a custodian YAML file
2. Discovers mission/vision/about pages on the website
3. Extracts mission, vision, and goal statements
4. Saves the archived HTML and metadata
5. Updates the custodian YAML with mission_statement data

Usage:
    python scripts/extract_mission_statement.py NL-ZH-ZUI-M-LMT
    python scripts/extract_mission_statement.py --batch NL-NH  # All Noord-Holland custodians
    python scripts/extract_mission_statement.py --url https://example.org/about  # Direct URL

Requirements:
    - playwright (pip install playwright && playwright install chromium)
    - pyyaml
    - httpx
"""

import argparse
import hashlib
import base64
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml

# Add project root to path so sibling packages are importable when run as a script.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Common Dutch mission page URL patterns, tried in order against the site root.
DUTCH_MISSION_PATTERNS = [
    "/missie",
    "/missie-en-visie",
    "/missie-visie",
    "/onze-missie",
    "/over-ons/missie",
    "/over/missie",
    "/visie",
    "/doelstellingen",
    "/about/mission",
    "/about-us/mission",
    "/about",
    "/over-ons",
    "/over",
    "/organisatie",
    "/wie-zijn-wij",
    "/het-museum/missie",
    "/het-museum/over",
    "/museum/missie",
]

# Keywords that indicate mission/vision content
MISSION_KEYWORDS_NL = [
    "missie", "visie", "doelstelling", "doelen", "ambitie",
    "waar we voor staan", "onze opdracht", "ons doel",
    "wat willen we", "wie zijn wij"
]

MISSION_KEYWORDS_EN = [
    "mission", "vision", "goals", "objectives", "purpose",
    "what we do", "our aim", "about us"
]


def compute_content_hash(text: str) -> str:
    """Compute the SHA-256 hash of *text* in SRI (Subresource Integrity) format.

    Returns a string of the form ``sha256-<base64 digest>``.
    """
    sha256_hash = hashlib.sha256(text.encode('utf-8')).digest()
    b64_hash = base64.b64encode(sha256_hash).decode('ascii')
    return f"sha256-{b64_hash}"


def load_custodian(ghcid: str) -> dict:
    """Load a custodian YAML file by GHCID.

    Tries an exact ``<ghcid>.yaml`` filename first, then any file whose name
    starts with the GHCID (files may carry a descriptive suffix, e.g.
    ``NL-GE-AAL-M-NOM-nationaal_onderduikmuseum.yaml``).

    Raises:
        FileNotFoundError: if no matching YAML file exists.
    """
    custodian_dir = PROJECT_ROOT / "data" / "custodian"

    # Try direct filename match
    yaml_path = custodian_dir / f"{ghcid}.yaml"
    if yaml_path.exists():
        with open(yaml_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    # Try with suffix pattern (e.g., NL-GE-AAL-M-NOM-nationaal_onderduikmuseum.yaml)
    for path in custodian_dir.glob(f"{ghcid}*.yaml"):
        with open(path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    raise FileNotFoundError(f"No custodian file found for GHCID: {ghcid}")


def get_custodian_website(custodian: dict) -> Optional[str]:
    """Extract the primary website URL from a custodian record.

    Checks several locations in priority order (top-level, Wikidata and
    Google Maps enrichments, location block, original entry). Returns the
    first value that looks like an http(s) URL, or None if nothing is found.
    """
    locations = [
        lambda c: c.get('website'),
        lambda c: c.get('wikidata_enrichment', {}).get('official_website'),
        lambda c: c.get('google_maps_enrichment', {}).get('website'),
        lambda c: c.get('location', {}).get('website'),
        lambda c: c.get('original_entry', {}).get('website'),
    ]
    for getter in locations:
        try:
            url = getter(custodian)
            if url and url.startswith('http'):
                return url
        except (KeyError, TypeError):
            # Enrichment sub-dicts may be absent or not dict-shaped; skip them.
            continue
    return None


def discover_mission_pages(base_url: str) -> list[str]:
    """Discover potential mission/vision pages on a website.

    Combines *base_url* with each known Dutch/English mission URL pattern,
    both with and without a trailing slash.

    Returns:
        List of candidate URLs to check (not fetched here).
    """
    # BUG FIX: the original only called rstrip('/') when the URL did NOT end
    # with '/', which left trailing slashes in place and produced candidates
    # like "https://example.org//missie". Strip unconditionally instead.
    base_url = base_url.rstrip('/')

    candidates = []
    for pattern in DUTCH_MISSION_PATTERNS:
        candidates.append(f"{base_url}{pattern}")
        # Also try with trailing slash
        candidates.append(f"{base_url}{pattern}/")

    return candidates


def extract_mission_from_html(html: str, url: str) -> list[dict]:
    """Extract mission statement sections from HTML content.

    Scans h2/h3 headings for mission/vision/goal keywords (Dutch and English)
    and classifies each hit. This is a simplified heuristic extraction — in
    practice, you'd use an LLM like Claude to intelligently extract and
    classify the actual statement text.

    Args:
        html: Rendered page HTML.
        url: Source URL (currently unused; kept for future provenance use).

    Returns:
        List of dicts with keys ``type`` ('mission'/'vision'/'goal'),
        ``heading`` (lower-cased heading text), and ``needs_extraction``.
    """
    statements = []

    # Find sections with mission-related h2/h3 headings.
    # NOTE(review): the original pattern was garbled (HTML tags lost in a
    # formatting mishap); reconstructed here from the "h2/h3 headings" intent.
    heading_pattern = r'<h[23][^>]*>(.*?)</h[23]>'
    headings = re.findall(heading_pattern, html, re.IGNORECASE | re.DOTALL)

    for heading in headings:
        # Strip any inline tags inside the heading, then normalize.
        heading_text = re.sub(r'<[^>]+>', '', heading).strip().lower()

        statement_type = None
        if any(kw in heading_text for kw in ['missie', 'mission']):
            statement_type = 'mission'
        elif any(kw in heading_text for kw in ['visie', 'vision']):
            statement_type = 'vision'
        elif any(kw in heading_text for kw in ['doel', 'goal', 'objective', 'ambitie']):
            statement_type = 'goal'

        if statement_type:
            statements.append({
                'type': statement_type,
                'heading': heading_text,
                'needs_extraction': True  # Flag for LLM extraction
            })

    return statements


async def fetch_and_archive_page(url: str, ghcid: str) -> dict:
    """Fetch a page using Playwright and archive it under the custodian.

    Saves the rendered HTML, a full-page screenshot, and a metadata YAML file
    under ``data/custodian/web/<ghcid>/<domain>/<path>/``.

    Returns:
        Metadata dict with ``url``, ``archive_dir``, ``timestamp``,
        ``success``, and on success also ``html_file`` and ``html_content``
        (or ``error`` on failure).
    """
    try:
        from playwright.async_api import async_playwright
    except ImportError:
        print("Error: playwright not installed. Run: pip install playwright && playwright install chromium")
        sys.exit(1)

    timestamp = datetime.now(timezone.utc)
    timestamp_str = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')

    # Parse URL for directory structure
    from urllib.parse import urlparse
    parsed = urlparse(url)
    domain = parsed.netloc
    path = parsed.path.strip('/') or 'index'

    # Create archive directory
    archive_dir = PROJECT_ROOT / "data" / "custodian" / "web" / ghcid / domain / path
    archive_dir.mkdir(parents=True, exist_ok=True)

    result = {
        'url': url,
        'archive_dir': str(archive_dir.relative_to(PROJECT_ROOT)),
        'timestamp': timestamp_str,
        'success': False
    }

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            # networkidle waits for the page's network activity to settle so
            # JS-rendered content is present in page.content().
            response = await page.goto(url, wait_until='networkidle', timeout=30000)
            if response and response.ok:
                # Get rendered HTML
                html_content = await page.content()

                # Save rendered HTML
                html_path = archive_dir / 'rendered.html'
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(html_content)

                # Take screenshot
                screenshot_path = archive_dir / 'screenshot.png'
                await page.screenshot(path=str(screenshot_path), full_page=True)

                # Save metadata
                metadata = {
                    'url': url,
                    'retrieved_on': timestamp_str,
                    'status_code': response.status,
                    'content_type': response.headers.get('content-type', ''),
                    'files': {
                        'html': 'rendered.html',
                        'screenshot': 'screenshot.png'
                    }
                }
                metadata_path = archive_dir / 'metadata.yaml'
                with open(metadata_path, 'w', encoding='utf-8') as f:
                    yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True)

                result['success'] = True
                result['html_file'] = str(html_path.relative_to(PROJECT_ROOT))
                result['html_content'] = html_content
        except Exception as e:
            # Best-effort archiving: record the failure instead of crashing.
            result['error'] = str(e)
        await browser.close()

    return result


def create_mission_statement_entry(
    statement_type: str,
    statement_text: str,
    ghcid: str,
    source_url: str,
    retrieved_on: str,
    xpath: str,
    html_file: str,
    page_section: Optional[str] = None,
    summary: Optional[str] = None,
    confidence: float = 0.90
) -> dict:
    """Create a mission_statement entry following the LinkML schema.

    Args:
        statement_type: One of 'mission', 'vision', 'goal'.
        statement_text: The extracted statement text (hashed for integrity).
        ghcid: Custodian identifier, lower-cased into the statement ID.
        source_url: Page the statement was taken from.
        retrieved_on: ISO-8601 retrieval timestamp.
        xpath: XPath of the statement within the archived HTML.
        html_file: Repo-relative path to the archived HTML file.
        page_section: Optional section label within the page.
        summary: Optional short summary of the statement.
        confidence: Extraction confidence in [0, 1]; defaults to 0.90.

    Returns:
        Dict shaped for the mission_statement LinkML schema, including a
        content hash and PROV provenance block.
    """
    # Generate statement ID
    year = datetime.now().year
    statement_id = f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{statement_type}-{year}"

    # Compute content hash
    content_hash = compute_content_hash(statement_text)

    entry = {
        'statement_id': statement_id,
        'statement_type': statement_type,
        'statement_text': statement_text,
        'statement_language': 'nl',  # Default to Dutch for NL custodians
        'source_url': source_url,
        'retrieved_on': retrieved_on,
        'xpath': xpath,
        'html_file': html_file,
        'extraction_agent': 'claude-opus-4',
        'extraction_timestamp': retrieved_on,
        'extraction_confidence': confidence,
        'content_hash': {
            'algorithm': 'sha256',
            'value': content_hash,
            'scope': 'statement_text'
        },
        'prov': {
            'wasDerivedFrom': source_url,
            'wasAttributedTo': 'unknown',  # To be filled with organization name
            'generatedAtTime': retrieved_on
        }
    }

    if page_section:
        entry['page_section'] = page_section
    if summary:
        entry['statement_summary'] = summary

    return entry


def main():
    """CLI entry point: discover, fetch, and archive mission pages."""
    parser = argparse.ArgumentParser(
        description='Extract mission statements from heritage custodian websites'
    )
    parser.add_argument(
        'ghcid',
        nargs='?',
        help='GHCID of the custodian to process (e.g., NL-ZH-ZUI-M-LMT)'
    )
    parser.add_argument(
        '--url',
        help='Direct URL to process (skips website discovery)'
    )
    parser.add_argument(
        '--batch',
        help='Process all custodians matching prefix (e.g., NL-NH for Noord-Holland)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--discover-only',
        action='store_true',
        help='Only discover mission pages, do not extract'
    )

    args = parser.parse_args()

    if not args.ghcid and not args.batch and not args.url:
        parser.print_help()
        sys.exit(1)

    # BUG FIX: the URL-fetching branch was originally placed in an
    # `elif args.url and args.ghcid:` AFTER `if args.ghcid:`, making it
    # unreachable — the advertised "GHCID --url URL" mode never ran.
    # Check the combined mode first.
    if args.url and args.ghcid:
        # Fetch and archive the specified URL
        import asyncio

        print(f"Fetching: {args.url}")
        result = asyncio.run(fetch_and_archive_page(args.url, args.ghcid))

        if result['success']:
            print(f"Archived to: {result['archive_dir']}")
            print(f"HTML file: {result['html_file']}")

            # Extract potential statements (simplified)
            statements = extract_mission_from_html(result['html_content'], args.url)
            if statements:
                print(f"\nFound {len(statements)} potential statement sections:")
                for s in statements:
                    print(f"  - {s['type']}: {s['heading']}")
                print("\nNote: Use Claude to extract actual statement text and XPaths")
        else:
            print(f"Failed: {result.get('error', 'Unknown error')}")
    elif args.ghcid:
        try:
            custodian = load_custodian(args.ghcid)
            website = get_custodian_website(custodian)

            if not website:
                print(f"No website found for {args.ghcid}")
                sys.exit(1)

            print(f"Custodian: {args.ghcid}")
            print(f"Website: {website}")

            if args.discover_only:
                candidates = discover_mission_pages(website)
                print(f"\nPotential mission pages to check:")
                for url in candidates[:10]:
                    print(f"  - {url}")
            else:
                print("\nTo extract mission statements, run with --url and specify the mission page URL")
                print("Example:")
                print(f"  python {sys.argv[0]} {args.ghcid} --url {website}/missie-en-visie")
        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
    elif args.url:
        # Previously this case fell through silently; fail loudly instead.
        print("Error: --url requires a GHCID so the page can be archived under a custodian")
        sys.exit(1)
    # TODO: --batch is accepted but not yet implemented.


if __name__ == '__main__':
    main()