glam/scripts/extract_mission_statement.py
2025-12-30 03:43:31 +01:00

398 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Extract mission statements from heritage custodian websites.
This script:
1. Reads a custodian YAML file
2. Discovers mission/vision/about pages on the website
3. Extracts mission, vision, and goal statements
4. Saves the archived HTML and metadata
5. Updates the custodian YAML with mission_statement data
Usage:
python scripts/extract_mission_statement.py NL-ZH-ZUI-M-LMT
python scripts/extract_mission_statement.py --batch NL-NH # All Noord-Holland custodians
python scripts/extract_mission_statement.py --url https://example.org/about # Direct URL
Requirements:
- playwright (pip install playwright && playwright install chromium)
- pyyaml
- httpx
"""
import argparse
import hashlib
import base64
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import yaml
# Make the project root importable when this file is run directly as a
# script (python scripts/extract_mission_statement.py ...).
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Common Dutch mission page URL patterns, tried as path suffixes on a
# custodian's base URL (roughly most-specific first).
DUTCH_MISSION_PATTERNS = [
    "/missie",
    "/missie-en-visie",
    "/missie-visie",
    "/onze-missie",
    "/over-ons/missie",
    "/over/missie",
    "/visie",
    "/doelstellingen",
    "/about/mission",
    "/about-us/mission",
    "/about",
    "/over-ons",
    "/over",
    "/organisatie",
    "/wie-zijn-wij",
    "/het-museum/missie",
    "/het-museum/over",
    "/museum/missie",
]

# Keywords that indicate mission/vision content (Dutch).
# NOTE(review): not referenced in this chunk — presumably used by a later
# LLM-extraction step; confirm before removing.
MISSION_KEYWORDS_NL = [
    "missie", "visie", "doelstelling", "doelen", "ambitie",
    "waar we voor staan", "onze opdracht", "ons doel",
    "wat willen we", "wie zijn wij"
]

# English-language equivalents of the keywords above.
MISSION_KEYWORDS_EN = [
    "mission", "vision", "goals", "objectives", "purpose",
    "what we do", "our aim", "about us"
]
def compute_content_hash(text: str) -> str:
    """Return the SHA-256 hash of *text* in SRI (Subresource Integrity) form.

    The text is UTF-8 encoded, hashed, and the raw digest is base64
    encoded, yielding a string like ``sha256-<base64>``.
    """
    digest = hashlib.sha256(text.encode('utf-8')).digest()
    encoded = base64.b64encode(digest).decode('ascii')
    return "sha256-" + encoded
def load_custodian(ghcid: str) -> dict:
    """Load and parse the custodian YAML record for *ghcid*.

    Tries an exact ``<GHCID>.yaml`` filename first, then falls back to the
    first file whose name begins with the GHCID (slug-suffixed variants
    such as ``NL-GE-AAL-M-NOM-nationaal_onderduikmuseum.yaml``).

    Raises:
        FileNotFoundError: if no matching YAML file exists.
    """
    custodian_dir = PROJECT_ROOT / "data" / "custodian"
    exact_path = custodian_dir / f"{ghcid}.yaml"
    if exact_path.exists():
        candidates = [exact_path]
    else:
        candidates = custodian_dir.glob(f"{ghcid}*.yaml")
    for candidate in candidates:
        with open(candidate, 'r', encoding='utf-8') as fh:
            return yaml.safe_load(fh)
    raise FileNotFoundError(f"No custodian file found for GHCID: {ghcid}")
def get_custodian_website(custodian: dict) -> Optional[str]:
    """Return the primary website URL from a custodian record, or None.

    Several known locations are probed in priority order; the first value
    that looks like an absolute HTTP(S) URL (starts with 'http') wins.
    """
    locations = [
        lambda c: c.get('website'),
        lambda c: c.get('wikidata_enrichment', {}).get('official_website'),
        lambda c: c.get('google_maps_enrichment', {}).get('website'),
        lambda c: c.get('location', {}).get('website'),
        lambda c: c.get('original_entry', {}).get('website'),
    ]
    for getter in locations:
        try:
            url = getter(custodian)
        # BUG FIX: AttributeError added — a key explicitly set to null in
        # the YAML (e.g. `wikidata_enrichment: null`) makes `.get()` run
        # on None and raise AttributeError, which previously escaped.
        except (KeyError, TypeError, AttributeError):
            continue
        if url and url.startswith('http'):
            return url
    return None
def discover_mission_pages(base_url: str) -> list[str]:
    """
    Generate candidate mission/vision page URLs for a website.

    Returns the list of URLs to check: every pattern in
    DUTCH_MISSION_PATTERNS appended to the base URL, each both with and
    without a trailing slash.
    """
    # BUG FIX: the trailing slash must be stripped unconditionally. The
    # original only stripped when endswith('/') was False (a no-op), so a
    # base URL like 'https://example.org/' produced 'https://example.org//missie'.
    base_url = base_url.rstrip('/')
    candidates = []
    for pattern in DUTCH_MISSION_PATTERNS:
        candidates.append(f"{base_url}{pattern}")
        # Also try with trailing slash
        candidates.append(f"{base_url}{pattern}/")
    return candidates
def extract_mission_from_html(html: str, url: str) -> list[dict]:
    """
    Heuristically locate mission/vision/goal sections in HTML content.

    Scans <h2>/<h3> headings for Dutch and English mission keywords and
    returns one dict per matching heading:
    {'type': 'mission'|'vision'|'goal', 'heading': <lowercased text>,
     'needs_extraction': True}.

    This is a simplified pre-pass — the actual statement text and XPaths
    are meant to be extracted by an LLM afterwards (hence the
    'needs_extraction' flag). *url* is accepted for interface stability
    but not used by the heuristic.
    """
    statements = []
    # `re` is imported at module level; the redundant function-local
    # `import re` has been removed.
    heading_pattern = r'<h[23][^>]*>(.*?)</h[23]>'
    headings = re.findall(heading_pattern, html, re.IGNORECASE | re.DOTALL)
    for heading in headings:
        # Strip inner tags, then normalize for keyword matching.
        heading_text = re.sub(r'<[^>]+>', '', heading).strip().lower()
        statement_type = None
        # Order matters: 'missie'/'mission' wins over 'visie'/'vision' for
        # combined headings like "missie en visie".
        if any(kw in heading_text for kw in ('missie', 'mission')):
            statement_type = 'mission'
        elif any(kw in heading_text for kw in ('visie', 'vision')):
            statement_type = 'vision'
        elif any(kw in heading_text for kw in ('doel', 'goal', 'objective', 'ambitie')):
            statement_type = 'goal'
        if statement_type:
            statements.append({
                'type': statement_type,
                'heading': heading_text,
                'needs_extraction': True  # Flag for LLM extraction
            })
    return statements
async def fetch_and_archive_page(url: str, ghcid: str) -> dict:
    """
    Fetch *url* with headless Chromium (Playwright) and archive it.

    Saves the rendered HTML, a full-page screenshot, and a metadata YAML
    file under data/custodian/web/<ghcid>/<domain>/<path>/.

    Returns a dict with 'url', 'archive_dir', 'timestamp' and 'success';
    on success also 'html_file' and 'html_content', on exception an
    'error' message. Exits the process if playwright is not installed.
    """
    try:
        from playwright.async_api import async_playwright
    except ImportError:
        # Playwright is an optional heavy dependency; fail fast with an
        # install hint rather than a traceback.
        print("Error: playwright not installed. Run: pip install playwright && playwright install chromium")
        sys.exit(1)
    timestamp = datetime.now(timezone.utc)
    timestamp_str = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')
    # Parse URL for directory structure
    from urllib.parse import urlparse
    parsed = urlparse(url)
    domain = parsed.netloc
    path = parsed.path.strip('/') or 'index'  # homepage archives as 'index'
    # Create archive directory (idempotent: re-fetching overwrites files)
    archive_dir = PROJECT_ROOT / "data" / "custodian" / "web" / ghcid / domain / path
    archive_dir.mkdir(parents=True, exist_ok=True)
    result = {
        'url': url,
        'archive_dir': str(archive_dir.relative_to(PROJECT_ROOT)),
        'timestamp': timestamp_str,
        'success': False
    }
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            # 'networkidle' waits until the page stops loading resources so
            # JS-rendered content is captured; hard cap of 30s per page.
            response = await page.goto(url, wait_until='networkidle', timeout=30000)
            if response and response.ok:
                # Get rendered (post-JS) HTML, not the raw response body
                html_content = await page.content()
                # Save rendered HTML
                html_path = archive_dir / 'rendered.html'
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(html_content)
                # Take screenshot
                screenshot_path = archive_dir / 'screenshot.png'
                await page.screenshot(path=str(screenshot_path), full_page=True)
                # Save metadata alongside the archived files
                metadata = {
                    'url': url,
                    'retrieved_on': timestamp_str,
                    'status_code': response.status,
                    'content_type': response.headers.get('content-type', ''),
                    'files': {
                        'html': 'rendered.html',
                        'screenshot': 'screenshot.png'
                    }
                }
                metadata_path = archive_dir / 'metadata.yaml'
                with open(metadata_path, 'w', encoding='utf-8') as f:
                    yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True)
                result['success'] = True
                result['html_file'] = str(html_path.relative_to(PROJECT_ROOT))
                result['html_content'] = html_content
            # NOTE(review): a non-OK HTTP response leaves success=False but
            # records no 'error', so callers report 'Unknown error'.
        except Exception as e:
            # Broad catch is deliberate here: any navigation/IO failure is
            # recorded in the result rather than raised.
            result['error'] = str(e)
        # NOTE(review): close is not in a finally block; an exception
        # raised between launch and the try would leak the browser —
        # confirm whether that path is possible in practice.
        await browser.close()
    return result
def create_mission_statement_entry(
    statement_type: str,
    statement_text: str,
    ghcid: str,
    source_url: str,
    retrieved_on: str,
    xpath: str,
    html_file: str,
    page_section: Optional[str] = None,
    summary: Optional[str] = None,
    confidence: float = 0.90
) -> dict:
    """Build a mission_statement record conforming to the LinkML schema.

    Provenance (source URL, retrieval time, XPath, archived HTML file) and
    a SHA-256 content hash over the statement text are always included;
    'page_section' and 'statement_summary' are added only when provided.
    """
    # Statement IDs are minted under the NDE heritage-custodian namespace,
    # scoped by GHCID, statement type, and the current year.
    current_year = datetime.now().year
    statement_id = (
        f"https://nde.nl/ontology/hc/mission/"
        f"{ghcid.lower()}/{statement_type}-{current_year}"
    )
    entry = {
        'statement_id': statement_id,
        'statement_type': statement_type,
        'statement_text': statement_text,
        'statement_language': 'nl',  # Default to Dutch for NL custodians
        'source_url': source_url,
        'retrieved_on': retrieved_on,
        'xpath': xpath,
        'html_file': html_file,
        'extraction_agent': 'claude-opus-4',
        'extraction_timestamp': retrieved_on,
        'extraction_confidence': confidence,
        'content_hash': {
            'algorithm': 'sha256',
            'value': compute_content_hash(statement_text),
            'scope': 'statement_text'
        },
        'prov': {
            'wasDerivedFrom': source_url,
            'wasAttributedTo': 'unknown',  # To be filled with organization name
            'generatedAtTime': retrieved_on
        }
    }
    # Optional fields (truthiness check, matching the schema's behavior
    # for empty strings).
    if page_section:
        entry['page_section'] = page_section
    if summary:
        entry['statement_summary'] = summary
    return entry
def main():
    """Parse CLI arguments and dispatch.

    Dispatch order:
      1. GHCID + --url  -> fetch, archive, and scan the given page
      2. GHCID alone    -> show the website / candidate mission pages
      3. --batch        -> not implemented yet (exits with an error)
    """
    parser = argparse.ArgumentParser(
        description='Extract mission statements from heritage custodian websites'
    )
    parser.add_argument(
        'ghcid',
        nargs='?',
        help='GHCID of the custodian to process (e.g., NL-ZH-ZUI-M-LMT)'
    )
    parser.add_argument(
        '--url',
        help='Direct URL to process (skips website discovery)'
    )
    parser.add_argument(
        '--batch',
        help='Process all custodians matching prefix (e.g., NL-NH for Noord-Holland)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--discover-only',
        action='store_true',
        help='Only discover mission pages, do not extract'
    )
    args = parser.parse_args()
    if not args.ghcid and not args.batch and not args.url:
        parser.print_help()
        sys.exit(1)
    # BUG FIX: the fetch branch used to be `elif args.url and args.ghcid`
    # placed *after* `if args.ghcid`, so it was unreachable and the
    # documented `GHCID --url ...` mode never ran. Test it first.
    if args.url and args.ghcid:
        # Fetch and archive the specified URL
        import asyncio
        print(f"Fetching: {args.url}")
        result = asyncio.run(fetch_and_archive_page(args.url, args.ghcid))
        if result['success']:
            print(f"Archived to: {result['archive_dir']}")
            print(f"HTML file: {result['html_file']}")
            # Extract potential statements (simplified heuristic pass)
            statements = extract_mission_from_html(result['html_content'], args.url)
            if statements:
                print(f"\nFound {len(statements)} potential statement sections:")
                for s in statements:
                    print(f" - {s['type']}: {s['heading']}")
                print("\nNote: Use Claude to extract actual statement text and XPaths")
        else:
            print(f"Failed: {result.get('error', 'Unknown error')}")
    elif args.ghcid:
        try:
            custodian = load_custodian(args.ghcid)
            website = get_custodian_website(custodian)
            if not website:
                print(f"No website found for {args.ghcid}")
                sys.exit(1)
            print(f"Custodian: {args.ghcid}")
            print(f"Website: {website}")
            if args.discover_only:
                candidates = discover_mission_pages(website)
                print(f"\nPotential mission pages to check:")
                for url in candidates[:10]:
                    print(f" - {url}")
            else:
                print("\nTo extract mission statements, run with --url and specify the mission page URL")
                print("Example:")
                print(f" python {sys.argv[0]} {args.ghcid} --url {website}/missie-en-visie")
        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
    elif args.batch:
        # Previously accepted but silently ignored — fail loudly instead
        # of exiting 0 having done nothing.
        print("Error: --batch mode is not implemented yet")
        sys.exit(1)
if __name__ == '__main__':
main()