#!/usr/bin/env python3
|
|
"""
|
|
Extract mission statements from heritage custodian websites.
|
|
|
|
This script:
|
|
1. Reads a custodian YAML file
|
|
2. Discovers mission/vision/about pages on the website
|
|
3. Extracts mission, vision, and goal statements
|
|
4. Saves the archived HTML and metadata
|
|
5. Updates the custodian YAML with mission_statement data
|
|
|
|
Usage:
|
|
python scripts/extract_mission_statement.py NL-ZH-ZUI-M-LMT
|
|
python scripts/extract_mission_statement.py --batch NL-NH # All Noord-Holland custodians
|
|
python scripts/extract_mission_statement.py --url https://example.org/about # Direct URL
|
|
|
|
Requirements:
|
|
- playwright (pip install playwright && playwright install chromium)
|
|
- pyyaml
|
|
- httpx
|
|
"""
|
|
|
|
import argparse
|
|
import hashlib
|
|
import base64
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import yaml
|
|
|
|
# Add project root to path so project-local modules resolve when this script
# is run directly from scripts/.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Common Dutch mission page URL patterns, tried in order during discovery.
# Specific mission/vision paths come first; generic about-pages act as a
# fallback near the end of the list.
DUTCH_MISSION_PATTERNS = [
    "/missie",
    "/missie-en-visie",
    "/missie-visie",
    "/onze-missie",
    "/over-ons/missie",
    "/over/missie",
    "/visie",
    "/doelstellingen",
    "/about/mission",
    "/about-us/mission",
    "/about",
    "/over-ons",
    "/over",
    "/organisatie",
    "/wie-zijn-wij",
    "/het-museum/missie",
    "/het-museum/over",
    "/museum/missie",
]

# Keywords that indicate mission/vision content (Dutch).
MISSION_KEYWORDS_NL = [
    "missie", "visie", "doelstelling", "doelen", "ambitie",
    "waar we voor staan", "onze opdracht", "ons doel",
    "wat willen we", "wie zijn wij"
]

# Keywords that indicate mission/vision content (English).
MISSION_KEYWORDS_EN = [
    "mission", "vision", "goals", "objectives", "purpose",
    "what we do", "our aim", "about us"
]
|
|
|
|
|
|
def compute_content_hash(text: str) -> str:
|
|
"""Compute SHA-256 hash of text in SRI format."""
|
|
sha256_hash = hashlib.sha256(text.encode('utf-8')).digest()
|
|
b64_hash = base64.b64encode(sha256_hash).decode('ascii')
|
|
return f"sha256-{b64_hash}"
|
|
|
|
|
|
def load_custodian(ghcid: str) -> dict:
    """Load a custodian YAML record by its GHCID.

    Resolution order:
      1. exact filename ``<ghcid>.yaml``
      2. any file whose name starts with the GHCID, e.g.
         ``NL-GE-AAL-M-NOM-nationaal_onderduikmuseum.yaml``

    Args:
        ghcid: Custodian identifier, e.g. ``NL-ZH-ZUI-M-LMT``.

    Returns:
        The parsed YAML document as a dict.

    Raises:
        FileNotFoundError: when no matching YAML file exists.
    """
    custodian_dir = PROJECT_ROOT / "data" / "custodian"

    # Exact match wins; otherwise fall back to prefix matches. glob() order
    # is unspecified, so sort to make the pick deterministic across runs
    # when several files share the prefix.
    exact = custodian_dir / f"{ghcid}.yaml"
    if exact.exists():
        candidates = [exact]
    else:
        candidates = sorted(custodian_dir.glob(f"{ghcid}*.yaml"))

    for path in candidates:
        with open(path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    raise FileNotFoundError(f"No custodian file found for GHCID: {ghcid}")
|
|
|
|
|
|
def get_custodian_website(custodian: dict) -> Optional[str]:
    """Return the first usable http(s) website URL from a custodian record.

    Several enrichment sections may each record a website; they are probed
    in a fixed priority order and the first value starting with ``http``
    wins. Returns ``None`` when no usable URL is present.
    """
    # (parent..., leaf) key paths to probe, highest priority first.
    key_paths = [
        ('website',),
        ('wikidata_enrichment', 'official_website'),
        ('google_maps_enrichment', 'website'),
        ('location', 'website'),
        ('original_entry', 'website'),
    ]

    for keys in key_paths:
        node = custodian
        try:
            # Walk intermediate levels with a {} default so a missing
            # section simply yields None at the leaf.
            for key in keys[:-1]:
                node = node.get(key, {})
            candidate = node.get(keys[-1])
            if candidate and candidate.startswith('http'):
                return candidate
        except (KeyError, TypeError):
            # Malformed section: move on to the next location.
            continue

    return None
|
|
|
|
|
|
def discover_mission_pages(base_url: str, patterns: Optional[list[str]] = None) -> list[str]:
    """
    Discover potential mission/vision pages on a website.

    Args:
        base_url: The site's root URL; any trailing slash is removed so the
            path patterns join cleanly.
        patterns: URL path patterns to append (each must start with ``/``).
            Defaults to ``DUTCH_MISSION_PATTERNS``.

    Returns:
        Candidate URLs to check, each pattern emitted both without and with
        a trailing slash (some sites only serve one form).
    """
    # BUG FIX: the original stripped the trailing slash only when the URL
    # did NOT end with one (inverted check, a no-op), so inputs like
    # 'https://example.org/' produced 'https://example.org//missie'.
    # Strip unconditionally instead.
    base_url = base_url.rstrip('/')

    if patterns is None:
        patterns = DUTCH_MISSION_PATTERNS

    candidates: list[str] = []
    for pattern in patterns:
        candidates.append(f"{base_url}{pattern}")
        # Also try with trailing slash
        candidates.append(f"{base_url}{pattern}/")

    return candidates
|
|
|
|
|
|
def extract_mission_from_html(html: str, url: str) -> list[dict]:
    """
    Heuristically locate mission/vision/goal sections in rendered HTML.

    Scans ``<h2>``/``<h3>`` headings for Dutch/English mission-related
    keywords and flags each match for a later LLM pass that extracts the
    actual statement text.

    Args:
        html: Rendered HTML of a candidate mission page.
        url: Source URL (kept in the signature for callers; not read here).

    Returns:
        One dict per matching heading with keys ``type`` (``mission``,
        ``vision`` or ``goal``), ``heading`` (lower-cased plain text) and
        ``needs_extraction`` (always True).
    """
    statements = []

    # Simple heuristic extraction - in production, use an LLM (e.g. Claude)
    # for intelligent extraction and classification.
    # NOTE: the redundant function-local `import re` was removed; the module
    # already imports `re` at the top of the file.

    # Find <h2>/<h3> sections; DOTALL lets headings span multiple lines.
    heading_pattern = r'<h[23][^>]*>(.*?)</h[23]>'
    headings = re.findall(heading_pattern, html, re.IGNORECASE | re.DOTALL)

    for heading in headings:
        # Strip inline tags, then normalise for keyword matching.
        heading_text = re.sub(r'<[^>]+>', '', heading).strip().lower()

        # Classify by keyword; mission is checked first so it wins over the
        # broader matches further down.
        statement_type = None
        if any(kw in heading_text for kw in ['missie', 'mission']):
            statement_type = 'mission'
        elif any(kw in heading_text for kw in ['visie', 'vision']):
            statement_type = 'vision'
        elif any(kw in heading_text for kw in ['doel', 'goal', 'objective', 'ambitie']):
            statement_type = 'goal'

        if statement_type:
            statements.append({
                'type': statement_type,
                'heading': heading_text,
                'needs_extraction': True  # flag for LLM extraction
            })

    return statements
|
|
|
|
|
|
async def fetch_and_archive_page(url: str, ghcid: str) -> dict:
    """
    Fetch a page using Playwright and archive it.
    Returns metadata about the archived page.

    The rendered HTML, a full-page screenshot and a metadata.yaml are written
    under data/custodian/web/<ghcid>/<domain>/<path>/. The returned dict
    always carries 'url', 'archive_dir', 'timestamp' and 'success'; on
    success it additionally holds 'html_file' and the raw 'html_content',
    on failure an 'error' message instead.
    """
    # Playwright is an optional, heavyweight dependency: import lazily and
    # fail with an actionable message rather than a bare traceback.
    try:
        from playwright.async_api import async_playwright
    except ImportError:
        print("Error: playwright not installed. Run: pip install playwright && playwright install chromium")
        sys.exit(1)

    # Timezone-aware UTC timestamp, serialized in ISO-8601 'Z' form.
    timestamp = datetime.now(timezone.utc)
    timestamp_str = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')

    # Parse URL for directory structure
    from urllib.parse import urlparse
    parsed = urlparse(url)
    domain = parsed.netloc
    path = parsed.path.strip('/') or 'index'  # site root archives as 'index'

    # Create archive directory (mirrors the page's URL structure on disk)
    archive_dir = PROJECT_ROOT / "data" / "custodian" / "web" / ghcid / domain / path
    archive_dir.mkdir(parents=True, exist_ok=True)

    # Result skeleton; 'success' flips to True only after all artifacts save.
    result = {
        'url': url,
        'archive_dir': str(archive_dir.relative_to(PROJECT_ROOT)),
        'timestamp': timestamp_str,
        'success': False
    }

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        try:
            # 'networkidle' waits for JS-rendered content; 30 s hard timeout.
            response = await page.goto(url, wait_until='networkidle', timeout=30000)

            if response and response.ok:
                # Get rendered HTML (post-JavaScript DOM, not the raw body)
                html_content = await page.content()

                # Save rendered HTML
                html_path = archive_dir / 'rendered.html'
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(html_content)

                # Take screenshot
                screenshot_path = archive_dir / 'screenshot.png'
                await page.screenshot(path=str(screenshot_path), full_page=True)

                # Save metadata describing the capture alongside the artifacts
                metadata = {
                    'url': url,
                    'retrieved_on': timestamp_str,
                    'status_code': response.status,
                    'content_type': response.headers.get('content-type', ''),
                    'files': {
                        'html': 'rendered.html',
                        'screenshot': 'screenshot.png'
                    }
                }

                metadata_path = archive_dir / 'metadata.yaml'
                with open(metadata_path, 'w', encoding='utf-8') as f:
                    yaml.dump(metadata, f, default_flow_style=False, allow_unicode=True)

                result['success'] = True
                result['html_file'] = str(html_path.relative_to(PROJECT_ROOT))
                result['html_content'] = html_content

        except Exception as e:
            # Best-effort archiver: record the failure in the result rather
            # than crashing a batch run.
            result['error'] = str(e)

        await browser.close()

    return result
|
|
|
|
|
|
def create_mission_statement_entry(
    statement_type: str,
    statement_text: str,
    ghcid: str,
    source_url: str,
    retrieved_on: str,
    xpath: str,
    html_file: str,
    page_section: Optional[str] = None,
    summary: Optional[str] = None,
    confidence: float = 0.90
) -> dict:
    """Assemble a mission_statement entry following the LinkML schema.

    The entry carries the statement itself, provenance (source URL, archive
    file, retrieval time), the extraction agent/confidence and an SRI-style
    content hash over the statement text. ``page_section`` and ``summary``
    are only written when provided.
    """
    # The statement ID is stable per custodian/type/year, so re-runs within
    # the same year address the same entry.
    year = datetime.now().year
    statement_id = (
        f"https://nde.nl/ontology/hc/mission/{ghcid.lower()}/{statement_type}-{year}"
    )

    entry = {
        'statement_id': statement_id,
        'statement_type': statement_type,
        'statement_text': statement_text,
        'statement_language': 'nl',  # Default to Dutch for NL custodians
        'source_url': source_url,
        'retrieved_on': retrieved_on,
        'xpath': xpath,
        'html_file': html_file,
        'extraction_agent': 'claude-opus-4',
        'extraction_timestamp': retrieved_on,
        'extraction_confidence': confidence,
        'content_hash': {
            'algorithm': 'sha256',
            'value': compute_content_hash(statement_text),
            'scope': 'statement_text'
        },
        'prov': {
            'wasDerivedFrom': source_url,
            'wasAttributedTo': 'unknown',  # To be filled with organization name
            'generatedAtTime': retrieved_on
        }
    }

    # Optional slots: only emitted when truthy, matching the schema's
    # optional fields.
    if page_section:
        entry['page_section'] = page_section
    if summary:
        entry['statement_summary'] = summary

    return entry
|
|
|
|
|
|
def _discover_for_custodian(ghcid: str, discover_only: bool) -> None:
    """Resolve a custodian's website and suggest mission-page URLs.

    Raises FileNotFoundError (from load_custodian) when the GHCID has no
    YAML file; exits with status 1 when the record has no website.
    """
    custodian = load_custodian(ghcid)
    website = get_custodian_website(custodian)

    if not website:
        print(f"No website found for {ghcid}")
        sys.exit(1)

    print(f"Custodian: {ghcid}")
    print(f"Website: {website}")

    if discover_only:
        candidates = discover_mission_pages(website)
        print("\nPotential mission pages to check:")
        for url in candidates[:10]:
            print(f"  - {url}")
    else:
        print("\nTo extract mission statements, run with --url and specify the mission page URL")
        print("Example:")
        print(f"  python {sys.argv[0]} {ghcid} --url {website}/missie-en-visie")


def _fetch_and_extract(ghcid: str, url: str) -> None:
    """Archive *url* for *ghcid* and report the statement sections found."""
    import asyncio

    print(f"Fetching: {url}")
    result = asyncio.run(fetch_and_archive_page(url, ghcid))

    if result['success']:
        print(f"Archived to: {result['archive_dir']}")
        print(f"HTML file: {result['html_file']}")

        # Extract potential statements (simplified heuristic pass)
        statements = extract_mission_from_html(result['html_content'], url)
        if statements:
            print(f"\nFound {len(statements)} potential statement sections:")
            for s in statements:
                print(f"  - {s['type']}: {s['heading']}")
            print("\nNote: Use Claude to extract actual statement text and XPaths")
    else:
        print(f"Failed: {result.get('error', 'Unknown error')}")


def main():
    """CLI entry point: discover or extract custodian mission statements."""
    parser = argparse.ArgumentParser(
        description='Extract mission statements from heritage custodian websites'
    )
    parser.add_argument(
        'ghcid',
        nargs='?',
        help='GHCID of the custodian to process (e.g., NL-ZH-ZUI-M-LMT)'
    )
    parser.add_argument(
        '--url',
        help='Direct URL to process (skips website discovery)'
    )
    parser.add_argument(
        '--batch',
        help='Process all custodians matching prefix (e.g., NL-NH for Noord-Holland)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--discover-only',
        action='store_true',
        help='Only discover mission pages, do not extract'
    )

    args = parser.parse_args()

    if not args.ghcid and not args.batch and not args.url:
        parser.print_help()
        sys.exit(1)

    # BUG FIX: the original dispatched on `if args.ghcid:` before
    # `elif args.url and args.ghcid:`, so the URL fetch/extract branch was
    # unreachable dead code. Test the more specific combination first.
    if args.ghcid and args.url:
        _fetch_and_extract(args.ghcid, args.url)
    elif args.ghcid:
        try:
            _discover_for_custodian(args.ghcid, args.discover_only)
        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
    # NOTE(review): --batch and --dry-run are accepted but not implemented
    # in the visible code; they fall through with no action, as before.
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|