# glam/scripts/extract_custodian_name.py
# Archived-copy metadata: 2025-12-01 16:06:34 +01:00 — 430 lines, 15 KiB, Python
#!/usr/bin/env python3
"""
Extract CustodianName from archived website HTML with XPath provenance.
This script extracts the official emic institution name from archived websites
following the WebObservation provenance rules defined in AGENTS.md Rule 6.
CustodianName sources (in priority order):
1. <title> tag - Often contains "Museum Name - Tagline" pattern
2. og:site_name meta tag - Clean site/organization name
3. og:title meta tag - Page title for social sharing
4. First <h1> element - Often the main institution name
5. Footer "Over" section heading - Dutch pattern "Over [Institution]"
Output: Adds custodian_name field to entry YAML with XPath provenance
Usage:
python scripts/extract_custodian_name.py [--limit N] [--entry ENTRY_NUM] [--dry-run]
"""
from __future__ import annotations

import argparse
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml
try:
from bs4 import BeautifulSoup
HAS_DEPS = True
except ImportError:
HAS_DEPS = False
print("Warning: Missing dependency: beautifulsoup4")
print("Install with: pip install beautifulsoup4")
# Directories
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'
# Common title suffixes/taglines to strip (case-insensitive patterns)
TITLE_STRIP_PATTERNS = [
# Dutch patterns
r'\s*[-–—|]\s*(?:home(?:page)?|welkom|startpagina|website).*$',
r'\s*[-–—|]\s*(?:het\s+)?(?:museum|archief|bibliotheek|galerie).*$',
r'\s*[-–—|]\s*(?:mooie\s+)?tentoonstellingen.*$',
r'\s*[-–—|]\s*ontdek\s+.*$',
r'\s*[-–—|]\s*bezoek\s+.*$',
# English patterns
r'\s*[-–—|]\s*(?:the\s+)?official\s+(?:website|site).*$',
r'\s*[-–—|]\s*(?:welcome|home|main).*$',
r'\s*[-–—|]\s*(?:museum|archive|library|gallery).*$',
# Generic separators with taglines
r'\s*[-–—|:]\s*[^|–—-]{30,}$', # Long taglines after separator
]
def clean_institution_name(name: str) -> str:
"""
Clean a raw title/name to extract the institution name.
Removes common suffixes, taglines, and normalizes whitespace.
"""
if not name:
return ""
# Normalize whitespace first
name = ' '.join(name.split())
# Apply strip patterns
for pattern in TITLE_STRIP_PATTERNS:
name = re.sub(pattern, '', name, flags=re.IGNORECASE)
# Final cleanup
name = name.strip(' -–—|:')
name = ' '.join(name.split())
return name
def get_xpath(element) -> str:
"""Generate XPath for an element (same as fetch_website_playwright.py)."""
parts = []
while element and element.name:
siblings = element.find_previous_siblings(element.name)
index = len(siblings) + 1
parts.insert(0, f"{element.name}[{index}]")
element = element.parent
return '/' + '/'.join(parts) if parts else '/'
def extract_name_from_title(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
"""Extract institution name from <title> tag."""
title = soup.find('title')
if title and title.string:
raw_title = title.string.strip()
cleaned = clean_institution_name(raw_title)
if cleaned and len(cleaned) > 2:
return {
'claim_type': 'custodian_name',
'claim_value': cleaned,
'raw_value': raw_title,
'extraction_source': 'title_tag',
'xpath': get_xpath(title),
'html_file': html_file,
'xpath_match_score': 1.0,
}
return None
def extract_name_from_meta_og(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
"""Extract institution name from og:site_name or og:title meta tags."""
# Prefer og:site_name as it's usually the clean organization name
for og_property in ['og:site_name', 'og:title']:
meta = soup.find('meta', property=og_property)
if meta and meta.get('content'):
raw_value = meta['content'].strip()
cleaned = clean_institution_name(raw_value)
if cleaned and len(cleaned) > 2:
return {
'claim_type': 'custodian_name',
'claim_value': cleaned,
'raw_value': raw_value,
'extraction_source': f'meta_{og_property.replace(":", "_")}',
'xpath': get_xpath(meta),
'html_file': html_file,
'xpath_match_score': 1.0,
}
return None
def extract_name_from_h1(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
"""Extract institution name from first <h1> element."""
h1 = soup.find('h1')
if h1:
text = h1.get_text(strip=True)
if text and len(text) > 2 and len(text) < 100:
return {
'claim_type': 'custodian_name',
'claim_value': text,
'raw_value': text,
'extraction_source': 'h1_tag',
'xpath': get_xpath(h1),
'html_file': html_file,
'xpath_match_score': 1.0,
}
return None
def extract_name_from_footer_over(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
"""
Extract institution name from Dutch "Over [Institution]" pattern.
Common pattern in Dutch museum websites:
- Footer section heading: "Over Museum Catharijneconvent"
- Link text: "Over het museum"
"""
# Look for h3 tags in footer containing "Over" pattern
footer = soup.find('footer')
if footer:
for h3 in footer.find_all('h3'):
text = h3.get_text(strip=True)
match = re.match(r'^Over\s+(.+)$', text, re.IGNORECASE)
if match:
name = match.group(1).strip()
if name and len(name) > 2 and name.lower() not in ['ons', 'het museum', 'de organisatie']:
return {
'claim_type': 'custodian_name',
'claim_value': name,
'raw_value': text,
'extraction_source': 'footer_over_heading',
'xpath': get_xpath(h3),
'html_file': html_file,
'xpath_match_score': 0.9, # Slightly lower confidence
}
return None
def extract_name_from_metadata_title(metadata: dict) -> Optional[dict]:
"""Extract institution name from metadata.yaml pages[0].title."""
pages = metadata.get('pages', [])
if pages and pages[0].get('title'):
raw_title = pages[0]['title']
cleaned = clean_institution_name(raw_title)
if cleaned and len(cleaned) > 2:
html_file = pages[0].get('archived_file', 'unknown')
return {
'claim_type': 'custodian_name',
'claim_value': cleaned,
'raw_value': raw_title,
'extraction_source': 'metadata_page_title',
'html_file': html_file,
'xpath_match_score': 1.0, # Title is reliable
}
return None
def extract_custodian_name(
html_content: str,
html_file_path: str,
metadata: Optional[dict] = None
) -> Optional[dict]:
"""
Extract CustodianName from HTML content with XPath provenance.
Tries multiple sources in priority order:
1. metadata.yaml page title (if available, most reliable)
2. <title> tag
3. og:site_name / og:title meta tags
4. First <h1> element
5. Footer "Over [Name]" pattern
Returns dict with claim_type, claim_value, xpath, html_file, etc.
"""
# Try metadata page title first (already extracted by archiver)
if metadata:
result = extract_name_from_metadata_title(metadata)
if result:
return result
# Parse HTML
soup = BeautifulSoup(html_content, 'html.parser')
# Try sources in priority order
extractors = [
extract_name_from_title,
extract_name_from_meta_og,
extract_name_from_h1,
extract_name_from_footer_over,
]
for extractor in extractors:
result = extractor(soup, html_file_path)
if result:
return result
return None
def get_web_archive_path(entry_data: dict, entry_num: str) -> Optional[Path]:
"""Get the web archive directory path for an entry."""
web_enrichment = entry_data.get('web_enrichment', {})
web_archives = web_enrichment.get('web_archives', [])
if web_archives:
# Use first archive
archive = web_archives[0]
directory = archive.get('directory')
if directory:
return ENTRIES_DIR / directory
# Fallback: look for directory in web/{entry_num}/
entry_web_dir = WEB_DIR / entry_num
if entry_web_dir.exists():
subdirs = [d for d in entry_web_dir.iterdir() if d.is_dir()]
if subdirs:
return subdirs[0]
return None
def load_html_and_metadata(archive_path: Path) -> tuple[Optional[str], Optional[dict]]:
"""Load HTML content and metadata from archive directory."""
html_content = None
metadata = None
# Load metadata
metadata_file = archive_path / 'metadata.yaml'
if metadata_file.exists():
try:
with open(metadata_file, 'r', encoding='utf-8') as f:
metadata = yaml.safe_load(f)
except Exception as e:
print(f" Warning: Failed to load {metadata_file}: {e}")
# Load HTML from pages/ or rendered.html
html_paths = [
archive_path / 'pages' / 'index.html',
archive_path / 'rendered.html',
]
# Also check mirror directories for legacy archives
mirror_dir = archive_path / 'mirror'
if mirror_dir.exists():
for subdir in mirror_dir.iterdir():
if subdir.is_dir():
html_paths.append(subdir / 'index.html')
for html_path in html_paths:
if html_path.exists():
try:
with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
html_content = f.read()
break
except Exception as e:
print(f" Warning: Failed to load {html_path}: {e}")
return html_content, metadata
def extract_entry_number(filename: str) -> str:
"""Extract entry number from filename."""
match = re.match(r'^(\d+)', filename)
return match.group(1) if match else filename.replace('.yaml', '')
def process_entry(filepath: Path, dry_run: bool = False) -> tuple[bool, Optional[str], list[str]]:
"""
Process a single entry file to extract CustodianName.
Returns: (extracted, custodian_name, errors)
"""
with open(filepath, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
if not data:
return False, None, ["Empty file"]
# Skip if already has custodian_name
if data.get('custodian_name') and data['custodian_name'].get('claim_value'):
return False, data['custodian_name'].get('claim_value'), []
entry_num = extract_entry_number(filepath.name)
errors = []
# Get web archive path
archive_path = get_web_archive_path(data, entry_num)
if not archive_path or not archive_path.exists():
return False, None, [f"No web archive found for entry {entry_num}"]
# Load HTML and metadata
html_content, metadata = load_html_and_metadata(archive_path)
if not html_content and not metadata:
return False, None, [f"No HTML content or metadata in {archive_path}"]
# Determine HTML file path for provenance
html_file_path = str(archive_path.relative_to(ENTRIES_DIR))
if metadata and metadata.get('pages'):
html_file_path = metadata['pages'][0].get('archived_file', html_file_path)
# Extract custodian name
name_claim = extract_custodian_name(html_content or '', html_file_path, metadata)
if not name_claim:
return False, None, [f"Could not extract CustodianName from {entry_num}"]
# Add provenance metadata
name_claim['source_url'] = data.get('web_enrichment', {}).get('web_archives', [{}])[0].get('url', '')
if not name_claim['source_url']:
name_claim['source_url'] = data.get('original_entry', {}).get('webadres_organisatie', '')
name_claim['retrieved_on'] = metadata.get('archive_timestamp') if metadata else None
name_claim['extraction_timestamp'] = datetime.now(timezone.utc).isoformat()
if not dry_run:
# Add custodian_name to entry data
data['custodian_name'] = name_claim
# Write back
with open(filepath, 'w', encoding='utf-8') as f:
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
return True, name_claim.get('claim_value'), errors
def main():
parser = argparse.ArgumentParser(description='Extract CustodianName from archived websites')
parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
parser.add_argument('--entry', type=str, default=None, help='Process specific entry number')
parser.add_argument('--dry-run', action='store_true', help='Show what would be done without writing')
parser.add_argument('--force', action='store_true', help='Re-extract even if custodian_name exists')
args = parser.parse_args()
if not HAS_DEPS:
print("Error: Required dependency beautifulsoup4 not installed.")
print("Run: pip install beautifulsoup4")
return 1
# Find entry files
if args.entry:
files = list(ENTRIES_DIR.glob(f'{args.entry}*.yaml'))
else:
files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])
if args.limit:
files = files[:args.limit]
total_extracted = 0
total_skipped = 0
total_failed = 0
print(f"Processing {len(files)} entries...")
for filepath in files:
if filepath.is_dir():
continue
# Skip if already has custodian_name (unless --force)
if not args.force:
with open(filepath, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
if data and data.get('custodian_name', {}).get('claim_value'):
total_skipped += 1
continue
extracted, name, errors = process_entry(filepath, dry_run=args.dry_run)
if extracted:
total_extracted += 1
print(f"{filepath.name}: {name}")
elif name:
total_skipped += 1 # Already had name
else:
total_failed += 1
for e in errors:
print(f"{filepath.name}: {e}")
print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
print(f" Extracted: {total_extracted}")
print(f" Skipped (already have name): {total_skipped}")
print(f" Failed (no archive/name): {total_failed}")
return 0 if total_failed == 0 else 1
if __name__ == '__main__':
sys.exit(main())