glam/scripts/batch_crawl4ai_recrawl.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00


#!/usr/bin/env python3
"""
Batch re-crawl failed URLs using crawl4ai (free, local) and transform to
digital_platform_v2.

This script:
1. Reads the list of failed crawl URLs
2. Uses crawl4ai to fetch content (free, no API limits)
3. Transforms results to digital_platform_v2 format
4. Updates the custodian YAML files

Usage:
    python scripts/batch_crawl4ai_recrawl.py --limit 100 --start 0
"""
import argparse
import asyncio
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import yaml
from crawl4ai import AsyncWebCrawler

# Configuration
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt")
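
# FAILED_URLS_FILE is expected to hold one entry per line, tab-separated:
#   <custodian_yaml_filename>\t<url>
# (see load_failed_urls() below)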

# Platform type detection patterns
PLATFORM_PATTERNS = {
    'DISCOVERY_PORTAL': [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/archief', r'/archive',
        r'/beeldbank', r'/images', r'/foto', r'/photo',
    ],
    'DIGITAL_ARCHIVE': [
        r'archieven\.nl', r'archief', r'archive',
        r'/inventaris', r'/inventory', r'/toegang',
    ],
    'EDUCATION': [
        r'/educatie', r'/education', r'/onderwijs', r'/leren',
        r'/scholen', r'/schools', r'/lesmateriaal',
    ],
    'INSTITUTIONAL_WEBSITE': [
        r'/over-ons', r'/about', r'/contact', r'/bezoek',
        r'/visit', r'/openingstijden', r'/hours',
    ],
}
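
# Note: detect_platform_type() walks these groups in dict insertion order,
# so DISCOVERY_PORTAL patterns take priority over the broader DIGITAL_ARCHIVE
# and INSTITUTIONAL_WEBSITE patterns.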

def detect_platform_type(url: str, links: list[str] | None = None) -> str:
    """Detect the platform type based on URL patterns and extracted links."""
    url_lower = url.lower()
    all_urls = [url_lower] + [link.lower() for link in (links or [])]
    for platform_type, patterns in PLATFORM_PATTERNS.items():
        for pattern in patterns:
            for check_url in all_urls:
                if re.search(pattern, check_url):
                    return platform_type
    return 'INSTITUTIONAL_WEBSITE'
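
# For example (illustrative URL, not real data):
#   detect_platform_type("https://example.nl/collectie/zoeken")  # -> 'DISCOVERY_PORTAL'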

def extract_collection_urls(links: list[str], base_url: str) -> list[str]:
    """Extract URLs that appear to be collection/catalog pages."""
    collection_patterns = [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/beeldbank', r'/inventaris',
        r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen',
    ]
    collection_urls = []
    base_domain = urlparse(base_url).netloc
    for link in links:
        try:
            parsed = urlparse(link)
            if base_domain in parsed.netloc or parsed.netloc in base_domain:
                for pattern in collection_patterns:
                    if re.search(pattern, link.lower()):
                        if link not in collection_urls:
                            collection_urls.append(link)
                        break
        except Exception:
            continue
    return collection_urls[:10]
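
# The negative lookahead in r'/archief(?!en\.)' keeps internal /archief pages
# while skipping links into the external archieven.nl aggregator, which is
# handled separately by extract_auxiliary_platforms() below.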

def extract_auxiliary_platforms(links: list[str], base_url: str) -> list[dict]:
    """Extract external platform links (aggregators, portals, etc.)."""
    external_patterns = {
        'archieven.nl': {'name': 'Archieven.nl', 'type': 'AGGREGATOR'},
        'europeana.eu': {'name': 'Europeana', 'type': 'AGGREGATOR'},
        'collectienederland.nl': {'name': 'Collectie Nederland', 'type': 'AGGREGATOR'},
        'erfgoedthesaurus.nl': {'name': 'Erfgoedthesaurus', 'type': 'THESAURUS'},
        'delpher.nl': {'name': 'Delpher', 'type': 'DIGITAL_ARCHIVE'},
        'geheugen.nl': {'name': 'Geheugen van Nederland', 'type': 'AGGREGATOR'},
    }
    base_domain = urlparse(base_url).netloc
    auxiliary = []
    seen_domains = set()
    for link in links:
        try:
            parsed = urlparse(link)
            domain = parsed.netloc.replace('www.', '')
            if base_domain in domain or domain in base_domain:
                continue
            for pattern, info in external_patterns.items():
                if pattern in domain and domain not in seen_domains:
                    seen_domains.add(domain)
                    auxiliary.append({
                        'platform_name': info['name'],
                        'platform_url': link,
                        'platform_type': info['type'],
                        'integration_type': 'external_aggregator',
                    })
                    break
        except Exception:
            continue
    return auxiliary[:5]
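
# Example: a page linking to https://www.europeana.eu/nl/collections would
# yield an entry with platform_name 'Europeana' and platform_type 'AGGREGATOR'.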

def is_generic_title(title: str) -> bool:
    """Check if a title is too generic to use as platform name."""
    generic_patterns = [
        'home', 'homepage', 'welkom', 'welcome', 'startpagina',
        'index', 'main', 'website', 'webpagina', 'homepagina',
    ]
    if not title:
        return True
    title_lower = title.lower().strip()
    for pattern in generic_patterns:
        if title_lower == pattern or title_lower.startswith(f"{pattern} -") or title_lower.startswith(f"{pattern} |"):
            return True
    return len(title) < 3
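
# Illustrative examples:
#   is_generic_title("Home - Museum X")      # -> True
#   is_generic_title("Regionaal Archief X")  # -> False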

def transform_to_platform_v2(crawl_result, source_url: str, org_name: str) -> dict[str, Any]:
    """Transform crawl4ai result to digital_platform_v2 format."""
    metadata = crawl_result.metadata or {}

    # Get internal links
    internal_links = []
    if crawl_result.links:
        internal_links = [link.get('href', '') for link in crawl_result.links.get('internal', []) if link.get('href')]

    # Extract title, checking for generic titles
    raw_title = metadata.get('title') or ''  # guard against an explicit None value
    candidate_titles = [
        metadata.get('og:title'),
        raw_title.split(' - ')[0].strip(),
        raw_title.split(' | ')[0].strip(),
        metadata.get('og:site_name'),
    ]
    title = org_name  # Default fallback
    for candidate in candidate_titles:
        if candidate and not is_generic_title(candidate):
            title = candidate
            break

    # Generate platform ID
    domain = urlparse(source_url).netloc.replace('www.', '').replace('.', '_')
    platform_id = f"primary_website_{domain}"

    # Detect platform type
    platform_type = detect_platform_type(source_url, internal_links)

    # Extract collection URLs
    collection_urls = extract_collection_urls(internal_links, source_url)

    # Extract auxiliary platforms
    auxiliary_platforms = extract_auxiliary_platforms(internal_links, source_url)

    # Build digital_platform_v2 structure
    platform_v2: dict[str, Any] = {
        'transformation_metadata': {
            'transformed_from': 'crawl4ai_recrawl',
            'transformation_date': datetime.now(timezone.utc).isoformat(),
            'transformation_version': '2.0',
            'source_status_code': crawl_result.status_code,
        },
        'primary_platform': {
            'platform_id': platform_id,
            'platform_name': f"{title} Website" if 'website' not in title.lower() else title,
            'platform_url': source_url,
            'platform_type': platform_type,
            'description': metadata.get('description') or metadata.get('og:description', ''),
            'language': metadata.get('language', 'nl'),
            'og_image': metadata.get('og:image'),
            'favicon': metadata.get('favicon'),
        },
    }
    if collection_urls:
        platform_v2['primary_platform']['collection_urls'] = collection_urls
    if auxiliary_platforms:
        platform_v2['auxiliary_platforms'] = auxiliary_platforms
    if internal_links:
        platform_v2['navigation_links'] = internal_links[:20]
    return platform_v2
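
# Illustrative shape of the returned dict (values are made up):
#   {'transformation_metadata': {'transformed_from': 'crawl4ai_recrawl', ...},
#    'primary_platform': {'platform_id': 'primary_website_example_nl',
#                         'platform_type': 'DISCOVERY_PORTAL',
#                         'collection_urls': [...],  # only when found
#                         ...},
#    'auxiliary_platforms': [...],   # only when found
#    'navigation_links': [...]}      # only when found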

def update_custodian_file(filepath: Path, platform_v2: dict) -> bool:
    """Update a custodian YAML file with digital_platform_v2 data."""
    try:
        with open(filepath, 'r') as f:
            data = yaml.safe_load(f)
        if data is None:
            data = {}
        data['digital_platform_v2'] = platform_v2
        if 'crawl4ai_enrichment' in data:
            data['crawl4ai_enrichment']['recrawled_with'] = 'crawl4ai_v2'
            data['crawl4ai_enrichment']['recrawl_date'] = datetime.now(timezone.utc).isoformat()
        with open(filepath, 'w') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        return True
    except Exception as e:
        print(f" Error updating {filepath}: {e}")
        return False
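
# Note: yaml.safe_load()/yaml.dump() round-trips the whole file, so any
# comments or hand formatting in the original YAML are lost on rewrite.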

def load_failed_urls() -> list[tuple[str, str]]:
    """Load the list of failed URLs with their file paths."""
    urls = []
    with open(FAILED_URLS_FILE, 'r') as f:
        for line in f:
            line = line.strip()
            if '\t' in line:
                filename, url = line.split('\t', 1)
                urls.append((filename, url))
    return urls

def get_org_name(filepath: Path) -> str:
    """Extract organization name from custodian file."""
    try:
        with open(filepath, 'r') as f:
            data = yaml.safe_load(f)
        if data:
            if 'original_entry' in data and data['original_entry'].get('organisatie'):
                return data['original_entry']['organisatie']
            if 'custodian_name' in data:
                return data['custodian_name'].get('emic_name', '') or data['custodian_name'].get('preferred_name', '')
            if 'name' in data:
                return data['name']
        stem = filepath.stem
        parts = stem.split('-')
        return parts[-1] if parts else stem
    except Exception:
        return filepath.stem

async def scrape_single_url(crawler: AsyncWebCrawler, url: str) -> Any:
    """Scrape a single URL using crawl4ai."""
    try:
        result = await crawler.arun(url, verbose=False)
        if result.success:
            return result
        print(f" Crawl failed: {result.error_message}")
        return None
    except Exception as e:
        print(f" Exception: {e}")
        return None

async def main_async(args):
    """Async main function."""
    all_urls = load_failed_urls()
    print(f"Loaded {len(all_urls)} failed URLs")

    if args.limit > 0:
        urls_to_process = all_urls[args.start:args.start + args.limit]
    else:
        urls_to_process = all_urls[args.start:]
    print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit or 'all'})")

    if args.dry_run:
        print("\n[DRY RUN MODE - No changes will be made]")
        for filename, url in urls_to_process[:10]:
            print(f" Would scrape: {filename} -> {url}")
        if len(urls_to_process) > 10:
            print(f" ... and {len(urls_to_process) - 10} more")
        return
    success_count = 0
    fail_count = 0
    skip_count = 0
    async with AsyncWebCrawler(verbose=False) as crawler:
        for i, (filename, url) in enumerate(urls_to_process):
            filepath = CUSTODIAN_DIR / filename
            print(f"\n[{i+1}/{len(urls_to_process)}] {filename}")
            print(f" URL: {url}")
            if not filepath.exists():
                print(" SKIP: File not found")
                skip_count += 1
                continue

            # Check if already has digital_platform_v2
            with open(filepath, 'r') as f:
                content = f.read()
            if 'digital_platform_v2:' in content:
                print(" SKIP: Already has digital_platform_v2")
                skip_count += 1
                continue

            org_name = get_org_name(filepath)
            result = await scrape_single_url(crawler, url)
            if result:
                platform_v2 = transform_to_platform_v2(result, url, org_name)
                if update_custodian_file(filepath, platform_v2):
                    success_count += 1
                    print(f" SUCCESS: {platform_v2['primary_platform']['platform_name']}")
                else:
                    fail_count += 1
            else:
                fail_count += 1
                print(" FAILED: Could not scrape URL")

            # Small delay to be polite
            await asyncio.sleep(args.delay)

            if (i + 1) % 50 == 0:
                print(f"\n=== Progress: {i+1}/{len(urls_to_process)} (success={success_count}, skip={skip_count}, fail={fail_count}) ===\n")

    print("\n=== Final Results ===")
    print(f"Success: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Failed: {fail_count}")
    print(f"Total: {len(urls_to_process)}")

def main():
    parser = argparse.ArgumentParser(description='Batch re-crawl failed URLs with crawl4ai')
    parser.add_argument('--start', type=int, default=0, help='Starting index')
    parser.add_argument('--limit', type=int, default=0, help='Maximum URLs to process (0=all)')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--delay', type=float, default=0.5, help='Delay between requests in seconds')
    args = parser.parse_args()
    asyncio.run(main_async(args))


if __name__ == '__main__':
    main()
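
# Example invocations (the data paths above are machine-specific):
#   python scripts/batch_crawl4ai_recrawl.py --dry-run
#   python scripts/batch_crawl4ai_recrawl.py --start 100 --limit 50 --delay 1.0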