glam/scripts/crawl_kien_playwright.py
2025-12-05 15:30:23 +01:00

321 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Crawl immaterieelerfgoed.nl (KIEN) using Playwright to extract custodian data.
This script uses browser automation to handle JavaScript-rendered content.
Author: GLAM Project
Date: 2025-12-03
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright, Page
# Configuration
BASE_URL = "https://www.immaterieelerfgoed.nl"
INVENTORY_URL = f"{BASE_URL}/immaterieelerfgoed"
# NOTE(review): machine-specific absolute path — consider making this configurable.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
DELAY_MS = 1500 # Delay between page loads (milliseconds)
# Ensure output directory exists (side effect at import time)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
async def get_all_heritage_urls(page: Page) -> list[dict]:
    """Collect all heritage-form URLs by paginating through the inventory.

    Navigates to the inventory listing and repeatedly extracts item links
    rendered by JavaScript, clicking the "next page" link until none is
    found (or a safety limit of 50 pages is reached).

    Args:
        page: An open Playwright page used for navigation.

    Returns:
        Dicts with keys 'title', 'url' (absolute) and 'slug', deduplicated
        by absolute URL.
    """
    heritage_items: list[dict] = []
    # Fix: track seen URLs in a set for O(1) dedup; the original scanned
    # the whole result list for every candidate (O(n^2) overall).
    seen_urls: set[str] = set()

    # Navigate to inventory
    print("Navigating to inventory...")
    await page.goto(INVENTORY_URL)
    await page.wait_for_selector('a[href*="/nl/"]', timeout=10000)
    await asyncio.sleep(2)  # Let JS render

    page_num = 1
    while True:
        print(f"\nProcessing page {page_num}...")
        # Extract heritage item links in the browser context; site chrome is
        # filtered out via the skip-pattern list below.
        # Fix: 'Activiteitenplannen' was capitalized but compared against
        # href.toLowerCase(), so it could never match — now lowercase.
        items = await page.evaluate('''() => {
            const items = [];
            const links = document.querySelectorAll('ul li a[href^="/nl/"]');
            links.forEach(link => {
                const href = link.getAttribute('href');
                const heading = link.querySelector('h2, h3, h4');
                // Skip navigation links
                const skipPatterns = [
                    'search', 'contact', 'over-ons', 'nieuws', 'kennisbank',
                    'watisimmaterieelerfgoed', 'veelgestelde-vragen', 'spotten',
                    'immaterieel-erfgoed-in-jouw-provincie', 'disclaimer', 'colofon',
                    'copyright', 'cookiesverklaring', 'privacy', 'nieuwsbrief',
                    'pers', 'partners', 'publicaties', 'activiteitenplannen',
                    'missie-visie', 'oudepublicaties', 'gemeenteenprovincie',
                    'linksnaarpartners', 'immaterieel-erfgoed-films', 'inventaris-immaterieel-erfgoed'
                ];
                const shouldSkip = skipPatterns.some(pattern => href.toLowerCase().includes(pattern));
                if (heading && !shouldSkip) {
                    items.push({
                        title: heading.textContent.trim(),
                        url: href,
                        slug: href.replace('/nl/', '').replace('/page/', '')
                    });
                }
            });
            return items;
        }''')

        new_items = 0
        for item in items:
            full_url = BASE_URL + item['url']
            if full_url not in seen_urls:
                seen_urls.add(full_url)
                item['url'] = full_url
                heritage_items.append(item)
                new_items += 1
        print(f" Found {new_items} new items (total: {len(heritage_items)})")

        # Check for next page
        next_button = await page.query_selector(f'a[href*="page={page_num + 1}"]')
        if next_button:
            await next_button.click()
            await asyncio.sleep(DELAY_MS / 1000)  # politeness delay after navigation
            page_num += 1
        else:
            print(" No more pages")
            break

        # Safety limit to avoid an infinite pagination loop
        if page_num > 50:
            print(" Reached page limit")
            break

    return heritage_items
async def extract_custodians_from_page(page: Page, url: str, title: str) -> dict:
    """Extract custodian information from a heritage form page.

    Loads *url* in the given Playwright page, runs an in-browser script to
    pull out the heritage domain, date added, UNESCO mentions, external
    links, and internal custodian-profile links, then scans the page text
    for known Dutch place names. Exceptions are recorded on the result
    instead of propagating, so one bad page does not abort the crawl.

    Args:
        page: An open Playwright page (reused across calls by the caller).
        url: Absolute URL of the heritage form page.
        title: Human-readable title of the heritage form.

    Returns:
        A dict with the keys initialized below; 'extracted_at' is an aware
        UTC ISO-8601 timestamp. On failure an 'error' key is added.
    """
    # Result skeleton; fields stay None/empty when not found on the page.
    result = {
        'heritage_form': title,
        'url': url,
        'domain': None,
        'date_added': None,
        'unesco_status': None,
        'description': None,
        'custodians': [],
        'related_links': [],
        'locations': [],
        'extracted_at': datetime.now(timezone.utc).isoformat()
    }
    try:
        # 'networkidle' waits for JS-driven requests to settle before scraping.
        await page.goto(url, wait_until='networkidle')
        await asyncio.sleep(1)
        # Extract page data using JavaScript evaluated in the browser context.
        data = await page.evaluate('''() => {
            const result = {
                domain: null,
                dateAdded: null,
                unesco: false,
                description: null,
                custodians: [],
                relatedLinks: [],
                pageText: document.body.innerText
            };
            // Get meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                result.description = metaDesc.getAttribute('content');
            }
            // Look for domain info
            const text = document.body.innerText;
            const domainMatch = text.match(/Domein[:\\s]+([^\\n]+)/i);
            if (domainMatch) {
                result.domain = domainMatch[1].trim();
            }
            // Look for date added
            const dateMatch = text.match(/Bijgeschreven.*?(\\d{4}|\\d{1,2}[-\\/]\\d{1,2}[-\\/]\\d{4})/i);
            if (dateMatch) {
                result.dateAdded = dateMatch[1];
            }
            // Check for UNESCO
            if (text.toLowerCase().includes('unesco') ||
                text.toLowerCase().includes('representatieve lijst')) {
                result.unesco = true;
            }
            // Find all external links (potential custodians)
            const links = document.querySelectorAll('a[href^="http"]');
            const seenUrls = new Set();
            links.forEach(link => {
                const href = link.getAttribute('href');
                const text = link.textContent.trim();
                // Skip social media and KIEN itself
                const skipDomains = [
                    'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
                    'youtube.com', 'immaterieelerfgoed.nl', 'google.com', 'maps.google'
                ];
                const shouldSkip = skipDomains.some(d => href.includes(d));
                if (!shouldSkip && text.length > 2 && !seenUrls.has(href)) {
                    seenUrls.add(href);
                    result.relatedLinks.push({
                        text: text,
                        url: href
                    });
                }
            });
            // Find internal links that might be custodian profiles
            const internalLinks = document.querySelectorAll('a[href^="/nl/page/"]');
            internalLinks.forEach(link => {
                const href = link.getAttribute('href');
                const text = link.textContent.trim();
                // Skip navigation
                if (text.length > 3 && !href.includes('kennisbank') && !href.includes('contact')) {
                    result.custodians.push({
                        name: text,
                        kienUrl: 'https://www.immaterieelerfgoed.nl' + href
                    });
                }
            });
            return result;
        }''')
        # Map the camelCase browser payload onto the snake_case result dict.
        result['domain'] = data.get('domain')
        result['date_added'] = data.get('dateAdded')
        result['unesco_status'] = 'mentioned' if data.get('unesco') else None
        result['description'] = data.get('description')
        result['custodians'] = data.get('custodians', [])
        result['related_links'] = data.get('relatedLinks', [])
        # Extract locations from page text.
        # NOTE(review): plain substring matching against a fixed city list may
        # over-match (a city name inside unrelated text) — confirm this
        # precision is acceptable for downstream use.
        dutch_cities = ['Amsterdam', 'Rotterdam', 'Den Haag', 'Utrecht', 'Eindhoven',
                        'Groningen', 'Tilburg', 'Almere', 'Breda', 'Nijmegen',
                        'Arnhem', 'Haarlem', 'Enschede', 'Apeldoorn', 'Amersfoort',
                        'Zaanstad', 'Maastricht', 'Leiden', 'Dordrecht', 'Zoetermeer',
                        'Zwolle', 'Deventer', 'Alkmaar', 'Delft', 'Venlo', 'Leeuwarden',
                        'Genemuiden', 'Schiermonnikoog', 'Texel', 'Vlieland', 'Terschelling']
        page_text = data.get('pageText', '')
        for city in dutch_cities:
            if city in page_text and city not in result['locations']:
                result['locations'].append(city)
    except Exception as e:
        # Record the failure on the result instead of raising, so the
        # caller's crawl loop keeps going.
        result['error'] = str(e)
    return result
async def main():
    """Run the full crawl: collect heritage URLs, extract custodians, save JSON.

    Side effects: launches headless Chromium, writes heritage_urls.json,
    periodic progress files, kien_custodians.json and crawl_summary.json
    under OUTPUT_DIR, and prints progress to stdout.
    """
    print("=" * 60)
    print("KIEN Heritage Custodian Crawler (Playwright)")
    print("=" * 60)
    # Fix: use an aware UTC timestamp, consistent with the 'extracted_at'
    # and 'crawl_date' fields elsewhere in this file (naive local time before).
    print(f"Started at: {datetime.now(timezone.utc).isoformat()}")
    print()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="GLAM-Heritage-Research/1.0 (Academic Research)"
        )
        page = await context.new_page()

        # Step 1: Get all heritage form URLs
        print("Step 1: Fetching heritage form URLs from inventory...")
        heritage_items = await get_all_heritage_urls(page)
        print(f"\nFound {len(heritage_items)} heritage forms")

        # Save the URL list so a partial crawl can still be inspected.
        urls_file = OUTPUT_DIR / "heritage_urls.json"
        with open(urls_file, 'w', encoding='utf-8') as f:
            json.dump(heritage_items, f, ensure_ascii=False, indent=2)
        print(f"Saved URL list to: {urls_file}")

        # Step 2: Extract custodian data from each page
        print("\nStep 2: Extracting custodian data from each heritage form...")
        all_results = []
        try:
            for i, item in enumerate(heritage_items, 1):
                print(f"[{i}/{len(heritage_items)}] {item['title']}")
                result = await extract_custodians_from_page(page, item['url'], item['title'])
                all_results.append(result)
                # Rate limiting
                await asyncio.sleep(DELAY_MS / 1000)
                # Progress save every 50 items
                if i % 50 == 0:
                    progress_file = OUTPUT_DIR / f"custodians_progress_{i}.json"
                    with open(progress_file, 'w', encoding='utf-8') as f:
                        json.dump(all_results, f, ensure_ascii=False, indent=2)
                    print(f" [PROGRESS] Saved {i} items")
        finally:
            # Fix: always release the browser, even if extraction raises
            # (the original skipped close() on any exception in the loop).
            await browser.close()

    # Step 3: Save final results
    print("\nStep 3: Saving results...")
    output_file = OUTPUT_DIR / "kien_custodians.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
    print(f"Saved full results to: {output_file}")

    # Summary statistics
    total_custodians = sum(len(r['custodians']) for r in all_results)
    total_links = sum(len(r['related_links']) for r in all_results)
    with_custodians = sum(1 for r in all_results if r['custodians'])
    with_links = sum(1 for r in all_results if r['related_links'])
    with_unesco = sum(1 for r in all_results if r['unesco_status'])
    summary = {
        'crawl_date': datetime.now(timezone.utc).isoformat(),
        'total_heritage_forms': len(all_results),
        'forms_with_custodian_profiles': with_custodians,
        'forms_with_external_links': with_links,
        'total_custodian_profiles_found': total_custodians,
        'total_external_links_found': total_links,
        'forms_with_unesco_mention': with_unesco,
        # Fix: sort the unique lists so the summary JSON is deterministic
        # (set iteration order is arbitrary between runs).
        'unique_domains': sorted({r['domain'] for r in all_results if r['domain']}),
        'unique_locations': sorted({loc for r in all_results for loc in r.get('locations', [])})
    }
    summary_file = OUTPUT_DIR / "crawl_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Total heritage forms: {summary['total_heritage_forms']}")
    print(f"Forms with custodian profiles: {summary['forms_with_custodian_profiles']}")
    print(f"Forms with external links: {summary['forms_with_external_links']}")
    print(f"Total custodian profiles: {summary['total_custodian_profiles_found']}")
    print(f"Total external links: {summary['total_external_links_found']}")
    print(f"Forms with UNESCO mention: {summary['forms_with_unesco_mention']}")
    print(f"\nResults saved to: {OUTPUT_DIR}")
if __name__ == "__main__":
    # Script entry point: run the async crawl to completion.
    asyncio.run(main())