glam/scripts/crawl_kien_custodians.py
2025-12-05 15:30:23 +01:00

336 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Crawl immaterieelerfgoed.nl (KIEN) to extract custodian data from heritage forms.
This script:
1. Fetches all heritage form URLs from the inventory pagination
2. Visits each heritage form page to extract custodian information
3. Outputs structured YAML data for integration with GLAM schema
Author: GLAM Project
Date: 2025-12-03
"""
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Configuration
BASE_URL = "https://www.immaterieelerfgoed.nl"
INVENTORY_URL = f"{BASE_URL}/nl/search"
# NOTE(review): absolute, user-specific path — assumes this exact machine; confirm before reuse
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
CACHE_DIR = OUTPUT_DIR / "cache"
DELAY_SECONDS = 1.5  # Be polite to the server (sleep before every live fetch)

# Ensure output directories exist (import-time side effect)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Shared HTTP session with identifying headers, reused by all fetches
session = requests.Session()
session.headers.update({
    "User-Agent": "GLAM-Heritage-Research/1.0 (Academic Research; contact: heritage@example.org)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "nl,en;q=0.5",
})
def fetch_page(url: str, use_cache: bool = True) -> str:
    """Return the HTML for *url*, serving from the on-disk cache when available.

    Successful responses are written to the cache; on a network/HTTP error an
    empty string is returned and nothing is cached.
    """
    # Derive a filesystem-safe cache filename from the URL path.
    slug = re.sub(r'[^\w\-]', '_', url.replace(BASE_URL, ''))
    cached = CACHE_DIR / f"{slug}.html"

    if use_cache and cached.exists():
        print(f" [CACHE] {url}")
        return cached.read_text(encoding='utf-8')

    print(f" [FETCH] {url}")
    time.sleep(DELAY_SECONDS)  # throttle every live request
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f" [ERROR] Failed to fetch {url}: {exc}")
        return ""
    cached.write_text(response.text, encoding='utf-8')
    return response.text
def get_all_heritage_urls() -> list[dict]:
    """Collect heritage-form links from the paginated inventory search.

    Returns a list of dicts with keys ``title``, ``url`` and ``slug``,
    de-duplicated by URL. Pagination stops when a page yields no HTML or no
    new items, when no "next page" link exists, or after 50 pages (safety
    limit).
    """
    heritage_items: list[dict] = []
    # Fix: the original checked duplicates by scanning the whole result list
    # for every candidate link (O(n^2) across the crawl); a set makes it O(1).
    seen_urls: set[str] = set()
    # URL fragments that identify site navigation/content pages, not heritage forms
    skip_fragments = [
        'search', 'contact', 'over-ons', 'nieuws', 'kennisbank',
        'watisimmaterieelerfgoed', 'veelgestelde-vragen', 'spotten',
        'immaterieel-erfgoed-in-jouw-provincie', 'disclaimer', 'colofon',
        'copyright', 'cookiesverklaring', 'privacy', 'nieuwsbrief',
        'pers', 'partners', 'publicaties', 'Activiteitenplannen',
        'missie-visie', 'oudepublicaties', 'gemeenteenprovincie',
        'linksnaarpartners', 'immaterieel-erfgoed-films'
    ]
    page = 1
    while True:
        html = fetch_page(f"{INVENTORY_URL}?page={page}")
        if not html:
            break
        soup = BeautifulSoup(html, 'html.parser')
        items_found = 0  # counts NEW (non-duplicate) items on this page
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            # Heritage form URLs look like /nl/somename or /nl/page/XXXX/name
            if not href.startswith('/nl/') or any(skip in href for skip in skip_fragments):
                continue
            # Inventory items render their title inside a heading element
            heading = link.find(['h2', 'h3', 'h4'])
            if not heading:
                continue
            full_url = urljoin(BASE_URL, href)
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            heritage_items.append({
                'title': heading.get_text(strip=True),
                'url': full_url,
                'slug': href.replace('/nl/', '').replace('/page/', '')
            })
            items_found += 1
        print(f"Page {page}: Found {items_found} heritage forms (total: {len(heritage_items)})")
        # Follow pagination only while a link to the next page is present
        next_link = soup.find('a', href=re.compile(rf'page={page + 1}'))
        if not next_link or items_found == 0:
            break
        page += 1
        if page > 50:  # safety limit against runaway pagination
            print(" [WARN] Reached page limit, stopping")
            break
    return heritage_items
def extract_custodians_from_page(url: str, title: str) -> dict:
    """Extract custodian information from a heritage form page.

    Scrapes several heuristic signals from the page at *url*: custodian
    organisations (internal KIEN pages and external websites), other related
    external links, mentions of major Dutch cities, the inventory date,
    UNESCO wording, and a short description.

    Returns a result dict; when the page cannot be fetched the dict contains
    an ``error`` key and an empty ``custodians`` list.
    """
    html = fetch_page(url)
    if not html:
        return {
            'heritage_form': title,
            'url': url,
            'custodians': [],
            'error': 'Failed to fetch page'
        }
    soup = BeautifulSoup(html, 'html.parser')
    result = {
        'heritage_form': title,
        'url': url,
        'domain': None,
        'date_added': None,
        'unesco_status': None,
        'description': None,
        'custodians': [],
        'related_links': [],
        'locations': [],
        'extracted_at': datetime.now(timezone.utc).isoformat()
    }

    # Domain/category: take the text node immediately after the first mention
    # of "domein".  (Fix: the original looped over every string on the page
    # and redid this identical lookup each time, always overwriting the
    # result with the same value — one lookup is equivalent and cheaper.)
    domain_label = soup.find(string=re.compile('domein', re.I))
    if domain_label:
        domain_value = domain_label.find_next(string=True)
        if domain_value:
            result['domain'] = domain_value.strip()

    # Method 1: custodian links inside "Gemeenschap"/"Erfgoedbeoefenaars"
    # style sections.  Several section names can resolve to the same parent
    # container, so de-duplicate entries (fix: the original could append the
    # same custodian once per matching section name).
    seen_custodians: set[tuple[str, str]] = set()
    for section_name in ['Gemeenschap', 'Erfgoedbeoefenaars', 'Beoefenaars', 'Organisaties', 'Contact']:
        section = soup.find(string=re.compile(section_name, re.I))
        if not section:
            continue
        parent = section.find_parent(['div', 'section', 'article'])
        if not parent:
            continue
        for link in parent.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True)
            # Internal KIEN organisation pages (skip short navigation links)
            if href.startswith('/nl/') and len(text) > 3:
                if 'page/' in href:
                    kien_url = urljoin(BASE_URL, href)
                    if (text, kien_url) not in seen_custodians:
                        seen_custodians.add((text, kien_url))
                        result['custodians'].append({
                            'name': text,
                            'kien_url': kien_url,
                            'type': 'organization'
                        })
            elif href.startswith('http') and BASE_URL not in href:
                # External website of a custodian
                if (text, href) not in seen_custodians:
                    seen_custodians.add((text, href))
                    result['custodians'].append({
                        'name': text,
                        'website': href,
                        'type': 'external'
                    })

    # Method 2: any other external links that might be custodian websites.
    # The custodian list is final at this point, so build the lookup set once
    # (fix: the original rescanned the custodian list for every link on the
    # page) and de-duplicate collected links.
    custodian_websites = {c['website'] for c in result['custodians'] if 'website' in c}
    seen_related: set[tuple[str, str]] = set()
    for link in soup.find_all('a', href=True):
        href = link.get('href', '')
        text = link.get_text(strip=True)
        # External links only (not to KIEN, not social media, not stub text)
        if (href.startswith('http') and
                BASE_URL not in href and
                not any(social in href.lower() for social in ['facebook', 'twitter', 'instagram', 'linkedin', 'youtube']) and
                len(text) > 3):
            if href not in custodian_websites and (text, href) not in seen_related:
                seen_related.add((text, href))
                result['related_links'].append({
                    'text': text,
                    'url': href
                })

    # Method 3: naive location detection — substring match against a fixed
    # list of larger Dutch cities.
    dutch_cities = ['Amsterdam', 'Rotterdam', 'Den Haag', 'Utrecht', 'Eindhoven',
                    'Groningen', 'Tilburg', 'Almere', 'Breda', 'Nijmegen',
                    'Arnhem', 'Haarlem', 'Enschede', 'Apeldoorn', 'Amersfoort',
                    'Zaanstad', 'Maastricht', 'Leiden', 'Dordrecht', 'Zoetermeer']
    page_text = soup.get_text()
    for city in dutch_cities:
        if city in page_text and city not in result['locations']:
            result['locations'].append(city)

    # Method 4: date the form was added to the inventory.  When several text
    # nodes mention the inventory, the last one's first date wins (behaviour
    # kept from the original).
    date_pattern = re.compile(r'(\d{1,2}[-/]\d{1,2}[-/]\d{4}|\d{4})')
    for text in soup.stripped_strings:
        if 'inventaris' in text.lower() or 'bijgeschreven' in text.lower():
            dates = date_pattern.findall(text)
            if dates:
                result['date_added'] = dates[0]

    # Method 5: flag any UNESCO-related wording anywhere on the page.
    unesco_keywords = ['unesco', 'representatieve lijst', 'werelderfgoed', 'immaterieel erfgoed van de mensheid']
    page_text_lower = page_text.lower()
    for keyword in unesco_keywords:
        if keyword in page_text_lower:
            result['unesco_status'] = 'mentioned'
            break

    # Description: prefer the meta description, otherwise the first
    # substantial paragraph, truncated to 500 characters.
    meta_desc = soup.find('meta', {'name': 'description'})
    if meta_desc and meta_desc.get('content'):
        result['description'] = meta_desc['content']
    else:
        for p in soup.find_all('p'):
            text = p.get_text(strip=True)
            if len(text) > 100:
                result['description'] = text[:500] + '...' if len(text) > 500 else text
                break
    return result
def main():
    """Run the full crawl: collect URLs, scrape each form page, save results."""
    print("=" * 60)
    print("KIEN Heritage Custodian Crawler")
    print("=" * 60)
    # Aware UTC timestamp, consistent with extracted_at/crawl_date elsewhere
    # in this script (the original mixed in naive local time here).
    print(f"Started at: {datetime.now(timezone.utc).isoformat()}")
    print()

    # Step 1: collect all heritage form URLs from the paginated inventory.
    print("Step 1: Fetching heritage form URLs from inventory...")
    heritage_items = get_all_heritage_urls()
    print(f"\nFound {len(heritage_items)} heritage forms")
    urls_file = OUTPUT_DIR / "heritage_urls.json"
    with open(urls_file, 'w', encoding='utf-8') as f:
        json.dump(heritage_items, f, ensure_ascii=False, indent=2)
    print(f"Saved URL list to: {urls_file}")

    # Step 2: extract custodian data from every heritage form page.
    print("\nStep 2: Extracting custodian data from each heritage form...")
    all_results = []
    for i, item in enumerate(heritage_items, 1):
        print(f"\n[{i}/{len(heritage_items)}] {item['title']}")
        all_results.append(extract_custodians_from_page(item['url'], item['title']))
        # Checkpoint every 50 items so a crash does not lose the whole run.
        if i % 50 == 0:
            progress_file = OUTPUT_DIR / f"custodians_progress_{i}.json"
            with open(progress_file, 'w', encoding='utf-8') as f:
                json.dump(all_results, f, ensure_ascii=False, indent=2)
            print(f" [PROGRESS] Saved {i} items")

    # Step 3: save the full result set plus summary statistics.
    print("\nStep 3: Saving results...")
    output_file = OUTPUT_DIR / "kien_custodians.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
    print(f"Saved full results to: {output_file}")

    # Fix: fetch-failure results only carry {heritage_form, url, custodians,
    # error}, so indexing r['unesco_status'] / r['domain'] raised KeyError as
    # soon as one page failed — use .get() like the 'locations' line already did.
    total_custodians = sum(len(r['custodians']) for r in all_results)
    with_custodians = sum(1 for r in all_results if r['custodians'])
    with_unesco = sum(1 for r in all_results if r.get('unesco_status'))
    summary = {
        'crawl_date': datetime.now(timezone.utc).isoformat(),
        'total_heritage_forms': len(all_results),
        'forms_with_custodians': with_custodians,
        'total_custodians_found': total_custodians,
        'forms_with_unesco_mention': with_unesco,
        # sorted() makes the summary deterministic across runs (set order isn't)
        'unique_domains': sorted({r.get('domain') for r in all_results if r.get('domain')}),
        'unique_locations': sorted({loc for r in all_results for loc in r.get('locations', [])})
    }
    summary_file = OUTPUT_DIR / "crawl_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Total heritage forms: {summary['total_heritage_forms']}")
    print(f"Forms with custodians: {summary['forms_with_custodians']}")
    print(f"Total custodians found: {summary['total_custodians_found']}")
    print(f"Forms with UNESCO mention: {summary['forms_with_unesco_mention']}")
    print(f"\nResults saved to: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()