#!/usr/bin/env python3
"""
Crawl immaterieelerfgoed.nl (KIEN) using Playwright to extract custodian data.

This script uses browser automation to handle JavaScript-rendered content.

Author: GLAM Project
Date: 2025-12-03
"""

import asyncio
import json
import os
from datetime import datetime, timezone
from pathlib import Path

from playwright.async_api import async_playwright, Page

# Configuration
BASE_URL = "https://www.immaterieelerfgoed.nl"
INVENTORY_URL = f"{BASE_URL}/immaterieelerfgoed"
# Output directory; the historical hard-coded path remains the default but can
# be overridden via the GLAM_OUTPUT_DIR environment variable.
OUTPUT_DIR = Path(os.environ.get(
    "GLAM_OUTPUT_DIR",
    "/Users/kempersc/apps/glam/data/intangible_heritage",
))
DELAY_MS = 1500  # Delay between page loads (milliseconds)

# Ensure output directory exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


async def get_all_heritage_urls(page: Page) -> list[dict]:
    """Get all heritage form URLs by paginating through the inventory.

    Args:
        page: An open Playwright page used for navigation.

    Returns:
        List of dicts with keys ``title``, ``url`` (absolute) and ``slug``,
        deduplicated by URL across all inventory pages.
    """
    heritage_items: list[dict] = []

    # Navigate to inventory
    print("Navigating to inventory...")
    await page.goto(INVENTORY_URL)
    await page.wait_for_selector('a[href*="/nl/"]', timeout=10000)
    await asyncio.sleep(2)  # Let JS render

    page_num = 1
    while True:
        print(f"\nProcessing page {page_num}...")

        # Extract heritage item links from the rendered DOM.
        items = await page.evaluate('''() => {
            const items = [];
            const links = document.querySelectorAll('ul li a[href^="/nl/"]');
            links.forEach(link => {
                const href = link.getAttribute('href');
                const heading = link.querySelector('h2, h3, h4');
                // Skip navigation links.  NOTE: every pattern must be
                // lowercase because it is matched against
                // href.toLowerCase() below ('Activiteitenplannen' with a
                // capital A could never match).
                const skipPatterns = [
                    'search', 'contact', 'over-ons', 'nieuws', 'kennisbank',
                    'watisimmaterieelerfgoed', 'veelgestelde-vragen',
                    'spotten', 'immaterieel-erfgoed-in-jouw-provincie',
                    'disclaimer', 'colofon', 'copyright', 'cookiesverklaring',
                    'privacy', 'nieuwsbrief', 'pers', 'partners',
                    'publicaties', 'activiteitenplannen', 'missie-visie',
                    'oudepublicaties', 'gemeenteenprovincie',
                    'linksnaarpartners', 'immaterieel-erfgoed-films',
                    'inventaris-immaterieel-erfgoed'
                ];
                const shouldSkip = skipPatterns.some(pattern =>
                    href.toLowerCase().includes(pattern));
                if (heading && !shouldSkip) {
                    items.push({
                        title: heading.textContent.trim(),
                        url: href,
                        slug: href.replace('/nl/', '').replace('/page/', '')
                    });
                }
            });
            return items;
        }''')

        new_items = 0
        for item in items:
            full_url = BASE_URL + item['url']
            # Deduplicate across pagination pages by absolute URL.
            if not any(h['url'] == full_url for h in heritage_items):
                item['url'] = full_url
                heritage_items.append(item)
                new_items += 1

        print(f" Found {new_items} new items (total: {len(heritage_items)})")

        # Check for next page
        next_button = await page.query_selector(f'a[href*="page={page_num + 1}"]')
        if next_button:
            await next_button.click()
            await asyncio.sleep(DELAY_MS / 1000)
            page_num += 1
        else:
            print(" No more pages")
            break

        # Safety limit so a broken pagination selector cannot loop forever.
        if page_num > 50:
            print(" Reached page limit")
            break

    return heritage_items


async def extract_custodians_from_page(page: Page, url: str, title: str) -> dict:
    """Extract custodian information from a heritage form page.

    Args:
        page: An open Playwright page.
        url: Absolute URL of the heritage form page.
        title: Human-readable title of the heritage form.

    Returns:
        Dict with heritage metadata, custodian profiles, external links and
        detected locations.  On failure the dict carries an ``error`` key
        instead of raising, so one bad page cannot abort the whole crawl.
    """
    result = {
        'heritage_form': title,
        'url': url,
        'domain': None,
        'date_added': None,
        'unesco_status': None,
        'description': None,
        'custodians': [],
        'related_links': [],
        'locations': [],
        'extracted_at': datetime.now(timezone.utc).isoformat()
    }

    try:
        await page.goto(url, wait_until='networkidle')
        await asyncio.sleep(1)

        # Extract page data using JavaScript
        data = await page.evaluate('''() => {
            const result = {
                domain: null,
                dateAdded: null,
                unesco: false,
                description: null,
                custodians: [],
                relatedLinks: [],
                pageText: document.body.innerText
            };

            // Get meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                result.description = metaDesc.getAttribute('content');
            }

            // Look for domain info
            const text = document.body.innerText;
            const domainMatch = text.match(/Domein[:\\s]+([^\\n]+)/i);
            if (domainMatch) {
                result.domain = domainMatch[1].trim();
            }

            // Look for date added
            const dateMatch = text.match(/Bijgeschreven.*?(\\d{4}|\\d{1,2}[-\\/]\\d{1,2}[-\\/]\\d{4})/i);
            if (dateMatch) {
                result.dateAdded = dateMatch[1];
            }

            // Check for UNESCO
            if (text.toLowerCase().includes('unesco') ||
                text.toLowerCase().includes('representatieve lijst')) {
                result.unesco = true;
            }

            // Find all external links (potential custodians)
            const links = document.querySelectorAll('a[href^="http"]');
            const seenUrls = new Set();
            links.forEach(link => {
                const href = link.getAttribute('href');
                const text = link.textContent.trim();
                // Skip social media and KIEN itself
                const skipDomains = [
                    'facebook.com', 'twitter.com', 'instagram.com',
                    'linkedin.com', 'youtube.com', 'immaterieelerfgoed.nl',
                    'google.com', 'maps.google'
                ];
                const shouldSkip = skipDomains.some(d => href.includes(d));
                if (!shouldSkip && text.length > 2 && !seenUrls.has(href)) {
                    seenUrls.add(href);
                    result.relatedLinks.push({ text: text, url: href });
                }
            });

            // Find internal links that might be custodian profiles.
            // Deduplicate by href (mirrors the external-link handling above)
            // so a profile linked twice on the page yields one record.
            const seenCustodians = new Set();
            const internalLinks = document.querySelectorAll('a[href^="/nl/page/"]');
            internalLinks.forEach(link => {
                const href = link.getAttribute('href');
                const text = link.textContent.trim();
                // Skip navigation
                if (text.length > 3 && !href.includes('kennisbank') &&
                    !href.includes('contact') && !seenCustodians.has(href)) {
                    seenCustodians.add(href);
                    result.custodians.push({
                        name: text,
                        kienUrl: 'https://www.immaterieelerfgoed.nl' + href
                    });
                }
            });

            return result;
        }''')

        result['domain'] = data.get('domain')
        result['date_added'] = data.get('dateAdded')
        result['unesco_status'] = 'mentioned' if data.get('unesco') else None
        result['description'] = data.get('description')
        result['custodians'] = data.get('custodians', [])
        result['related_links'] = data.get('relatedLinks', [])

        # Extract locations from page text: a plain substring scan against a
        # fixed list of Dutch place names (may yield false positives, e.g.
        # a city name inside unrelated prose).
        dutch_cities = ['Amsterdam', 'Rotterdam', 'Den Haag', 'Utrecht',
                        'Eindhoven', 'Groningen', 'Tilburg', 'Almere',
                        'Breda', 'Nijmegen', 'Arnhem', 'Haarlem', 'Enschede',
                        'Apeldoorn', 'Amersfoort', 'Zaanstad', 'Maastricht',
                        'Leiden', 'Dordrecht', 'Zoetermeer', 'Zwolle',
                        'Deventer', 'Alkmaar', 'Delft', 'Venlo', 'Leeuwarden',
                        'Genemuiden', 'Schiermonnikoog', 'Texel', 'Vlieland',
                        'Terschelling']
        page_text = data.get('pageText', '')
        for city in dutch_cities:
            if city in page_text and city not in result['locations']:
                result['locations'].append(city)

    except Exception as e:
        # Best-effort crawl: record the failure on the item rather than
        # propagating and losing the rest of the run.
        result['error'] = str(e)

    return result


async def main():
    """Run the full crawl: collect URLs, scrape each page, save results."""
    print("=" * 60)
    print("KIEN Heritage Custodian Crawler (Playwright)")
    print("=" * 60)
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="GLAM-Heritage-Research/1.0 (Academic Research)"
        )
        page = await context.new_page()

        # Step 1: Get all heritage form URLs
        print("Step 1: Fetching heritage form URLs from inventory...")
        heritage_items = await get_all_heritage_urls(page)
        print(f"\nFound {len(heritage_items)} heritage forms")

        # Save the URL list
        urls_file = OUTPUT_DIR / "heritage_urls.json"
        with open(urls_file, 'w', encoding='utf-8') as f:
            json.dump(heritage_items, f, ensure_ascii=False, indent=2)
        print(f"Saved URL list to: {urls_file}")

        # Step 2: Extract custodian data from each page
        print("\nStep 2: Extracting custodian data from each heritage form...")
        all_results = []

        for i, item in enumerate(heritage_items, 1):
            print(f"[{i}/{len(heritage_items)}] {item['title']}")
            result = await extract_custodians_from_page(
                page, item['url'], item['title'])
            all_results.append(result)

            # Rate limiting
            await asyncio.sleep(DELAY_MS / 1000)

            # Progress save every 50 items so a crash loses little work.
            if i % 50 == 0:
                progress_file = OUTPUT_DIR / f"custodians_progress_{i}.json"
                with open(progress_file, 'w', encoding='utf-8') as f:
                    json.dump(all_results, f, ensure_ascii=False, indent=2)
                print(f" [PROGRESS] Saved {i} items")

        await browser.close()

    # Step 3: Save final results
    print("\nStep 3: Saving results...")
    output_file = OUTPUT_DIR / "kien_custodians.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
    print(f"Saved full results to: {output_file}")

    # Summary statistics
    total_custodians = sum(len(r['custodians']) for r in all_results)
    total_links = sum(len(r['related_links']) for r in all_results)
    with_custodians = sum(1 for r in all_results if r['custodians'])
    with_links = sum(1 for r in all_results if r['related_links'])
    with_unesco = sum(1 for r in all_results if r['unesco_status'])

    summary = {
        'crawl_date': datetime.now(timezone.utc).isoformat(),
        'total_heritage_forms': len(all_results),
        'forms_with_custodian_profiles': with_custodians,
        'forms_with_external_links': with_links,
        'total_custodian_profiles_found': total_custodians,
        'total_external_links_found': total_links,
        'forms_with_unesco_mention': with_unesco,
        'unique_domains': list(set(
            r['domain'] for r in all_results if r['domain'])),
        'unique_locations': list(set(
            loc for r in all_results for loc in r.get('locations', [])))
    }

    summary_file = OUTPUT_DIR / "crawl_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Total heritage forms: {summary['total_heritage_forms']}")
    print(f"Forms with custodian profiles: {summary['forms_with_custodian_profiles']}")
    print(f"Forms with external links: {summary['forms_with_external_links']}")
    print(f"Total custodian profiles: {summary['total_custodian_profiles_found']}")
    print(f"Total external links: {summary['total_external_links_found']}")
    print(f"Forms with UNESCO mention: {summary['forms_with_unesco_mention']}")
    print(f"\nResults saved to: {OUTPUT_DIR}")


if __name__ == "__main__":
    asyncio.run(main())