glam/scripts/crawl_kien_playwright.py
2025-12-05 15:30:23 +01:00

321 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Crawl immaterieelerfgoed.nl (KIEN) using Playwright to extract custodian data.
This script uses browser automation to handle JavaScript-rendered content.
Author: GLAM Project
Date: 2025-12-03
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright, Page
# Configuration
BASE_URL = "https://www.immaterieelerfgoed.nl"
INVENTORY_URL = f"{BASE_URL}/immaterieelerfgoed"
# NOTE(review): machine-specific absolute path — consider making this configurable.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
DELAY_MS = 1500 # Delay between page loads (milliseconds)
# Ensure output directory exists (side effect at import time)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
async def get_all_heritage_urls(page: Page) -> list[dict]:
    """Collect all heritage-form URLs by paginating through the inventory.

    Navigates to the inventory listing and repeatedly extracts item links
    rendered by JavaScript, clicking the "next page" link until none is
    found (or a safety limit of 50 pages is reached).

    Args:
        page: An open Playwright page used for navigation.

    Returns:
        Dicts with keys 'title', 'url' (absolute) and 'slug', deduplicated
        by absolute URL.
    """
    heritage_items: list[dict] = []
    # Fix: track seen URLs in a set for O(1) dedup; the original scanned
    # the whole result list for every candidate (O(n^2) overall).
    seen_urls: set[str] = set()

    # Navigate to inventory
    print("Navigating to inventory...")
    await page.goto(INVENTORY_URL)
    await page.wait_for_selector('a[href*="/nl/"]', timeout=10000)
    await asyncio.sleep(2)  # Let JS render

    page_num = 1
    while True:
        print(f"\nProcessing page {page_num}...")
        # Extract heritage item links in the browser context; site chrome is
        # filtered out via the skip-pattern list below.
        # Fix: 'Activiteitenplannen' was capitalized but compared against
        # href.toLowerCase(), so it could never match — now lowercase.
        items = await page.evaluate('''() => {
            const items = [];
            const links = document.querySelectorAll('ul li a[href^="/nl/"]');
            links.forEach(link => {
                const href = link.getAttribute('href');
                const heading = link.querySelector('h2, h3, h4');
                // Skip navigation links
                const skipPatterns = [
                    'search', 'contact', 'over-ons', 'nieuws', 'kennisbank',
                    'watisimmaterieelerfgoed', 'veelgestelde-vragen', 'spotten',
                    'immaterieel-erfgoed-in-jouw-provincie', 'disclaimer', 'colofon',
                    'copyright', 'cookiesverklaring', 'privacy', 'nieuwsbrief',
                    'pers', 'partners', 'publicaties', 'activiteitenplannen',
                    'missie-visie', 'oudepublicaties', 'gemeenteenprovincie',
                    'linksnaarpartners', 'immaterieel-erfgoed-films', 'inventaris-immaterieel-erfgoed'
                ];
                const shouldSkip = skipPatterns.some(pattern => href.toLowerCase().includes(pattern));
                if (heading && !shouldSkip) {
                    items.push({
                        title: heading.textContent.trim(),
                        url: href,
                        slug: href.replace('/nl/', '').replace('/page/', '')
                    });
                }
            });
            return items;
        }''')

        new_items = 0
        for item in items:
            full_url = BASE_URL + item['url']
            if full_url not in seen_urls:
                seen_urls.add(full_url)
                item['url'] = full_url
                heritage_items.append(item)
                new_items += 1
        print(f" Found {new_items} new items (total: {len(heritage_items)})")

        # Check for next page
        next_button = await page.query_selector(f'a[href*="page={page_num + 1}"]')
        if next_button:
            await next_button.click()
            await asyncio.sleep(DELAY_MS / 1000)  # politeness delay after navigation
            page_num += 1
        else:
            print(" No more pages")
            break

        # Safety limit to avoid an infinite pagination loop
        if page_num > 50:
            print(" Reached page limit")
            break

    return heritage_items
async def extract_custodians_from_page(page: Page, url: str, title: str) -> dict:
    """Extract custodian information from a heritage form page.

    Loads *url* in the given Playwright page, runs an in-browser script to
    pull out the heritage domain, date added, UNESCO mentions, external
    links, and internal custodian-profile links, then scans the page text
    for known Dutch place names. Exceptions are recorded on the result
    instead of propagating, so one bad page does not abort the crawl.

    Args:
        page: An open Playwright page (reused across calls by the caller).
        url: Absolute URL of the heritage form page.
        title: Human-readable title of the heritage form.

    Returns:
        A dict with the keys initialized below; 'extracted_at' is an aware
        UTC ISO-8601 timestamp. On failure an 'error' key is added.
    """
    # Result skeleton; fields stay None/empty when not found on the page.
    result = {
        'heritage_form': title,
        'url': url,
        'domain': None,
        'date_added': None,
        'unesco_status': None,
        'description': None,
        'custodians': [],
        'related_links': [],
        'locations': [],
        'extracted_at': datetime.now(timezone.utc).isoformat()
    }
    try:
        # 'networkidle' waits for JS-driven requests to settle before scraping.
        await page.goto(url, wait_until='networkidle')
        await asyncio.sleep(1)
        # Extract page data using JavaScript evaluated in the browser context.
        data = await page.evaluate('''() => {
            const result = {
                domain: null,
                dateAdded: null,
                unesco: false,
                description: null,
                custodians: [],
                relatedLinks: [],
                pageText: document.body.innerText
            };
            // Get meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                result.description = metaDesc.getAttribute('content');
            }
            // Look for domain info
            const text = document.body.innerText;
            const domainMatch = text.match(/Domein[:\\s]+([^\\n]+)/i);
            if (domainMatch) {
                result.domain = domainMatch[1].trim();
            }
            // Look for date added
            const dateMatch = text.match(/Bijgeschreven.*?(\\d{4}|\\d{1,2}[-\\/]\\d{1,2}[-\\/]\\d{4})/i);
            if (dateMatch) {
                result.dateAdded = dateMatch[1];
            }
            // Check for UNESCO
            if (text.toLowerCase().includes('unesco') ||
                text.toLowerCase().includes('representatieve lijst')) {
                result.unesco = true;
            }
            // Find all external links (potential custodians)
            const links = document.querySelectorAll('a[href^="http"]');
            const seenUrls = new Set();
            links.forEach(link => {
                const href = link.getAttribute('href');
                const text = link.textContent.trim();
                // Skip social media and KIEN itself
                const skipDomains = [
                    'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
                    'youtube.com', 'immaterieelerfgoed.nl', 'google.com', 'maps.google'
                ];
                const shouldSkip = skipDomains.some(d => href.includes(d));
                if (!shouldSkip && text.length > 2 && !seenUrls.has(href)) {
                    seenUrls.add(href);
                    result.relatedLinks.push({
                        text: text,
                        url: href
                    });
                }
            });
            // Find internal links that might be custodian profiles
            const internalLinks = document.querySelectorAll('a[href^="/nl/page/"]');
            internalLinks.forEach(link => {
                const href = link.getAttribute('href');
                const text = link.textContent.trim();
                // Skip navigation
                if (text.length > 3 && !href.includes('kennisbank') && !href.includes('contact')) {
                    result.custodians.push({
                        name: text,
                        kienUrl: 'https://www.immaterieelerfgoed.nl' + href
                    });
                }
            });
            return result;
        }''')
        # Map the camelCase browser payload onto the snake_case result dict.
        result['domain'] = data.get('domain')
        result['date_added'] = data.get('dateAdded')
        result['unesco_status'] = 'mentioned' if data.get('unesco') else None
        result['description'] = data.get('description')
        result['custodians'] = data.get('custodians', [])
        result['related_links'] = data.get('relatedLinks', [])
        # Extract locations from page text.
        # NOTE(review): plain substring matching against a fixed city list may
        # over-match (a city name inside unrelated text) — confirm this
        # precision is acceptable for downstream use.
        dutch_cities = ['Amsterdam', 'Rotterdam', 'Den Haag', 'Utrecht', 'Eindhoven',
                        'Groningen', 'Tilburg', 'Almere', 'Breda', 'Nijmegen',
                        'Arnhem', 'Haarlem', 'Enschede', 'Apeldoorn', 'Amersfoort',
                        'Zaanstad', 'Maastricht', 'Leiden', 'Dordrecht', 'Zoetermeer',
                        'Zwolle', 'Deventer', 'Alkmaar', 'Delft', 'Venlo', 'Leeuwarden',
                        'Genemuiden', 'Schiermonnikoog', 'Texel', 'Vlieland', 'Terschelling']
        page_text = data.get('pageText', '')
        for city in dutch_cities:
            if city in page_text and city not in result['locations']:
                result['locations'].append(city)
    except Exception as e:
        # Record the failure on the result instead of raising, so the
        # caller's crawl loop keeps going.
        result['error'] = str(e)
    return result
async def main():
    """Run the full crawl: collect heritage URLs, extract custodians, save JSON.

    Side effects: launches headless Chromium, writes heritage_urls.json,
    periodic progress files, kien_custodians.json and crawl_summary.json
    under OUTPUT_DIR, and prints progress to stdout.
    """
    print("=" * 60)
    print("KIEN Heritage Custodian Crawler (Playwright)")
    print("=" * 60)
    # Fix: use an aware UTC timestamp, consistent with the 'extracted_at'
    # and 'crawl_date' fields elsewhere in this file (naive local time before).
    print(f"Started at: {datetime.now(timezone.utc).isoformat()}")
    print()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="GLAM-Heritage-Research/1.0 (Academic Research)"
        )
        page = await context.new_page()

        # Step 1: Get all heritage form URLs
        print("Step 1: Fetching heritage form URLs from inventory...")
        heritage_items = await get_all_heritage_urls(page)
        print(f"\nFound {len(heritage_items)} heritage forms")

        # Save the URL list so a partial crawl can still be inspected.
        urls_file = OUTPUT_DIR / "heritage_urls.json"
        with open(urls_file, 'w', encoding='utf-8') as f:
            json.dump(heritage_items, f, ensure_ascii=False, indent=2)
        print(f"Saved URL list to: {urls_file}")

        # Step 2: Extract custodian data from each page
        print("\nStep 2: Extracting custodian data from each heritage form...")
        all_results = []
        try:
            for i, item in enumerate(heritage_items, 1):
                print(f"[{i}/{len(heritage_items)}] {item['title']}")
                result = await extract_custodians_from_page(page, item['url'], item['title'])
                all_results.append(result)
                # Rate limiting
                await asyncio.sleep(DELAY_MS / 1000)
                # Progress save every 50 items
                if i % 50 == 0:
                    progress_file = OUTPUT_DIR / f"custodians_progress_{i}.json"
                    with open(progress_file, 'w', encoding='utf-8') as f:
                        json.dump(all_results, f, ensure_ascii=False, indent=2)
                    print(f" [PROGRESS] Saved {i} items")
        finally:
            # Fix: always release the browser, even if extraction raises
            # (the original skipped close() on any exception in the loop).
            await browser.close()

    # Step 3: Save final results
    print("\nStep 3: Saving results...")
    output_file = OUTPUT_DIR / "kien_custodians.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
    print(f"Saved full results to: {output_file}")

    # Summary statistics
    total_custodians = sum(len(r['custodians']) for r in all_results)
    total_links = sum(len(r['related_links']) for r in all_results)
    with_custodians = sum(1 for r in all_results if r['custodians'])
    with_links = sum(1 for r in all_results if r['related_links'])
    with_unesco = sum(1 for r in all_results if r['unesco_status'])
    summary = {
        'crawl_date': datetime.now(timezone.utc).isoformat(),
        'total_heritage_forms': len(all_results),
        'forms_with_custodian_profiles': with_custodians,
        'forms_with_external_links': with_links,
        'total_custodian_profiles_found': total_custodians,
        'total_external_links_found': total_links,
        'forms_with_unesco_mention': with_unesco,
        # Fix: sort the unique lists so the summary JSON is deterministic
        # (set iteration order is arbitrary between runs).
        'unique_domains': sorted({r['domain'] for r in all_results if r['domain']}),
        'unique_locations': sorted({loc for r in all_results for loc in r.get('locations', [])})
    }
    summary_file = OUTPUT_DIR / "crawl_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Total heritage forms: {summary['total_heritage_forms']}")
    print(f"Forms with custodian profiles: {summary['forms_with_custodian_profiles']}")
    print(f"Forms with external links: {summary['forms_with_external_links']}")
    print(f"Total custodian profiles: {summary['total_custodian_profiles_found']}")
    print(f"Total external links: {summary['total_external_links_found']}")
    print(f"Forms with UNESCO mention: {summary['forms_with_unesco_mention']}")
    print(f"\nResults saved to: {OUTPUT_DIR}")
if __name__ == "__main__":
    # Script entry point: run the async crawl to completion.
    asyncio.run(main())