#!/usr/bin/env python3
|
|
"""
|
|
Crawl immaterieelerfgoed.nl (KIEN) using Playwright to extract custodian data.
|
|
|
|
This script uses browser automation to handle JavaScript-rendered content.
|
|
|
|
Author: GLAM Project
|
|
Date: 2025-12-03
|
|
"""
|
|
|
|
import asyncio
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path

from playwright.async_api import async_playwright, Page
|
|
|
|
# Configuration
BASE_URL = "https://www.immaterieelerfgoed.nl"
INVENTORY_URL = f"{BASE_URL}/immaterieelerfgoed"

# Where all JSON output is written. The default is the original project
# location; set the KIEN_OUTPUT_DIR environment variable to write elsewhere
# (an unset or empty variable falls back to the default).
OUTPUT_DIR = Path(
    os.environ.get("KIEN_OUTPUT_DIR")
    or "/Users/kempersc/apps/glam/data/intangible_heritage"
)
DELAY_MS = 1500  # Delay between page loads

# Ensure output directory exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
async def get_all_heritage_urls(page: Page) -> list[dict]:
    """Get all heritage form URLs by paginating through the inventory.

    Opens INVENTORY_URL, scrapes the heritage links on the current listing
    page, then follows the link to the next page number until no such link
    exists or the safety limit of 50 pages is exceeded.

    Args:
        page: Playwright page used for navigation and in-page evaluation.

    Returns:
        A list of dicts with the keys ``title`` (heading text of the link),
        ``url`` (absolute URL, BASE_URL + href) and ``slug`` (the href with
        the first '/nl/' and '/page/' occurrences removed). Each URL appears
        once, in order of first discovery.
    """
    heritage_items: list[dict] = []
    seen_urls: set[str] = set()  # O(1) duplicate check across pages

    # Navigate to inventory
    print("Navigating to inventory...")
    await page.goto(INVENTORY_URL)
    await page.wait_for_selector('a[href*="/nl/"]', timeout=10000)
    await asyncio.sleep(2)  # Let JS render

    page_num = 1
    while True:
        print(f"\nProcessing page {page_num}...")

        # Extract heritage item links
        items = await page.evaluate('''() => {
            // Navigation links to skip. Every pattern must be lowercase
            // because it is compared against the lowercased href.
            // NOTE(review): patterns are matched as substrings, so short ones
            // such as 'pers' may also exclude real heritage items - confirm.
            const skipPatterns = [
                'search', 'contact', 'over-ons', 'nieuws', 'kennisbank',
                'watisimmaterieelerfgoed', 'veelgestelde-vragen', 'spotten',
                'immaterieel-erfgoed-in-jouw-provincie', 'disclaimer', 'colofon',
                'copyright', 'cookiesverklaring', 'privacy', 'nieuwsbrief',
                'pers', 'partners', 'publicaties', 'activiteitenplannen',
                'missie-visie', 'oudepublicaties', 'gemeenteenprovincie',
                'linksnaarpartners', 'immaterieel-erfgoed-films', 'inventaris-immaterieel-erfgoed'
            ];

            const items = [];
            const links = document.querySelectorAll('ul li a[href^="/nl/"]');

            links.forEach(link => {
                const href = link.getAttribute('href');
                const heading = link.querySelector('h2, h3, h4');
                const lowerHref = href.toLowerCase();
                const shouldSkip = skipPatterns.some(pattern => lowerHref.includes(pattern));

                // Only links that wrap a heading are treated as heritage items
                if (heading && !shouldSkip) {
                    items.push({
                        title: heading.textContent.trim(),
                        url: href,
                        slug: href.replace('/nl/', '').replace('/page/', '')
                    });
                }
            });

            return items;
        }''')

        new_items = 0
        for item in items:
            full_url = BASE_URL + item['url']
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            item['url'] = full_url
            heritage_items.append(item)
            new_items += 1

        print(f"  Found {new_items} new items (total: {len(heritage_items)})")

        # Check for next page
        # NOTE(review): this substring selector also matches e.g. "page=20"
        # when looking for "page=2"; the first match in DOM order is clicked.
        next_button = await page.query_selector(f'a[href*="page={page_num + 1}"]')
        if not next_button:
            print("  No more pages")
            break

        await next_button.click()
        await asyncio.sleep(DELAY_MS / 1000)
        page_num += 1

        # Safety limit
        if page_num > 50:
            print("  Reached page limit")
            break

    return heritage_items
|
|
|
|
|
|
async def extract_custodians_from_page(page: Page, url: str, title: str) -> dict:
    """Extract custodian information from a heritage form page.

    Loads ``url``, runs one in-page script that collects metadata and links,
    and scans the rendered text for a fixed list of Dutch place names.

    Args:
        page: Playwright page used to load and evaluate the heritage page.
        url: Absolute URL of the heritage form page.
        title: Title of the heritage form, stored as ``heritage_form``.

    Returns:
        A dict with the keys ``heritage_form``, ``url``, ``domain``,
        ``date_added``, ``unesco_status`` ('mentioned' or None),
        ``description``, ``custodians`` (internal '/nl/page/' links, one per
        href), ``related_links`` (external links, one per href),
        ``locations`` and ``extracted_at`` (UTC ISO timestamp). If loading or
        extraction fails, the exception text is stored under ``error`` and
        the fields that were not yet filled keep their defaults; the
        exception is not re-raised.
    """
    result = {
        'heritage_form': title,
        'url': url,
        'domain': None,
        'date_added': None,
        'unesco_status': None,
        'description': None,
        'custodians': [],
        'related_links': [],
        'locations': [],
        'extracted_at': datetime.now(timezone.utc).isoformat()
    }

    # Place names searched for (case-sensitive substring match) in the page text
    dutch_cities = (
        'Amsterdam', 'Rotterdam', 'Den Haag', 'Utrecht', 'Eindhoven',
        'Groningen', 'Tilburg', 'Almere', 'Breda', 'Nijmegen',
        'Arnhem', 'Haarlem', 'Enschede', 'Apeldoorn', 'Amersfoort',
        'Zaanstad', 'Maastricht', 'Leiden', 'Dordrecht', 'Zoetermeer',
        'Zwolle', 'Deventer', 'Alkmaar', 'Delft', 'Venlo', 'Leeuwarden',
        'Genemuiden', 'Schiermonnikoog', 'Texel', 'Vlieland', 'Terschelling',
    )

    try:
        await page.goto(url, wait_until='networkidle')
        await asyncio.sleep(1)

        # Extract page data using JavaScript. BASE_URL is passed in as the
        # function argument so the site host is defined in one place only.
        data = await page.evaluate('''(baseUrl) => {
            // Read the rendered text once and reuse it for every check
            const pageText = document.body.innerText;
            const lowerText = pageText.toLowerCase();

            const result = {
                domain: null,
                dateAdded: null,
                unesco: false,
                description: null,
                custodians: [],
                relatedLinks: [],
                pageText: pageText
            };

            // Get meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                result.description = metaDesc.getAttribute('content');
            }

            // Look for domain info: the rest of the line after "Domein"
            const domainMatch = pageText.match(/Domein[:\\s]+([^\\n]+)/i);
            if (domainMatch) {
                result.domain = domainMatch[1].trim();
            }

            // Look for date added: a year or a d-m-yyyy style date that
            // follows "Bijgeschreven" on the same line
            const dateMatch = pageText.match(/Bijgeschreven.*?(\\d{4}|\\d{1,2}[-\\/]\\d{1,2}[-\\/]\\d{4})/i);
            if (dateMatch) {
                result.dateAdded = dateMatch[1];
            }

            // Check for UNESCO: only detects that the term occurs anywhere
            // in the page text
            if (lowerText.includes('unesco') ||
                lowerText.includes('representatieve lijst')) {
                result.unesco = true;
            }

            // Find all external links (potential custodians),
            // skipping social media and KIEN itself
            const skipDomains = [
                'facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com',
                'youtube.com', 'immaterieelerfgoed.nl', 'google.com', 'maps.google'
            ];
            const seenUrls = new Set();

            document.querySelectorAll('a[href^="http"]').forEach(link => {
                const href = link.getAttribute('href');
                const label = link.textContent.trim();
                const shouldSkip = skipDomains.some(d => href.includes(d));

                if (!shouldSkip && label.length > 2 && !seenUrls.has(href)) {
                    seenUrls.add(href);
                    result.relatedLinks.push({
                        text: label,
                        url: href
                    });
                }
            });

            // Find internal links that might be custodian profiles. Each
            // href is reported once, mirroring the external-link handling.
            const seenProfiles = new Set();

            document.querySelectorAll('a[href^="/nl/page/"]').forEach(link => {
                const href = link.getAttribute('href');
                const label = link.textContent.trim();

                // Skip navigation
                const isNavigation = href.includes('kennisbank') || href.includes('contact');

                if (label.length > 3 && !isNavigation && !seenProfiles.has(href)) {
                    seenProfiles.add(href);
                    result.custodians.push({
                        name: label,
                        kienUrl: baseUrl + href
                    });
                }
            });

            return result;
        }''', BASE_URL)

        result['domain'] = data.get('domain')
        result['date_added'] = data.get('dateAdded')
        result['unesco_status'] = 'mentioned' if data.get('unesco') else None
        result['description'] = data.get('description')
        result['custodians'] = data.get('custodians', [])
        result['related_links'] = data.get('relatedLinks', [])

        # Extract locations from page text; "or ''" guards against a null
        # pageText so the membership test cannot raise
        page_text = data.get('pageText') or ''
        result['locations'] = [city for city in dutch_cities if city in page_text]

    except Exception as e:
        # Deliberate per-page boundary: one failing page must not abort the
        # whole crawl. The failure is recorded in the result and reported.
        result['error'] = str(e)
        print(f"  [WARN] Failed to extract {url}: {e}")

    return result
|
|
|
|
|
|
async def main():
    """Main crawl function.

    Runs the crawl in three steps: collect all heritage form URLs from the
    inventory, extract custodian data from each page (with a progress file
    every 50 items), then write the full results and a summary. All files
    are written as JSON under OUTPUT_DIR.
    """
    print("=" * 60)
    print("KIEN Heritage Custodian Crawler (Playwright)")
    print("=" * 60)
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="GLAM-Heritage-Research/1.0 (Academic Research)"
            )
            page = await context.new_page()

            # Step 1: Get all heritage form URLs
            print("Step 1: Fetching heritage form URLs from inventory...")
            heritage_items = await get_all_heritage_urls(page)
            print(f"\nFound {len(heritage_items)} heritage forms")

            # Save the URL list
            urls_file = OUTPUT_DIR / "heritage_urls.json"
            _write_json(urls_file, heritage_items)
            print(f"Saved URL list to: {urls_file}")

            # Step 2: Extract custodian data from each page
            print("\nStep 2: Extracting custodian data from each heritage form...")
            all_results = []

            for i, item in enumerate(heritage_items, 1):
                print(f"[{i}/{len(heritage_items)}] {item['title']}")
                result = await extract_custodians_from_page(page, item['url'], item['title'])
                all_results.append(result)

                # Rate limiting
                await asyncio.sleep(DELAY_MS / 1000)

                # Progress save every 50 items
                if i % 50 == 0:
                    progress_file = OUTPUT_DIR / f"custodians_progress_{i}.json"
                    _write_json(progress_file, all_results)
                    print(f"  [PROGRESS] Saved {i} items")
        finally:
            # Close the browser even when a step above raises
            await browser.close()

    # Step 3: Save final results
    print("\nStep 3: Saving results...")

    output_file = OUTPUT_DIR / "kien_custodians.json"
    _write_json(output_file, all_results)
    print(f"Saved full results to: {output_file}")

    # Summary statistics
    summary = _build_summary(all_results)
    summary_file = OUTPUT_DIR / "crawl_summary.json"
    _write_json(summary_file, summary)

    print("\n" + "=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Total heritage forms: {summary['total_heritage_forms']}")
    print(f"Forms with custodian profiles: {summary['forms_with_custodian_profiles']}")
    print(f"Forms with external links: {summary['forms_with_external_links']}")
    print(f"Total custodian profiles: {summary['total_custodian_profiles_found']}")
    print(f"Total external links: {summary['total_external_links_found']}")
    print(f"Forms with UNESCO mention: {summary['forms_with_unesco_mention']}")
    print(f"\nResults saved to: {OUTPUT_DIR}")


def _write_json(path, payload) -> None:
    """Write *payload* to *path* as UTF-8 JSON with a two-space indent."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def _build_summary(all_results: list) -> dict:
    """Aggregate per-page results into the crawl summary statistics.

    The unique domain and location lists are sorted so that repeated runs
    over the same data produce an identical summary file.
    """
    return {
        'crawl_date': datetime.now(timezone.utc).isoformat(),
        'total_heritage_forms': len(all_results),
        'forms_with_custodian_profiles': sum(1 for r in all_results if r['custodians']),
        'forms_with_external_links': sum(1 for r in all_results if r['related_links']),
        'total_custodian_profiles_found': sum(len(r['custodians']) for r in all_results),
        'total_external_links_found': sum(len(r['related_links']) for r in all_results),
        'forms_with_unesco_mention': sum(1 for r in all_results if r['unesco_status']),
        'unique_domains': sorted({r['domain'] for r in all_results if r['domain']}),
        'unique_locations': sorted(
            {loc for r in all_results for loc in r.get('locations', [])}
        ),
    }
|
|
|
|
|
|
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    crawl = main()
    asyncio.run(crawl)
|