glam/scripts/crawl_kien_custodians.py
2025-12-05 15:30:23 +01:00

336 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Crawl immaterieelerfgoed.nl (KIEN) to extract custodian data from heritage forms.
This script:
1. Fetches all heritage form URLs from the inventory pagination
2. Visits each heritage form page to extract custodian information
3. Outputs structured YAML data for integration with GLAM schema
Author: GLAM Project
Date: 2025-12-03
"""
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Configuration
BASE_URL = "https://www.immaterieelerfgoed.nl"
INVENTORY_URL = f"{BASE_URL}/nl/search"
# NOTE(review): absolute, user-specific path — assumes this exact machine; confirm before reuse
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
CACHE_DIR = OUTPUT_DIR / "cache"
DELAY_SECONDS = 1.5  # Be polite to the server (sleep before every live fetch)

# Ensure output directories exist (import-time side effect)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Shared HTTP session with identifying headers, reused by all fetches
session = requests.Session()
session.headers.update({
    "User-Agent": "GLAM-Heritage-Research/1.0 (Academic Research; contact: heritage@example.org)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "nl,en;q=0.5",
})
def fetch_page(url: str, use_cache: bool = True) -> str:
    """Return the HTML for *url*, serving from the on-disk cache when available.

    Successful responses are written to the cache; on a network/HTTP error an
    empty string is returned and nothing is cached.
    """
    # Derive a filesystem-safe cache filename from the URL path.
    slug = re.sub(r'[^\w\-]', '_', url.replace(BASE_URL, ''))
    cached = CACHE_DIR / f"{slug}.html"

    if use_cache and cached.exists():
        print(f" [CACHE] {url}")
        return cached.read_text(encoding='utf-8')

    print(f" [FETCH] {url}")
    time.sleep(DELAY_SECONDS)  # throttle every live request
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f" [ERROR] Failed to fetch {url}: {exc}")
        return ""
    cached.write_text(response.text, encoding='utf-8')
    return response.text
def get_all_heritage_urls() -> list[dict]:
    """Collect heritage-form links from the paginated inventory search.

    Returns a list of dicts with keys ``title``, ``url`` and ``slug``,
    de-duplicated by URL. Pagination stops when a page yields no HTML or no
    new items, when no "next page" link exists, or after 50 pages (safety
    limit).
    """
    heritage_items: list[dict] = []
    # Fix: the original checked duplicates by scanning the whole result list
    # for every candidate link (O(n^2) across the crawl); a set makes it O(1).
    seen_urls: set[str] = set()
    # URL fragments that identify site navigation/content pages, not heritage forms
    skip_fragments = [
        'search', 'contact', 'over-ons', 'nieuws', 'kennisbank',
        'watisimmaterieelerfgoed', 'veelgestelde-vragen', 'spotten',
        'immaterieel-erfgoed-in-jouw-provincie', 'disclaimer', 'colofon',
        'copyright', 'cookiesverklaring', 'privacy', 'nieuwsbrief',
        'pers', 'partners', 'publicaties', 'Activiteitenplannen',
        'missie-visie', 'oudepublicaties', 'gemeenteenprovincie',
        'linksnaarpartners', 'immaterieel-erfgoed-films'
    ]
    page = 1
    while True:
        html = fetch_page(f"{INVENTORY_URL}?page={page}")
        if not html:
            break
        soup = BeautifulSoup(html, 'html.parser')
        items_found = 0  # counts NEW (non-duplicate) items on this page
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            # Heritage form URLs look like /nl/somename or /nl/page/XXXX/name
            if not href.startswith('/nl/') or any(skip in href for skip in skip_fragments):
                continue
            # Inventory items render their title inside a heading element
            heading = link.find(['h2', 'h3', 'h4'])
            if not heading:
                continue
            full_url = urljoin(BASE_URL, href)
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            heritage_items.append({
                'title': heading.get_text(strip=True),
                'url': full_url,
                'slug': href.replace('/nl/', '').replace('/page/', '')
            })
            items_found += 1
        print(f"Page {page}: Found {items_found} heritage forms (total: {len(heritage_items)})")
        # Follow pagination only while a link to the next page is present
        next_link = soup.find('a', href=re.compile(rf'page={page + 1}'))
        if not next_link or items_found == 0:
            break
        page += 1
        if page > 50:  # safety limit against runaway pagination
            print(" [WARN] Reached page limit, stopping")
            break
    return heritage_items
def extract_custodians_from_page(url: str, title: str) -> dict:
    """Extract custodian information from a heritage form page.

    Scrapes several heuristic signals from the page at *url*: custodian
    organisations (internal KIEN pages and external websites), other related
    external links, mentions of major Dutch cities, the inventory date,
    UNESCO wording, and a short description.

    Returns a result dict; when the page cannot be fetched the dict contains
    an ``error`` key and an empty ``custodians`` list.
    """
    html = fetch_page(url)
    if not html:
        return {
            'heritage_form': title,
            'url': url,
            'custodians': [],
            'error': 'Failed to fetch page'
        }
    soup = BeautifulSoup(html, 'html.parser')
    result = {
        'heritage_form': title,
        'url': url,
        'domain': None,
        'date_added': None,
        'unesco_status': None,
        'description': None,
        'custodians': [],
        'related_links': [],
        'locations': [],
        'extracted_at': datetime.now(timezone.utc).isoformat()
    }

    # Domain/category: take the text node immediately after the first mention
    # of "domein".  (Fix: the original looped over every string on the page
    # and redid this identical lookup each time, always overwriting the
    # result with the same value — one lookup is equivalent and cheaper.)
    domain_label = soup.find(string=re.compile('domein', re.I))
    if domain_label:
        domain_value = domain_label.find_next(string=True)
        if domain_value:
            result['domain'] = domain_value.strip()

    # Method 1: custodian links inside "Gemeenschap"/"Erfgoedbeoefenaars"
    # style sections.  Several section names can resolve to the same parent
    # container, so de-duplicate entries (fix: the original could append the
    # same custodian once per matching section name).
    seen_custodians: set[tuple[str, str]] = set()
    for section_name in ['Gemeenschap', 'Erfgoedbeoefenaars', 'Beoefenaars', 'Organisaties', 'Contact']:
        section = soup.find(string=re.compile(section_name, re.I))
        if not section:
            continue
        parent = section.find_parent(['div', 'section', 'article'])
        if not parent:
            continue
        for link in parent.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True)
            # Internal KIEN organisation pages (skip short navigation links)
            if href.startswith('/nl/') and len(text) > 3:
                if 'page/' in href:
                    kien_url = urljoin(BASE_URL, href)
                    if (text, kien_url) not in seen_custodians:
                        seen_custodians.add((text, kien_url))
                        result['custodians'].append({
                            'name': text,
                            'kien_url': kien_url,
                            'type': 'organization'
                        })
            elif href.startswith('http') and BASE_URL not in href:
                # External website of a custodian
                if (text, href) not in seen_custodians:
                    seen_custodians.add((text, href))
                    result['custodians'].append({
                        'name': text,
                        'website': href,
                        'type': 'external'
                    })

    # Method 2: any other external links that might be custodian websites.
    # The custodian list is final at this point, so build the lookup set once
    # (fix: the original rescanned the custodian list for every link on the
    # page) and de-duplicate collected links.
    custodian_websites = {c['website'] for c in result['custodians'] if 'website' in c}
    seen_related: set[tuple[str, str]] = set()
    for link in soup.find_all('a', href=True):
        href = link.get('href', '')
        text = link.get_text(strip=True)
        # External links only (not to KIEN, not social media, not stub text)
        if (href.startswith('http') and
                BASE_URL not in href and
                not any(social in href.lower() for social in ['facebook', 'twitter', 'instagram', 'linkedin', 'youtube']) and
                len(text) > 3):
            if href not in custodian_websites and (text, href) not in seen_related:
                seen_related.add((text, href))
                result['related_links'].append({
                    'text': text,
                    'url': href
                })

    # Method 3: naive location detection — substring match against a fixed
    # list of larger Dutch cities.
    dutch_cities = ['Amsterdam', 'Rotterdam', 'Den Haag', 'Utrecht', 'Eindhoven',
                    'Groningen', 'Tilburg', 'Almere', 'Breda', 'Nijmegen',
                    'Arnhem', 'Haarlem', 'Enschede', 'Apeldoorn', 'Amersfoort',
                    'Zaanstad', 'Maastricht', 'Leiden', 'Dordrecht', 'Zoetermeer']
    page_text = soup.get_text()
    for city in dutch_cities:
        if city in page_text and city not in result['locations']:
            result['locations'].append(city)

    # Method 4: date the form was added to the inventory.  When several text
    # nodes mention the inventory, the last one's first date wins (behaviour
    # kept from the original).
    date_pattern = re.compile(r'(\d{1,2}[-/]\d{1,2}[-/]\d{4}|\d{4})')
    for text in soup.stripped_strings:
        if 'inventaris' in text.lower() or 'bijgeschreven' in text.lower():
            dates = date_pattern.findall(text)
            if dates:
                result['date_added'] = dates[0]

    # Method 5: flag any UNESCO-related wording anywhere on the page.
    unesco_keywords = ['unesco', 'representatieve lijst', 'werelderfgoed', 'immaterieel erfgoed van de mensheid']
    page_text_lower = page_text.lower()
    for keyword in unesco_keywords:
        if keyword in page_text_lower:
            result['unesco_status'] = 'mentioned'
            break

    # Description: prefer the meta description, otherwise the first
    # substantial paragraph, truncated to 500 characters.
    meta_desc = soup.find('meta', {'name': 'description'})
    if meta_desc and meta_desc.get('content'):
        result['description'] = meta_desc['content']
    else:
        for p in soup.find_all('p'):
            text = p.get_text(strip=True)
            if len(text) > 100:
                result['description'] = text[:500] + '...' if len(text) > 500 else text
                break
    return result
def main():
    """Run the full crawl: collect URLs, scrape each form page, save results."""
    print("=" * 60)
    print("KIEN Heritage Custodian Crawler")
    print("=" * 60)
    # Aware UTC timestamp, consistent with extracted_at/crawl_date elsewhere
    # in this script (the original mixed in naive local time here).
    print(f"Started at: {datetime.now(timezone.utc).isoformat()}")
    print()

    # Step 1: collect all heritage form URLs from the paginated inventory.
    print("Step 1: Fetching heritage form URLs from inventory...")
    heritage_items = get_all_heritage_urls()
    print(f"\nFound {len(heritage_items)} heritage forms")
    urls_file = OUTPUT_DIR / "heritage_urls.json"
    with open(urls_file, 'w', encoding='utf-8') as f:
        json.dump(heritage_items, f, ensure_ascii=False, indent=2)
    print(f"Saved URL list to: {urls_file}")

    # Step 2: extract custodian data from every heritage form page.
    print("\nStep 2: Extracting custodian data from each heritage form...")
    all_results = []
    for i, item in enumerate(heritage_items, 1):
        print(f"\n[{i}/{len(heritage_items)}] {item['title']}")
        all_results.append(extract_custodians_from_page(item['url'], item['title']))
        # Checkpoint every 50 items so a crash does not lose the whole run.
        if i % 50 == 0:
            progress_file = OUTPUT_DIR / f"custodians_progress_{i}.json"
            with open(progress_file, 'w', encoding='utf-8') as f:
                json.dump(all_results, f, ensure_ascii=False, indent=2)
            print(f" [PROGRESS] Saved {i} items")

    # Step 3: save the full result set plus summary statistics.
    print("\nStep 3: Saving results...")
    output_file = OUTPUT_DIR / "kien_custodians.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
    print(f"Saved full results to: {output_file}")

    # Fix: fetch-failure results only carry {heritage_form, url, custodians,
    # error}, so indexing r['unesco_status'] / r['domain'] raised KeyError as
    # soon as one page failed — use .get() like the 'locations' line already did.
    total_custodians = sum(len(r['custodians']) for r in all_results)
    with_custodians = sum(1 for r in all_results if r['custodians'])
    with_unesco = sum(1 for r in all_results if r.get('unesco_status'))
    summary = {
        'crawl_date': datetime.now(timezone.utc).isoformat(),
        'total_heritage_forms': len(all_results),
        'forms_with_custodians': with_custodians,
        'total_custodians_found': total_custodians,
        'forms_with_unesco_mention': with_unesco,
        # sorted() makes the summary deterministic across runs (set order isn't)
        'unique_domains': sorted({r.get('domain') for r in all_results if r.get('domain')}),
        'unique_locations': sorted({loc for r in all_results for loc in r.get('locations', [])})
    }
    summary_file = OUTPUT_DIR / "crawl_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Total heritage forms: {summary['total_heritage_forms']}")
    print(f"Forms with custodians: {summary['forms_with_custodians']}")
    print(f"Total custodians found: {summary['total_custodians_found']}")
    print(f"Forms with UNESCO mention: {summary['forms_with_unesco_mention']}")
    print(f"\nResults saved to: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()