#!/usr/bin/env python3
"""
Crawl immaterieelerfgoed.nl (KIEN) to extract custodian data from heritage forms.

This script:
1. Fetches all heritage form URLs from the inventory pagination
2. Visits each heritage form page to extract custodian information
3. Outputs structured JSON data for integration with GLAM schema

Author: GLAM Project
Date: 2025-12-03
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# --- Configuration ---------------------------------------------------------
# Site root for KIEN (Kenniscentrum Immaterieel Erfgoed Nederland).
BASE_URL = "https://www.immaterieelerfgoed.nl"
# Paginated inventory search endpoint; heritage-form links are scraped from here.
INVENTORY_URL = f"{BASE_URL}/nl/search"
# NOTE(review): hard-coded absolute user path — consider an env var or CLI
# argument before running this on another machine.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
# Raw fetched HTML is cached here so re-runs avoid re-downloading pages.
CACHE_DIR = OUTPUT_DIR / "cache"
DELAY_SECONDS = 1.5  # Be polite to the server: pause before every network request

# Ensure output directories exist (side effect at import time).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Shared HTTP session carrying identifying headers for all requests.
session = requests.Session()
session.headers.update({
    "User-Agent": "GLAM-Heritage-Research/1.0 (Academic Research; contact: heritage@example.org)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "nl,en;q=0.5",
})
|
|
|
|
|
|
def fetch_page(url: str, use_cache: bool = True) -> str:
    """Fetch a page's HTML, caching it on disk under CACHE_DIR.

    Returns the HTML text, or an empty string when the request fails.
    Cached copies are returned without touching the network.
    """
    # Filesystem-safe cache key derived from the URL path.
    path_part = url.replace(BASE_URL, '')
    cache_key = re.sub(r'[^\w\-]', '_', path_part)
    cache_file = CACHE_DIR / f"{cache_key}.html"

    if use_cache and cache_file.exists():
        print(f" [CACHE] {url}")
        return cache_file.read_text(encoding='utf-8')

    print(f" [FETCH] {url}")
    # Politeness delay before every live request.
    time.sleep(DELAY_SECONDS)

    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        html = response.text
        # Persist the successful response for later runs.
        cache_file.write_text(html, encoding='utf-8')
    except requests.RequestException as e:
        print(f" [ERROR] Failed to fetch {url}: {e}")
        return ""

    return html
|
|
|
|
|
|
def get_all_heritage_urls() -> list[dict]:
    """Walk the paginated inventory and collect every heritage form.

    Returns a list of dicts with 'title', 'url' and 'slug' keys,
    de-duplicated by absolute URL.
    """
    # URL fragments that identify site navigation rather than heritage forms.
    skip_fragments = [
        'search', 'contact', 'over-ons', 'nieuws', 'kennisbank',
        'watisimmaterieelerfgoed', 'veelgestelde-vragen', 'spotten',
        'immaterieel-erfgoed-in-jouw-provincie', 'disclaimer', 'colofon',
        'copyright', 'cookiesverklaring', 'privacy', 'nieuwsbrief',
        'pers', 'partners', 'publicaties', 'Activiteitenplannen',
        'missie-visie', 'oudepublicaties', 'gemeenteenprovincie',
        'linksnaarpartners', 'immaterieel-erfgoed-films'
    ]

    collected: list[dict] = []
    seen_urls: set[str] = set()
    page_no = 1

    while True:
        html = fetch_page(f"{INVENTORY_URL}?page={page_no}")
        if not html:
            break

        soup = BeautifulSoup(html, 'html.parser')
        new_on_page = 0

        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href', '')
            # Heritage forms live under /nl/...; skip navigation pages.
            if not href.startswith('/nl/'):
                continue
            if any(fragment in href for fragment in skip_fragments):
                continue
            # Genuine inventory entries render their title as a heading
            # inside the link element.
            heading = anchor.find(['h2', 'h3', 'h4'])
            if heading is None:
                continue

            full_url = urljoin(BASE_URL, href)
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            collected.append({
                'title': heading.get_text(strip=True),
                'url': full_url,
                'slug': href.replace('/nl/', '').replace('/page/', '')
            })
            new_on_page += 1

        print(f"Page {page_no}: Found {new_on_page} heritage forms (total: {len(collected)})")

        # Stop when this page produced nothing new or no next-page link exists.
        if new_on_page == 0 or not soup.find('a', href=re.compile(rf'page={page_no + 1}')):
            break

        page_no += 1
        if page_no > 50:
            # Hard safety cap against runaway pagination.
            print(" [WARN] Reached page limit, stopping")
            break

    return collected
|
|
|
|
|
|
def extract_custodians_from_page(url: str, title: str) -> dict:
    """Extract custodian information from a heritage form page.

    Fetches the page (via the shared cache-aware fetcher) and scrapes the
    heritage domain, custodian organizations, related external links,
    mentioned Dutch cities, inventory date, UNESCO mentions and a
    description.

    Args:
        url: Absolute URL of the heritage form page.
        title: Display title of the heritage form.

    Returns:
        A dict with the standard result keys. On fetch failure the same
        keys are still present (empty/None) plus an 'error' message, so
        downstream aggregation can index results uniformly without
        KeyError.
    """
    result = {
        'heritage_form': title,
        'url': url,
        'domain': None,
        'date_added': None,
        'unesco_status': None,
        'description': None,
        'custodians': [],
        'related_links': [],
        'locations': [],
        'extracted_at': datetime.now(timezone.utc).isoformat()
    }

    html = fetch_page(url)
    if not html:
        # Keep the full key set (fix: previously only a partial dict was
        # returned, which crashed the summary step in main()).
        result['error'] = 'Failed to fetch page'
        return result

    soup = BeautifulSoup(html, 'html.parser')
    page_text = soup.get_text()

    _extract_domain(soup, result)
    _extract_section_custodians(soup, result)
    _extract_related_links(soup, result)
    _extract_locations(page_text, result)
    _extract_date_added(soup, result)
    _extract_unesco_status(page_text, result)
    _extract_description(soup, result)

    return result


def _extract_domain(soup, result: dict) -> None:
    """Set result['domain'] from the text following the first 'domein' label."""
    # Fix: the original looped over every string containing 'domein' and
    # repeated an identical soup.find() each time; a single lookup yields
    # the same value without the redundant passes.
    label = soup.find(string=re.compile('domein', re.I))
    if label:
        value = label.find_next(string=True)
        if value:
            result['domain'] = value.strip()


def _extract_section_custodians(soup, result: dict) -> None:
    """Collect custodian entries from labelled sections (Gemeenschap, Contact, ...)."""
    for section_name in ['Gemeenschap', 'Erfgoedbeoefenaars', 'Beoefenaars', 'Organisaties', 'Contact']:
        section = soup.find(string=re.compile(section_name, re.I))
        if not section:
            continue
        parent = section.find_parent(['div', 'section', 'article'])
        if not parent:
            continue
        for link in parent.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True)
            if href.startswith('/nl/') and len(text) > 3:
                # Internal KIEN pages under /page/ are organization profiles.
                if 'page/' in href:
                    result['custodians'].append({
                        'name': text,
                        'kien_url': urljoin(BASE_URL, href),
                        'type': 'organization'
                    })
            elif href.startswith('http') and BASE_URL not in href:
                # External custodian website.
                result['custodians'].append({
                    'name': text,
                    'website': href,
                    'type': 'external'
                })


def _extract_related_links(soup, result: dict) -> None:
    """Collect external links that are not social media and not already custodians."""
    social_hosts = ['facebook', 'twitter', 'instagram', 'linkedin', 'youtube']
    for link in soup.find_all('a', href=True):
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if (href.startswith('http') and
                BASE_URL not in href and
                not any(s in href.lower() for s in social_hosts) and
                len(text) > 3):
            # Skip links already recorded as custodian websites.
            if not any(c.get('website') == href for c in result['custodians']):
                result['related_links'].append({
                    'text': text,
                    'url': href
                })


def _extract_locations(page_text: str, result: dict) -> None:
    """Record well-known Dutch city names mentioned anywhere in the page text."""
    dutch_cities = ['Amsterdam', 'Rotterdam', 'Den Haag', 'Utrecht', 'Eindhoven',
                    'Groningen', 'Tilburg', 'Almere', 'Breda', 'Nijmegen',
                    'Arnhem', 'Haarlem', 'Enschede', 'Apeldoorn', 'Amersfoort',
                    'Zaanstad', 'Maastricht', 'Leiden', 'Dordrecht', 'Zoetermeer']
    for city in dutch_cities:
        if city in page_text and city not in result['locations']:
            result['locations'].append(city)


def _extract_date_added(soup, result: dict) -> None:
    """Record a date found near 'inventaris'/'bijgeschreven' mentions.

    NOTE(review): when several strings match, the last one wins — this
    mirrors the original behaviour; confirm whether the first mention
    would be more appropriate.
    """
    date_pattern = re.compile(r'(\d{1,2}[-/]\d{1,2}[-/]\d{4}|\d{4})')
    for text in soup.stripped_strings:
        lowered = text.lower()
        if 'inventaris' in lowered or 'bijgeschreven' in lowered:
            dates = date_pattern.findall(text)
            if dates:
                result['date_added'] = dates[0]


def _extract_unesco_status(page_text: str, result: dict) -> None:
    """Flag pages mentioning UNESCO-related keywords."""
    page_text_lower = page_text.lower()
    for keyword in ['unesco', 'representatieve lijst', 'werelderfgoed', 'immaterieel erfgoed van de mensheid']:
        if keyword in page_text_lower:
            result['unesco_status'] = 'mentioned'
            break


def _extract_description(soup, result: dict) -> None:
    """Use the meta description, else the first paragraph over 100 chars (capped at 500)."""
    meta_desc = soup.find('meta', {'name': 'description'})
    if meta_desc and meta_desc.get('content'):
        result['description'] = meta_desc['content']
        return
    for p in soup.find_all('p'):
        text = p.get_text(strip=True)
        if len(text) > 100:
            result['description'] = text[:500] + '...' if len(text) > 500 else text
            break
|
|
|
|
|
|
def main():
    """Run the full crawl: collect URLs, extract custodian data, write JSON.

    Side effects: writes heritage_urls.json, periodic progress files,
    kien_custodians.json and crawl_summary.json into OUTPUT_DIR.
    """
    print("=" * 60)
    print("KIEN Heritage Custodian Crawler")
    print("=" * 60)
    # Fix: use an aware UTC timestamp, consistent with the 'extracted_at'
    # and 'crawl_date' values recorded in the output data.
    print(f"Started at: {datetime.now(timezone.utc).isoformat()}")
    print()

    # Step 1: gather every heritage form URL from the paginated inventory.
    print("Step 1: Fetching heritage form URLs from inventory...")
    heritage_items = get_all_heritage_urls()
    print(f"\nFound {len(heritage_items)} heritage forms")

    urls_file = OUTPUT_DIR / "heritage_urls.json"
    with open(urls_file, 'w', encoding='utf-8') as f:
        json.dump(heritage_items, f, ensure_ascii=False, indent=2)
    print(f"Saved URL list to: {urls_file}")

    # Step 2: visit each heritage form and extract custodian data.
    print("\nStep 2: Extracting custodian data from each heritage form...")
    all_results = []

    for i, item in enumerate(heritage_items, 1):
        print(f"\n[{i}/{len(heritage_items)}] {item['title']}")
        result = extract_custodians_from_page(item['url'], item['title'])
        all_results.append(result)

        # Checkpoint every 50 items so a crash doesn't lose the whole crawl.
        if i % 50 == 0:
            progress_file = OUTPUT_DIR / f"custodians_progress_{i}.json"
            with open(progress_file, 'w', encoding='utf-8') as f:
                json.dump(all_results, f, ensure_ascii=False, indent=2)
            print(f" [PROGRESS] Saved {i} items")

    # Step 3: final outputs.
    print("\nStep 3: Saving results...")

    output_file = OUTPUT_DIR / "kien_custodians.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
    print(f"Saved full results to: {output_file}")

    # Summary statistics. Fix: use .get() throughout — results produced on
    # the fetch-failure path may lack some keys, and plain indexing raised
    # KeyError here.
    total_custodians = sum(len(r.get('custodians', [])) for r in all_results)
    with_custodians = sum(1 for r in all_results if r.get('custodians'))
    with_unesco = sum(1 for r in all_results if r.get('unesco_status'))

    summary = {
        'crawl_date': datetime.now(timezone.utc).isoformat(),
        'total_heritage_forms': len(all_results),
        'forms_with_custodians': with_custodians,
        'total_custodians_found': total_custodians,
        'forms_with_unesco_mention': with_unesco,
        # sorted() makes the JSON output deterministic across runs
        # (list(set(...)) ordering was arbitrary).
        'unique_domains': sorted({r['domain'] for r in all_results if r.get('domain')}),
        'unique_locations': sorted({loc for r in all_results for loc in r.get('locations', [])})
    }

    summary_file = OUTPUT_DIR / "crawl_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Total heritage forms: {summary['total_heritage_forms']}")
    print(f"Forms with custodians: {summary['forms_with_custodians']}")
    print(f"Total custodians found: {summary['total_custodians_found']}")
    print(f"Forms with UNESCO mention: {summary['forms_with_unesco_mention']}")
    print(f"\nResults saved to: {OUTPUT_DIR}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the crawler only when executed as a script (not on import).
    main()
|