#!/usr/bin/env python3
"""
Crawl immaterieelerfgoed.nl (KIEN) to extract custodian data from heritage forms.

This script:
1. Fetches all heritage form URLs from the inventory pagination
2. Visits each heritage form page to extract custodian information
3. Outputs structured JSON data for integration with GLAM schema
   (note: an earlier docstring said YAML, but every writer below emits JSON)

Author: GLAM Project
Date: 2025-12-03
"""

import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Configuration
BASE_URL = "https://www.immaterieelerfgoed.nl"
INVENTORY_URL = f"{BASE_URL}/nl/search"
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
CACHE_DIR = OUTPUT_DIR / "cache"
DELAY_SECONDS = 1.5  # Be polite to the server
MAX_PAGES = 50  # Safety limit for inventory pagination

# URL path fragments that identify navigation/informational pages,
# not inventory items.
SKIP_FRAGMENTS = (
    'search', 'contact', 'over-ons', 'nieuws', 'kennisbank',
    'watisimmaterieelerfgoed', 'veelgestelde-vragen', 'spotten',
    'immaterieel-erfgoed-in-jouw-provincie', 'disclaimer', 'colofon',
    'copyright', 'cookiesverklaring', 'privacy', 'nieuwsbrief', 'pers',
    'partners', 'publicaties', 'Activiteitenplannen', 'missie-visie',
    'oudepublicaties', 'gemeenteenprovincie', 'linksnaarpartners',
    'immaterieel-erfgoed-films',
)

# Section headings under which custodian links appear on a form page.
CUSTODIAN_SECTIONS = ('Gemeenschap', 'Erfgoedbeoefenaars', 'Beoefenaars',
                     'Organisaties', 'Contact')

# Social-media hosts excluded from the "related links" collection.
SOCIAL_HOSTS = ('facebook', 'twitter', 'instagram', 'linkedin', 'youtube')

# Cities checked for plain-text mentions — a heuristic whitelist; smaller
# towns are deliberately not detected.
DUTCH_CITIES = (
    'Amsterdam', 'Rotterdam', 'Den Haag', 'Utrecht', 'Eindhoven',
    'Groningen', 'Tilburg', 'Almere', 'Breda', 'Nijmegen',
    'Arnhem', 'Haarlem', 'Enschede', 'Apeldoorn', 'Amersfoort',
    'Zaanstad', 'Maastricht', 'Leiden', 'Dordrecht', 'Zoetermeer',
)

# Keywords whose presence marks a possible UNESCO listing.
UNESCO_KEYWORDS = ('unesco', 'representatieve lijst', 'werelderfgoed',
                   'immaterieel erfgoed van de mensheid')

# Matches dd-mm-yyyy / dd/mm/yyyy, or a bare 4-digit year.
DATE_PATTERN = re.compile(r'(\d{1,2}[-/]\d{1,2}[-/]\d{4}|\d{4})')

# Shared HTTP session with an identifying User-Agent.
session = requests.Session()
session.headers.update({
    "User-Agent": "GLAM-Heritage-Research/1.0 (Academic Research; contact: heritage@example.org)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "nl,en;q=0.5",
})


def _ensure_dirs() -> None:
    """Create output/cache directories on first use.

    Moved out of module scope so importing this file has no filesystem
    side effects; called from fetch_page() and main().
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    CACHE_DIR.mkdir(parents=True, exist_ok=True)


def _cache_path(url: str) -> Path:
    """Return the cache file for *url* (non-word characters become '_')."""
    cache_key = re.sub(r'[^\w\-]', '_', url.replace(BASE_URL, ''))
    return CACHE_DIR / f"{cache_key}.html"


def _write_json(path: Path, data) -> None:
    """Write *data* as UTF-8 JSON — the one canonical writer for all outputs."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def fetch_page(url: str, use_cache: bool = True) -> str:
    """Fetch *url* and return its HTML text, or '' on any request error.

    Successful responses are cached on disk; with use_cache=True a cached
    copy is returned without touching the network (and without the polite
    delay). Failed fetches are never cached, so they can be retried.
    """
    _ensure_dirs()
    cache_file = _cache_path(url)

    if use_cache and cache_file.exists():
        print(f" [CACHE] {url}")
        return cache_file.read_text(encoding='utf-8')

    print(f" [FETCH] {url}")
    time.sleep(DELAY_SECONDS)  # rate-limit every live request

    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f" [ERROR] Failed to fetch {url}: {e}")
        return ""

    html = response.text
    cache_file.write_text(html, encoding='utf-8')  # only successes are cached
    return html


def get_all_heritage_urls() -> list[dict]:
    """Walk the paginated inventory and collect heritage-form URLs.

    Returns a list of {'title', 'url', 'slug'} dicts, de-duplicated by URL.
    Stops when a page fails to load, yields no new items, lacks a link to
    the next page, or when MAX_PAGES is reached.
    """
    heritage_items: list[dict] = []
    seen_urls: set[str] = set()  # O(1) dedup instead of scanning the list per link
    page = 1

    while True:
        html = fetch_page(f"{INVENTORY_URL}?page={page}")
        if not html:
            break

        soup = BeautifulSoup(html, 'html.parser')

        items_found = 0
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            # Heritage form URLs look like /nl/somename or /nl/page/XXXX/name;
            # skip known navigation/informational paths.
            if not href.startswith('/nl/'):
                continue
            if any(fragment in href for fragment in SKIP_FRAGMENTS):
                continue
            # Inventory entries render their title in a heading inside the link.
            heading = link.find(['h2', 'h3', 'h4'])
            if heading is None:
                continue

            full_url = urljoin(BASE_URL, href)
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            heritage_items.append({
                'title': heading.get_text(strip=True),
                'url': full_url,
                'slug': href.replace('/nl/', '').replace('/page/', ''),
            })
            items_found += 1

        print(f"Page {page}: Found {items_found} heritage forms (total: {len(heritage_items)})")

        # Stop when there is no link to the next page or this page was empty.
        next_link = soup.find('a', href=re.compile(rf'page={page + 1}'))
        if not next_link or items_found == 0:
            break
        page += 1
        if page > MAX_PAGES:
            print(" [WARN] Reached page limit, stopping")
            break

    return heritage_items


def _extract_domain(soup):
    """Return the text node following the first 'domein' label, or None."""
    label = soup.find(string=re.compile('domein', re.I))
    if label:
        value = label.find_next(string=True)
        if value:
            return value.strip()
    return None


def _extract_custodians(soup, result: dict) -> None:
    """Populate result['custodians'] from the known section headings.

    Internal /nl/page/ links become 'organization' entries; off-site links
    become 'external' entries. Duplicates are dropped (fix: the old code
    could append the same custodian once per matching section heading,
    e.g. under both 'Gemeenschap' and 'Contact').
    """
    seen: set[tuple] = set()
    for section_name in CUSTODIAN_SECTIONS:
        marker = soup.find(string=re.compile(section_name, re.I))
        if not marker:
            continue
        container = marker.find_parent(['div', 'section', 'article'])
        if not container:
            continue
        for link in container.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True)
            if len(text) <= 3:  # skip icons / navigation stubs
                continue
            if href.startswith('/nl/'):
                # This might be a custodian page on KIEN itself.
                if 'page/' in href:
                    kien_url = urljoin(BASE_URL, href)
                    if (text, kien_url) not in seen:
                        seen.add((text, kien_url))
                        result['custodians'].append({
                            'name': text,
                            'kien_url': kien_url,
                            'type': 'organization',
                        })
            elif href.startswith('http') and BASE_URL not in href:
                # External website of a custodian organization.
                if (text, href) not in seen:
                    seen.add((text, href))
                    result['custodians'].append({
                        'name': text,
                        'website': href,
                        'type': 'external',
                    })


def _extract_related_links(soup, result: dict) -> None:
    """Collect external, non-social links not already listed as custodian sites."""
    # Hoisted: the custodian list does not change during this loop.
    known_sites = {c['website'] for c in result['custodians'] if 'website' in c}
    for link in soup.find_all('a', href=True):
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if not href.startswith('http') or BASE_URL in href:
            continue
        if any(host in href.lower() for host in SOCIAL_HOSTS):
            continue
        if len(text) <= 3:
            continue
        if href not in known_sites:
            result['related_links'].append({'text': text, 'url': href})


def _extract_locations(page_text: str, result: dict) -> None:
    """Record which known Dutch cities are mentioned anywhere in the page text."""
    for city in DUTCH_CITIES:
        if city in page_text and city not in result['locations']:
            result['locations'].append(city)


def _extract_date_added(soup, result: dict) -> None:
    """Record the first date found next to inventory-related wording.

    Fix: the old code let every later match overwrite the first, so an
    unrelated trailing year could clobber the actual inscription date.
    """
    for text in soup.stripped_strings:
        lowered = text.lower()
        if 'inventaris' in lowered or 'bijgeschreven' in lowered:
            dates = DATE_PATTERN.findall(text)
            if dates:
                result['date_added'] = dates[0]
                return


def _extract_unesco_status(page_text: str, result: dict) -> None:
    """Flag the form when any UNESCO-related keyword appears in the page."""
    lowered = page_text.lower()
    if any(keyword in lowered for keyword in UNESCO_KEYWORDS):
        result['unesco_status'] = 'mentioned'


def _extract_description(soup, result: dict) -> None:
    """Use the meta description, else the first paragraph over 100 chars (capped at 500)."""
    meta_desc = soup.find('meta', {'name': 'description'})
    if meta_desc and meta_desc.get('content'):
        result['description'] = meta_desc['content']
        return
    for p in soup.find_all('p'):
        text = p.get_text(strip=True)
        if len(text) > 100:
            result['description'] = (text[:500] + '...') if len(text) > 500 else text
            return


def extract_custodians_from_page(url: str, title: str) -> dict:
    """Extract custodian information from a heritage form page.

    Returns a dict with the form's metadata, custodians, related links,
    locations and a UTC extraction timestamp. On fetch failure the dict
    carries an 'error' key, an empty custodian list, and none of the
    other metadata keys (consumers must use .get()).
    """
    html = fetch_page(url)
    if not html:
        return {
            'heritage_form': title,
            'url': url,
            'custodians': [],
            'error': 'Failed to fetch page',
        }

    soup = BeautifulSoup(html, 'html.parser')
    page_text = soup.get_text()

    result = {
        'heritage_form': title,
        'url': url,
        'domain': None,
        'date_added': None,
        'unesco_status': None,
        'description': None,
        'custodians': [],
        'related_links': [],
        'locations': [],
        'extracted_at': datetime.now(timezone.utc).isoformat(),
    }

    result['domain'] = _extract_domain(soup)
    _extract_custodians(soup, result)
    _extract_related_links(soup, result)
    _extract_locations(page_text, result)
    _extract_date_added(soup, result)
    _extract_unesco_status(page_text, result)
    _extract_description(soup, result)

    return result


def main() -> None:
    """Run the full crawl: collect URLs, extract each form, write results."""
    _ensure_dirs()

    print("=" * 60)
    print("KIEN Heritage Custodian Crawler")
    print("=" * 60)
    # Timezone-aware, consistent with the summary's crawl_date.
    print(f"Started at: {datetime.now(timezone.utc).isoformat()}")
    print()

    # Step 1: collect every heritage form URL from the paginated inventory.
    print("Step 1: Fetching heritage form URLs from inventory...")
    heritage_items = get_all_heritage_urls()
    print(f"\nFound {len(heritage_items)} heritage forms")

    urls_file = OUTPUT_DIR / "heritage_urls.json"
    _write_json(urls_file, heritage_items)
    print(f"Saved URL list to: {urls_file}")

    # Step 2: visit each form page and extract custodian data.
    print("\nStep 2: Extracting custodian data from each heritage form...")
    all_results = []
    for i, item in enumerate(heritage_items, 1):
        print(f"\n[{i}/{len(heritage_items)}] {item['title']}")
        all_results.append(extract_custodians_from_page(item['url'], item['title']))

        # Checkpoint every 50 items so a crash does not lose everything.
        if i % 50 == 0:
            _write_json(OUTPUT_DIR / f"custodians_progress_{i}.json", all_results)
            print(f" [PROGRESS] Saved {i} items")

    # Step 3: final results plus a small summary file.
    print("\nStep 3: Saving results...")
    output_file = OUTPUT_DIR / "kien_custodians.json"
    _write_json(output_file, all_results)
    print(f"Saved full results to: {output_file}")

    # Summary statistics. All reads use .get(): failed fetches produce
    # dicts without the metadata keys (fix: the old code raised KeyError
    # on 'unesco_status'/'domain' as soon as one page failed to fetch).
    total_custodians = sum(len(r['custodians']) for r in all_results)
    with_custodians = sum(1 for r in all_results if r['custodians'])
    with_unesco = sum(1 for r in all_results if r.get('unesco_status'))

    summary = {
        'crawl_date': datetime.now(timezone.utc).isoformat(),
        'total_heritage_forms': len(all_results),
        'forms_with_custodians': with_custodians,
        'total_custodians_found': total_custodians,
        'forms_with_unesco_mention': with_unesco,
        # sorted() instead of list(set(...)) for reproducible output order.
        'unique_domains': sorted({r.get('domain') for r in all_results if r.get('domain')}),
        'unique_locations': sorted({loc for r in all_results for loc in r.get('locations', [])}),
    }

    _write_json(OUTPUT_DIR / "crawl_summary.json", summary)

    print("\n" + "=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Total heritage forms: {summary['total_heritage_forms']}")
    print(f"Forms with custodians: {summary['forms_with_custodians']}")
    print(f"Total custodians found: {summary['total_custodians_found']}")
    print(f"Forms with UNESCO mention: {summary['forms_with_unesco_mention']}")
    print(f"\nResults saved to: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()