#!/usr/bin/env python3
"""
Belgian ISIL Code Web Scraper

Extracts heritage institution data from KBR (libraries) and Rijksarchief
(archives) registries.

Author: GLAM Data Extraction Project
Date: 2025-11-17
License: MIT
"""

import csv
import json
import logging
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Output directory (<repo-root>/data/isil, resolved relative to this file)
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


class BelgianISILScraper:
    """Scrapes Belgian ISIL registries from KBR and Rijksarchief."""

    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,nl;q=0.8,fr;q=0.7',
        })
        self.rate_limit_delay = 1  # Respectful 1-second delay between requests

    def _fetch(self, url: str) -> requests.Response:
        """GET *url* after a polite pause.

        Applies the rate-limit delay before each HTTP request, enforces a
        timeout so a hung server cannot stall the script forever, raises
        for HTTP error statuses, and forces UTF-8 decoding (the registry
        pages do not reliably declare a charset).

        Raises:
            requests.RequestException: on network or HTTP errors.
        """
        time.sleep(self.rate_limit_delay)
        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response

    def scrape_kbr_libraries(self) -> List[Dict]:
        """Scrape the KBR library ISIL registry.

        Returns:
            List of institution dictionaries with keys 'isil_code',
            'name', 'detail_url', 'registry', 'sector'. Empty list if
            the request fails.
        """
        logger.info("Starting KBR library registry scrape...")
        # Empty query returns all results
        search_url = "https://isil.kbr.be/search.php?lang=en&query="
        try:
            response = self._fetch(search_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all institution links
            institutions = []
            links = soup.find_all('a', href=re.compile(r'data\.php\?.*id='))
            logger.info(f"Found {len(links)} institution links")

            for idx, link in enumerate(links, 1):
                # Extract ISIL code from link text.
                # Pattern: "BE-XXX00 - Institution Name"
                link_text = link.get_text(strip=True)
                match = re.match(r'(BE-[A-Z0-9]+)\s*-\s*(.+)', link_text)
                if match:
                    isil_code = match.group(1).strip()
                    name = match.group(2).strip()
                    # Detail page URL is relative on the search page.
                    detail_url = "https://isil.kbr.be/" + link['href']
                    institutions.append({
                        'isil_code': isil_code,
                        'name': name,
                        'detail_url': detail_url,
                        'registry': 'KBR',
                        'sector': 'libraries'
                    })
                    logger.info(f"[{idx}/{len(links)}] Extracted: {isil_code} - {name}")
                    # NOTE: no per-link sleep here. Parsing the already
                    # fetched page issues no further requests, so rate
                    # limiting is applied in _fetch() instead (the
                    # original slept once per link for no reason).

            logger.info(f"Successfully scraped {len(institutions)} KBR library records")
            return institutions

        except requests.RequestException as e:
            logger.error(f"Error fetching KBR registry: {e}")
            return []

    def scrape_rijksarchief_archives(self) -> List[Dict]:
        """Scrape the Rijksarchief archive ISIL registry.

        Returns:
            List of institution dictionaries (same shape as
            :meth:`scrape_kbr_libraries`). Empty list if the request
            fails.
        """
        logger.info("Starting Rijksarchief archive registry scrape...")
        # Empty search query
        search_url = "http://isil.arch.be/?view=searchisil"
        try:
            # First, get the search page to understand structure
            response = self._fetch(search_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Note: Rijksarchief site may require form submission.
            # This is a basic implementation - may need refinement.
            institutions = []

            # Look for ISIL code patterns anywhere in the page text.
            # Archive codes follow the pattern BE-A#### followed by a
            # hyphen/en-dash and the institution name.
            text = soup.get_text()
            isil_matches = re.findall(r'(BE-A\d{4})\s*[-–]\s*([^\n]+)', text)

            for isil_code, name in isil_matches:
                institutions.append({
                    'isil_code': isil_code.strip(),
                    'name': name.strip(),
                    'detail_url': f'http://isil.arch.be/?view=searchisil&code={isil_code}',
                    'registry': 'Rijksarchief',
                    'sector': 'archives'
                })
                logger.info(f"Extracted: {isil_code} - {name}")

            logger.info(f"Successfully scraped {len(institutions)} Rijksarchief archive records")
            return institutions

        except requests.RequestException as e:
            logger.error(f"Error fetching Rijksarchief registry: {e}")
            return []

    def export_to_csv(self, institutions: List[Dict], filename: str):
        """Export scraped data to CSV under OUTPUT_DIR/filename."""
        output_file = OUTPUT_DIR / filename
        if not institutions:
            # Fixed: the original message contained an unresolved
            # "(unknown)" placeholder instead of the target filename.
            logger.warning(f"No institutions to export to {filename}")
            return

        fieldnames = ['isil_code', 'name', 'registry', 'sector', 'detail_url']
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(institutions)

        logger.info(f"Exported {len(institutions)} records to {output_file}")

    def export_to_json(self, institutions: List[Dict], filename: str):
        """Export scraped data to JSON (records wrapped with metadata)."""
        output_file = OUTPUT_DIR / filename
        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'Belgian ISIL Registries',
            'scraper_version': '1.0.0',
            'record_count': len(institutions),
            'institutions': institutions
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        logger.info(f"Exported {len(institutions)} records to {output_file}")


def main():
    """Main execution function."""
    logger.info("=== Belgian ISIL Registry Scraper ===")

    scraper = BelgianISILScraper()
    # Fixed: the banner previously hard-coded "3-second" while the
    # scraper is configured with a 1-second delay; report the real value.
    logger.info(f"Respectful web scraping with {scraper.rate_limit_delay}-second rate limiting")

    # Scrape KBR libraries
    logger.info("\n--- Phase 1: KBR Library Registry ---")
    kbr_institutions = scraper.scrape_kbr_libraries()
    if kbr_institutions:
        scraper.export_to_csv(kbr_institutions, "belgian_isil_kbr_libraries.csv")
        scraper.export_to_json(kbr_institutions, "belgian_isil_kbr_libraries.json")

    # Scrape Rijksarchief archives
    logger.info("\n--- Phase 2: Rijksarchief Archive Registry ---")
    rijksarchief_institutions = scraper.scrape_rijksarchief_archives()
    if rijksarchief_institutions:
        scraper.export_to_csv(rijksarchief_institutions, "belgian_isil_rijksarchief_archives.csv")
        scraper.export_to_json(rijksarchief_institutions, "belgian_isil_rijksarchief_archives.json")

    # Combined export
    all_institutions = kbr_institutions + rijksarchief_institutions
    if all_institutions:
        scraper.export_to_csv(all_institutions, "belgian_isil_combined.csv")
        scraper.export_to_json(all_institutions, "belgian_isil_combined.json")

        logger.info(f"\n=== Scraping Complete ===")
        logger.info(f"Total institutions extracted: {len(all_institutions)}")
        logger.info(f" - KBR libraries: {len(kbr_institutions)}")
        logger.info(f" - Rijksarchief archives: {len(rijksarchief_institutions)}")
        logger.info(f"\nOutput directory: {OUTPUT_DIR}")
    else:
        logger.error("No institutions were scraped. Check logs for errors.")


if __name__ == "__main__":
    main()