225 lines
8.1 KiB
Python
225 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Belgian ISIL Code Web Scraper (Optimized Version)
|
||
Extracts heritage institution data from KBR (libraries) and Rijksarchief (archives) registries.
|
||
|
||
Author: GLAM Data Extraction Project
|
||
Date: 2025-11-17
|
||
License: MIT
|
||
"""
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
import time
|
||
import csv
|
||
import json
|
||
import re
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import List, Dict, Optional
|
||
import logging
|
||
|
||
# Root logging setup: INFO level, each record prefixed with a timestamp
# and its severity.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Export destination: "data/isil" under the directory three levels above
# this file. It is created at import time (parents included) when missing.
OUTPUT_DIR = Path(__file__).parent.parent.parent.joinpath("data", "isil")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
|
||
|
||
|
||
class BelgianISILScraper:
    """Scrapes Belgian ISIL registries from KBR and Rijksarchief.

    One ``requests.Session`` (with a bot User-Agent) is shared by all
    requests. Each scrape method fetches a single listing page, extracts
    ``isil_code`` / ``name`` pairs from it and returns them as plain
    dictionaries with the keys ``isil_code``, ``name``, ``detail_url``,
    ``registry`` and ``sector``.
    """

    # Base used to resolve the (possibly relative) detail-page links.
    KBR_BASE_URL = "https://isil.kbr.be/"
    # Per the original author's note, an empty query returns all results.
    KBR_SEARCH_URL = "https://isil.kbr.be/search.php?lang=en&query="
    RIJKSARCHIEF_SEARCH_URL = "http://isil.arch.be/?view=searchisil"

    # Seconds to wait for a server before giving up; without a timeout
    # requests would block forever on a stalled connection.
    DEFAULT_TIMEOUT = 30.0

    # Column order of the CSV export.
    CSV_FIELDNAMES = ['isil_code', 'name', 'registry', 'sector', 'detail_url']

    # Compiled once instead of on every loop iteration.
    _KBR_LINK_HREF = re.compile(r'data\.php\?.*id=')
    # Link text pattern: "BE-XXX00 - Institution Name"
    _KBR_LINK_TEXT = re.compile(r'(BE-[A-Z0-9]+)\s*-\s*(.+)')
    # Archive pattern: "BE-Axxxx" followed by "-", "–" or ":" and a name
    # that runs to the end of the line.
    _RIJKSARCHIEF_ENTRY = re.compile(r'(BE-A\d{4})\s*[-–:]\s*([^\n]+)')

    def __init__(self, timeout: float = DEFAULT_TIMEOUT) -> None:
        """Create the HTTP session used for all registry requests.

        Args:
            timeout: Seconds passed as ``timeout`` to every HTTP request.
        """
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,nl;q=0.8,fr;q=0.7',
        })

    def _fetch_soup(self, url: str) -> "BeautifulSoup":
        """Download ``url`` and parse it as HTML.

        The response is decoded as UTF-8 regardless of what the server
        declares.

        Raises:
            requests.RequestException: On connection errors, timeouts or
                a non-success HTTP status.
        """
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return BeautifulSoup(response.text, 'html.parser')

    @classmethod
    def _parse_kbr_link(cls, link_text: str, href: str) -> Optional[Dict]:
        """Build a KBR record from one result link, or None if it is not one.

        Args:
            link_text: Visible text of the link, already whitespace-stripped.
            href: The link's ``href`` attribute (relative or absolute).

        Returns:
            The institution dictionary, or None when ``link_text`` does not
            start with an ISIL code followed by " - name".
        """
        # Local import keeps this class self-contained with respect to the
        # file's import block.
        from urllib.parse import urljoin

        match = cls._KBR_LINK_TEXT.match(link_text)
        if match is None:
            return None
        return {
            'isil_code': match.group(1).strip(),
            'name': match.group(2).strip(),
            # urljoin handles "data.php?..", "/data.php?.." and absolute
            # hrefs alike; plain concatenation only handled the first form.
            'detail_url': urljoin(cls.KBR_BASE_URL, href),
            'registry': 'KBR',
            'sector': 'libraries',
        }

    @classmethod
    def _parse_rijksarchief_text(cls, text: str) -> List[Dict]:
        """Extract every "BE-Axxxx <sep> name" entry from plain page text."""
        return [
            {
                'isil_code': isil_code.strip(),
                'name': name.strip(),
                'detail_url': f'http://isil.arch.be/?view=searchisil&code={isil_code}',
                'registry': 'Rijksarchief',
                'sector': 'archives',
            }
            for isil_code, name in cls._RIJKSARCHIEF_ENTRY.findall(text)
        ]

    def scrape_kbr_libraries(self) -> List[Dict]:
        """
        Scrape KBR library ISIL registry.

        Fetches the search page once and turns every link whose href looks
        like a ``data.php?...id=`` detail link, and whose text matches
        "BE-CODE - Name", into a record. Links with other text are skipped.

        Returns:
            List of institution dictionaries; empty if the request failed.
        """
        logger.info("Starting KBR library registry scrape...")

        # Only the network call can raise RequestException, so only it is
        # guarded.
        try:
            soup = self._fetch_soup(self.KBR_SEARCH_URL)
        except requests.RequestException as e:
            logger.error("Error fetching KBR registry: %s", e)
            return []

        links = soup.find_all('a', href=self._KBR_LINK_HREF)
        total = len(links)
        logger.info("Found %d institution links", total)

        institutions = []
        for idx, link in enumerate(links, 1):
            record = self._parse_kbr_link(link.get_text(strip=True), link['href'])
            if record is not None:
                institutions.append(record)

            # Progress counts links examined, not records kept.
            if idx % 50 == 0:
                logger.info("Processed %d/%d records...", idx, total)

        logger.info("Successfully scraped %d KBR library records", len(institutions))
        return institutions

    def scrape_rijksarchief_archives(self) -> List[Dict]:
        """
        Scrape Rijksarchief archive ISIL registry.

        Fetches the search page once and scans its visible text for
        archive-format ISIL codes (``BE-A`` plus four digits) followed by a
        separator and a name.

        Returns:
            List of institution dictionaries; empty if the request failed.
        """
        logger.info("Starting Rijksarchief archive registry scrape...")

        try:
            soup = self._fetch_soup(self.RIJKSARCHIEF_SEARCH_URL)
        except requests.RequestException as e:
            logger.error("Error fetching Rijksarchief registry: %s", e)
            return []

        institutions = self._parse_rijksarchief_text(soup.get_text())

        logger.info(
            "Successfully scraped %d Rijksarchief archive records",
            len(institutions),
        )
        return institutions

    def export_to_csv(self, institutions: List[Dict], filename: str) -> None:
        """Export scraped data to CSV.

        Writes a header row plus one row per institution to
        ``OUTPUT_DIR / filename`` as UTF-8. Nothing is written (a warning
        is logged instead) when ``institutions`` is empty.

        Args:
            institutions: Records whose keys are exactly the CSV columns.
            filename: File name relative to ``OUTPUT_DIR``.
        """
        if not institutions:
            logger.warning("No institutions to export to %s", filename)
            return

        output_file = OUTPUT_DIR / filename

        # newline='' lets the csv module control line endings itself.
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=self.CSV_FIELDNAMES)
            writer.writeheader()
            writer.writerows(institutions)

        logger.info("Exported %d records to %s", len(institutions), output_file)

    def export_to_json(self, institutions: List[Dict], filename: str) -> None:
        """Export scraped data to JSON.

        Writes a single object to ``OUTPUT_DIR / filename`` containing
        extraction metadata (UTC timestamp, source, scraper version, record
        count) and the records under ``institutions``. Non-ASCII characters
        are written as-is.

        Args:
            institutions: Records to embed in the document.
            filename: File name relative to ``OUTPUT_DIR``.
        """
        output_file = OUTPUT_DIR / filename

        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'Belgian ISIL Registries',
            'scraper_version': '1.0.1',
            'record_count': len(institutions),
            'institutions': institutions,
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        logger.info("Exported %d records to %s", len(institutions), output_file)
|
||
|
||
|
||
def main() -> int:
    """Main execution function.

    Scrapes the KBR library registry, pauses two seconds, scrapes the
    Rijksarchief archive registry, and writes CSV and JSON exports for each
    non-empty result plus a combined export of both.

    Returns:
        0 when at least one institution was scraped, 1 when both scrapes
        came back empty (so the process exit status reflects the failure).
    """
    logger.info("=== Belgian ISIL Registry Scraper (Fast) ===")
    logger.info("Optimized for speed - no unnecessary delays")

    scraper = BelgianISILScraper()

    def export_both(institutions, stem):
        # Each dataset is written in both formats under the same file stem.
        scraper.export_to_csv(institutions, f"{stem}.csv")
        scraper.export_to_json(institutions, f"{stem}.json")

    # Scrape KBR libraries
    logger.info("\n--- Phase 1: KBR Library Registry ---")
    kbr_institutions = scraper.scrape_kbr_libraries()
    if kbr_institutions:
        export_both(kbr_institutions, "belgian_isil_kbr_libraries")

    # Wait 2 seconds before next registry
    time.sleep(2)

    # Scrape Rijksarchief archives
    logger.info("\n--- Phase 2: Rijksarchief Archive Registry ---")
    rijksarchief_institutions = scraper.scrape_rijksarchief_archives()
    if rijksarchief_institutions:
        export_both(rijksarchief_institutions, "belgian_isil_rijksarchief_archives")

    # Combined export
    all_institutions = kbr_institutions + rijksarchief_institutions
    if not all_institutions:
        logger.error("No institutions were scraped. Check logs for errors.")
        return 1

    export_both(all_institutions, "belgian_isil_combined")

    logger.info("\n=== Scraping Complete ===")
    logger.info("Total institutions extracted: %d", len(all_institutions))
    logger.info(" - KBR libraries: %d", len(kbr_institutions))
    logger.info(" - Rijksarchief archives: %d", len(rijksarchief_institutions))
    logger.info("\nOutput directory: %s", OUTPUT_DIR)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status code to the shell.
    raise SystemExit(main())
|