#!/usr/bin/env python3 """ Belgian ISIL Detailed Metadata Scraper Extracts comprehensive metadata from KBR ISIL registry detail pages. Phase 2 enhancement: Scrapes rich metadata from individual institution pages. Author: GLAM Data Extraction Project Date: 2025-11-18 License: MIT """ import csv import json import re import time from datetime import datetime, timezone from pathlib import Path from typing import Dict, List, Optional import requests from bs4 import BeautifulSoup # Configuration BASE_CSV = Path(__file__).parent.parent.parent / "data" / "isil" / "belgian_isil_combined.csv" OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" OUTPUT_CSV = OUTPUT_DIR / "belgian_isil_detailed.csv" OUTPUT_JSON = OUTPUT_DIR / "belgian_isil_detailed.json" LOG_FILE = OUTPUT_DIR / "belgian_isil_detailed_scrape.log" PROGRESS_FILE = OUTPUT_DIR / ".belgian_scrape_progress.json" # Rate limiting (respectful scraping!) REQUEST_DELAY = 3.0 # 3 seconds between requests MAX_RETRIES = 3 # User-Agent HEADERS = { "User-Agent": "GLAM-Data-Extraction-Bot/1.0 (https://github.com/kempersc/glam; research purposes)", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.9,nl;q=0.8,fr;q=0.7", } # CSV output columns CSV_COLUMNS = [ # Basic identification "isil_code", "institution_name", "institution_name_en", "alternative_names", "acronym", # Address "street_address", "postal_code", "city", "country", # Contact "email", "telephone", "fax", "website", # Organizational info "parent_organization", "institution_type", "legal_status", "founding_date", # Collection metadata "collection_description", "subjects", "date_range_start", "date_range_end", "collection_size", "collection_size_unit", "languages", # Access information "publicly_accessible", "access_conditions", "opening_hours", "services_provided", # Additional metadata "notes", "last_updated", # Provenance "detail_url", "scraped_at", "scrape_success", "scrape_error", ] def log_message(message: str): """Log message to console and file.""" timestamp = datetime.now(timezone.utc).isoformat() log_entry = f"[{timestamp}] {message}" print(log_entry) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) with open(LOG_FILE, 'a', encoding='utf-8') as f: f.write(log_entry + '\n') def load_progress() -> Dict: """Load scraping progress from file.""" if PROGRESS_FILE.exists(): with open(PROGRESS_FILE, 'r', encoding='utf-8') as f: return json.load(f) return {"completed_isil_codes": [], "last_index": 0} def save_progress(progress: Dict): """Save scraping progress to file.""" with open(PROGRESS_FILE, 'w', encoding='utf-8') as f: json.dump(progress, f, ensure_ascii=False, indent=2) def fetch_detail_page(url: str) -> Optional[BeautifulSoup]: """Fetch a detail page and return BeautifulSoup object.""" for attempt in range(MAX_RETRIES): try: time.sleep(REQUEST_DELAY) response = requests.get(url, headers=HEADERS, timeout=30) response.raise_for_status() response.encoding = 'utf-8' return BeautifulSoup(response.text, 'html.parser') except requests.RequestException as e: log_message(f"Error fetching {url} (attempt {attempt + 1}/{MAX_RETRIES}): {e}") if attempt < MAX_RETRIES - 1: time.sleep(REQUEST_DELAY * 2) else: return None def extract_text_from_element(element) -> Optional[str]: """Extract clean text from BeautifulSoup element.""" if element: text = element.get_text(separator=' ', strip=True) return text if text else None return None def extract_table_row_value(soup: BeautifulSoup, label: str, case_sensitive: bool = False) -> Optional[str]: """Extract value from HTML table by row label.""" flags = 0 if case_sensitive else re.IGNORECASE # Try to find with label headers = soup.find_all('th', string=re.compile(label, flags)) for header in headers: row = header.find_parent('tr') if row: cells = row.find_all('td') if cells: return extract_text_from_element(cells[0]) # Try to find with bold label labels = soup.find_all('td', string=re.compile(f'^\\s*{label}\\s*$', flags)) for label_cell in labels: row = label_cell.find_parent('tr') if row: cells = row.find_all('td') if len(cells) > 1: return extract_text_from_element(cells[1]) return None def extract_kbr_metadata(soup: BeautifulSoup, isil_code: str, base_name: str) -> Dict: """Extract metadata from KBR detail page.""" data = { 'isil_code': isil_code, 'institution_name': base_name.replace(' [Archive]', ''), 'country': 'BE', 'scrape_success': True, 'scrape_error': None, } # Find all table rows rows = soup.find_all('tr') # Pattern: Extract fields from table rows field_mappings = { 'institution_name_en': r'Name.*English', 'alternative_names': r'Alternative.*name|Other.*name|Acronym', 'acronym': r'Acronym', 'street_address': r'Address|Street', 'postal_code': r'Postal.*code|Zip.*code', 'city': r'City|Town', 'email': r'E-?mail', 'telephone': r'Phone|Telephone|Tel\.', 'fax': r'Fax', 'website': r'Website|URL|Web.*site', 'parent_organization': r'Parent.*organization|Main.*organization', 'institution_type': r'Type.*institution|Institution.*type|Library.*type', 'legal_status': r'Legal.*status|Status', 'founding_date': r'Founding.*date|Established|Created|Foundation.*date', 'collection_description': r'Collection.*description|Holdings|About.*collection', 'subjects': r'Subject|Theme|Topic', 'date_range_start': r'Date.*range.*from|Earliest.*date|Start.*date', 'date_range_end': r'Date.*range.*to|Latest.*date|End.*date', 'collection_size': r'Collection.*size|Holdings.*size|Extent', 'languages': r'Language', 'publicly_accessible': r'Public.*access|Open.*to.*public|Accessible', 'access_conditions': r'Access.*condition|Admission|Entry.*requirement', 'opening_hours': r'Opening.*hour|Schedule|Hours', 'services_provided': r'Service', 'notes': r'Note|Remark|Comment', 'last_updated': r'Last.*updated|Modified|Updated', } for field, pattern in field_mappings.items(): value = extract_table_row_value(soup, pattern) if value: data[field] = value # Special handling for website URLs (extract href) website_cells = soup.find_all('a', href=re.compile(r'^https?://')) if website_cells: websites = [a['href'] for a in website_cells if not a['href'].startswith('https://isil.kbr.be')] if websites: data['website'] = websites[0] # Take first external URL # Extract city and postal code from combined field if present address_full = extract_table_row_value(soup, r'Address.*full|Complete.*address') if address_full and not data.get('city'): # Pattern: "Street, PostalCode City" parts = address_full.split(',') if len(parts) >= 2: postal_city = parts[-1].strip() postal_parts = postal_city.split(maxsplit=1) if len(postal_parts) == 2: data['postal_code'] = postal_parts[0] data['city'] = postal_parts[1] # Parse collection size (extract number and unit) if data.get('collection_size'): size_text = data['collection_size'] # Pattern: "1,000 meters" or "500 linear meters" or "25 m" size_match = re.search(r'([\d,\.]+)\s*(linear\s+)?(meter|metre|m\b|km|shelf|volume|item|document)', size_text, re.IGNORECASE) if size_match: data['collection_size'] = size_match.group(1).replace(',', '') data['collection_size_unit'] = size_match.group(3).lower() # Parse date range (split if in format "YYYY-YYYY") date_range = extract_table_row_value(soup, r'Date.*range|Temporal.*coverage|Period') if date_range and not data.get('date_range_start'): # Pattern: "1800-1950" or "1800-present" date_parts = re.split(r'[-–—to]', date_range) if len(date_parts) >= 2: data['date_range_start'] = date_parts[0].strip() data['date_range_end'] = date_parts[1].strip() # Determine if it's an archive if '[Archive]' in base_name or isil_code.startswith('BE-A'): data['institution_type'] = 'Archive' return data def scrape_institution_detail(isil_code: str, name: str, detail_url: str) -> Dict: """Scrape detailed metadata for a single institution.""" log_message(f"Scraping {isil_code} - {name[:50]}...") data = { 'isil_code': isil_code, 'institution_name': name.replace(' [Archive]', ''), 'detail_url': detail_url, 'scraped_at': datetime.now(timezone.utc).isoformat(), 'country': 'BE', 'scrape_success': False, 'scrape_error': None, } soup = fetch_detail_page(detail_url) if not soup: data['scrape_error'] = "Failed to fetch page" return data try: # Extract metadata based on page structure metadata = extract_kbr_metadata(soup, isil_code, name) data.update(metadata) data['scrape_success'] = True except Exception as e: log_message(f"Error parsing {isil_code}: {e}") data['scrape_error'] = str(e) return data def load_base_institutions() -> List[Dict]: """Load basic institution list from CSV.""" institutions = [] with open(BASE_CSV, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: institutions.append(row) return institutions def save_data(data: List[Dict]): """Save scraped data to CSV and JSON.""" if not data: return # Save CSV with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f: writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS, extrasaction='ignore') writer.writeheader() for row in data: # Fill missing columns with None row_data = {col: row.get(col) for col in CSV_COLUMNS} writer.writerow(row_data) # Save JSON metadata = { 'extraction_date': datetime.now(timezone.utc).isoformat(), 'data_source': 'Belgian ISIL Registry (KBR)', 'scraper_version': '2.0.0', 'record_count': len(data), 'successful_scrapes': sum(1 for d in data if d.get('scrape_success')), 'failed_scrapes': sum(1 for d in data if not d.get('scrape_success')), 'institutions': data, } with open(OUTPUT_JSON, 'w', encoding='utf-8') as f: json.dump(metadata, f, ensure_ascii=False, indent=2) log_message(f"Saved {len(data)} institutions (CSV: {OUTPUT_CSV}, JSON: {OUTPUT_JSON})") def main(): """Main scraper execution.""" start_time = datetime.now(timezone.utc) log_message("=" * 80) log_message("Belgian ISIL Detailed Metadata Scraper (Phase 2)") log_message(f"Started at: {start_time.isoformat()}") log_message("=" * 80) # Load base institutions institutions = load_base_institutions() log_message(f"Loaded {len(institutions)} institutions from base CSV") # Load progress (for resume capability) progress = load_progress() completed_codes = set(progress.get('completed_isil_codes', [])) start_index = progress.get('last_index', 0) if completed_codes: log_message(f"Resuming from index {start_index} ({len(completed_codes)} already completed)") # Scrape each institution all_data = [] for i, inst in enumerate(institutions): if i < start_index: continue isil_code = inst['isil_code'] # Skip if already completed if isil_code in completed_codes: log_message(f"[{i+1}/{len(institutions)}] Skipping {isil_code} (already completed)") continue log_message(f"[{i+1}/{len(institutions)}] Processing {isil_code}...") data = scrape_institution_detail( isil_code, inst['name'], inst['detail_url'] ) all_data.append(data) completed_codes.add(isil_code) # Update progress progress['completed_isil_codes'] = list(completed_codes) progress['last_index'] = i + 1 save_progress(progress) # Save incrementally every 25 institutions if (i + 1) % 25 == 0: log_message(f"Saving progress... ({i+1} institutions scraped)") save_data(all_data) # Final save log_message("Scraping complete. Saving final data...") save_data(all_data) # Clean up progress file if PROGRESS_FILE.exists(): PROGRESS_FILE.unlink() # Summary end_time = datetime.now(timezone.utc) duration = (end_time - start_time).total_seconds() successful = sum(1 for d in all_data if d.get('scrape_success')) failed = len(all_data) - successful log_message("=" * 80) log_message(f"Scraping completed at: {end_time.isoformat()}") log_message(f"Duration: {duration:.1f} seconds ({duration/60:.1f} minutes)") log_message(f"Total institutions: {len(all_data)}") log_message(f"Successful: {successful}") log_message(f"Failed: {failed}") log_message(f"Success rate: {successful/len(all_data)*100:.1f}%") log_message("=" * 80) log_message(f"Output files:") log_message(f" CSV: {OUTPUT_CSV}") log_message(f" JSON: {OUTPUT_JSON}") log_message(f" Log: {LOG_FILE}") log_message("=" * 80) if __name__ == "__main__": main()