glam/scripts/scrapers/scrape_belgian_isil_detailed.py
2025-11-19 23:25:22 +01:00

421 lines
14 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Belgian ISIL Detailed Metadata Scraper
Extracts comprehensive metadata from KBR ISIL registry detail pages.
Phase 2 enhancement: Scrapes rich metadata from individual institution pages.
Author: GLAM Data Extraction Project
Date: 2025-11-18
License: MIT
"""
import csv
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup
# Configuration
BASE_CSV = Path(__file__).parent.parent.parent / "data" / "isil" / "belgian_isil_combined.csv"
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil"
OUTPUT_CSV = OUTPUT_DIR / "belgian_isil_detailed.csv"
OUTPUT_JSON = OUTPUT_DIR / "belgian_isil_detailed.json"
LOG_FILE = OUTPUT_DIR / "belgian_isil_detailed_scrape.log"
PROGRESS_FILE = OUTPUT_DIR / ".belgian_scrape_progress.json"
# Rate limiting (respectful scraping!)
REQUEST_DELAY = 3.0 # 3 seconds between requests
MAX_RETRIES = 3
# User-Agent
HEADERS = {
"User-Agent": "GLAM-Data-Extraction-Bot/1.0 (https://github.com/kempersc/glam; research purposes)",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9,nl;q=0.8,fr;q=0.7",
}
# CSV output columns
CSV_COLUMNS = [
# Basic identification
"isil_code",
"institution_name",
"institution_name_en",
"alternative_names",
"acronym",
# Address
"street_address",
"postal_code",
"city",
"country",
# Contact
"email",
"telephone",
"fax",
"website",
# Organizational info
"parent_organization",
"institution_type",
"legal_status",
"founding_date",
# Collection metadata
"collection_description",
"subjects",
"date_range_start",
"date_range_end",
"collection_size",
"collection_size_unit",
"languages",
# Access information
"publicly_accessible",
"access_conditions",
"opening_hours",
"services_provided",
# Additional metadata
"notes",
"last_updated",
# Provenance
"detail_url",
"scraped_at",
"scrape_success",
"scrape_error",
]
def log_message(message: str):
    """Print *message* with a UTC timestamp and append it to the scrape log."""
    stamp = datetime.now(timezone.utc).isoformat()
    entry = f"[{stamp}] {message}"
    print(entry)
    # Ensure the output directory exists before the first log write
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(LOG_FILE, 'a', encoding='utf-8') as log:
        log.write(entry + '\n')
def load_progress() -> Dict:
    """Return saved scrape progress, or a fresh record when none exists."""
    if not PROGRESS_FILE.exists():
        return {"completed_isil_codes": [], "last_index": 0}
    with open(PROGRESS_FILE, 'r', encoding='utf-8') as fh:
        return json.load(fh)
def save_progress(progress: Dict):
    """Persist the scrape progress record as pretty-printed UTF-8 JSON."""
    with open(PROGRESS_FILE, 'w', encoding='utf-8') as fh:
        json.dump(progress, fh, ensure_ascii=False, indent=2)
def fetch_detail_page(url: str) -> Optional[BeautifulSoup]:
    """Fetch *url* (rate-limited, with retries) and return parsed HTML or None.

    Sleeps REQUEST_DELAY before every request; on a failed attempt waits an
    extra 2x delay before retrying, up to MAX_RETRIES attempts total.
    """
    attempt = 0
    while attempt < MAX_RETRIES:
        try:
            time.sleep(REQUEST_DELAY)
            resp = requests.get(url, headers=HEADERS, timeout=30)
            resp.raise_for_status()
            # Force UTF-8 decoding regardless of what the server declares
            resp.encoding = 'utf-8'
            return BeautifulSoup(resp.text, 'html.parser')
        except requests.RequestException as exc:
            log_message(f"Error fetching {url} (attempt {attempt + 1}/{MAX_RETRIES}): {exc}")
            if attempt >= MAX_RETRIES - 1:
                return None
            # Back off before the next try
            time.sleep(REQUEST_DELAY * 2)
        attempt += 1
def extract_text_from_element(element) -> Optional[str]:
    """Return whitespace-collapsed text of a soup element, or None if absent/empty."""
    if not element:
        return None
    text = element.get_text(separator=' ', strip=True)
    return text or None
def extract_table_row_value(soup: BeautifulSoup, label: str, case_sensitive: bool = False) -> Optional[str]:
    """Find a table row whose label matches *label* and return its value cell text.

    Handles two row layouts:
      1. <tr><th>Label</th><td>Value</td></tr>  -> returns the first <td>
      2. <tr><td>Label</td><td>Value</td></tr>  -> returns the second <td>

    Args:
        soup: Parsed page to search.
        label: Regex pattern (may contain alternations) matched against labels.
        case_sensitive: Match case-sensitively when True (default False).

    Returns:
        The value cell's cleaned text, or None if no matching row was found.
    """
    flags = 0 if case_sensitive else re.IGNORECASE
    # Layout 1: <th> carries the label
    headers = soup.find_all('th', string=re.compile(label, flags))
    for header in headers:
        row = header.find_parent('tr')
        if row:
            cells = row.find_all('td')
            if cells:
                return extract_text_from_element(cells[0])
    # Layout 2: first <td> carries the label.
    # BUGFIX: wrap the injected pattern in a non-capturing group — without it
    # an alternation such as 'City|Town' compiles to ('^\s*City' OR 'Town\s*$')
    # because '|' binds looser than the anchors, so partial cell text matched.
    labels = soup.find_all('td', string=re.compile(f'^\\s*(?:{label})\\s*$', flags))
    for label_cell in labels:
        row = label_cell.find_parent('tr')
        if row:
            cells = row.find_all('td')
            if len(cells) > 1:
                return extract_text_from_element(cells[1])
    return None
def extract_kbr_metadata(soup: BeautifulSoup, isil_code: str, base_name: str) -> Dict:
    """Extract metadata fields from a KBR ISIL detail page.

    Args:
        soup: Parsed detail page.
        isil_code: ISIL identifier of the institution.
        base_name: Display name from the base CSV (may carry an ' [Archive]' tag).

    Returns:
        Dict of extracted fields; always includes the identification keys and
        scrape_success=True (parse exceptions are handled by the caller).
    """
    data = {
        'isil_code': isil_code,
        'institution_name': base_name.replace(' [Archive]', ''),
        'country': 'BE',
        'scrape_success': True,
        'scrape_error': None,
    }
    # Output field -> label regex; matched (case-insensitively) against table
    # row labels by extract_table_row_value().
    field_mappings = {
        'institution_name_en': r'Name.*English',
        'alternative_names': r'Alternative.*name|Other.*name|Acronym',
        'acronym': r'Acronym',
        'street_address': r'Address|Street',
        'postal_code': r'Postal.*code|Zip.*code',
        'city': r'City|Town',
        'email': r'E-?mail',
        'telephone': r'Phone|Telephone|Tel\.',
        'fax': r'Fax',
        'website': r'Website|URL|Web.*site',
        'parent_organization': r'Parent.*organization|Main.*organization',
        'institution_type': r'Type.*institution|Institution.*type|Library.*type',
        'legal_status': r'Legal.*status|Status',
        'founding_date': r'Founding.*date|Established|Created|Foundation.*date',
        'collection_description': r'Collection.*description|Holdings|About.*collection',
        'subjects': r'Subject|Theme|Topic',
        'date_range_start': r'Date.*range.*from|Earliest.*date|Start.*date',
        'date_range_end': r'Date.*range.*to|Latest.*date|End.*date',
        'collection_size': r'Collection.*size|Holdings.*size|Extent',
        'languages': r'Language',
        'publicly_accessible': r'Public.*access|Open.*to.*public|Accessible',
        'access_conditions': r'Access.*condition|Admission|Entry.*requirement',
        'opening_hours': r'Opening.*hour|Schedule|Hours',
        'services_provided': r'Service',
        'notes': r'Note|Remark|Comment',
        'last_updated': r'Last.*updated|Modified|Updated',
    }
    for field, pattern in field_mappings.items():
        value = extract_table_row_value(soup, pattern)
        if value:
            data[field] = value
    # Website URLs: take the first external link's href, skipping links that
    # point back into the registry itself
    website_cells = soup.find_all('a', href=re.compile(r'^https?://'))
    if website_cells:
        websites = [a['href'] for a in website_cells if not a['href'].startswith('https://isil.kbr.be')]
        if websites:
            data['website'] = websites[0]
    # Derive city / postal code from a combined address field if we did not
    # get a city directly. Expected pattern: "Street, PostalCode City"
    address_full = extract_table_row_value(soup, r'Address.*full|Complete.*address')
    if address_full and not data.get('city'):
        parts = address_full.split(',')
        if len(parts) >= 2:
            postal_city = parts[-1].strip()
            postal_parts = postal_city.split(maxsplit=1)
            if len(postal_parts) == 2:
                data['postal_code'] = postal_parts[0]
                data['city'] = postal_parts[1]
    # Parse collection size into number + unit,
    # e.g. "1,000 meters", "500 linear meters", "25 m"
    if data.get('collection_size'):
        size_text = data['collection_size']
        size_match = re.search(r'([\d,\.]+)\s*(linear\s+)?(meter|metre|m\b|km|shelf|volume|item|document)', size_text, re.IGNORECASE)
        if size_match:
            data['collection_size'] = size_match.group(1).replace(',', '')
            data['collection_size_unit'] = size_match.group(3).lower()
    # Split a combined date range, e.g. "1800-1950" or "1800 to present"
    date_range = extract_table_row_value(soup, r'Date.*range|Temporal.*coverage|Period')
    if date_range and not data.get('date_range_start'):
        # BUGFIX: the original pattern r'[-—to]' was a character class that
        # split on EVERY 't' and 'o' (mangling e.g. "1800-present"); split on
        # a dash or the whole word "to" instead.
        date_parts = re.split(r'\s*(?:[-–—]|\bto\b)\s*', date_range)
        if len(date_parts) >= 2:
            data['date_range_start'] = date_parts[0].strip()
            data['date_range_end'] = date_parts[1].strip()
    # Archives are flagged either in the display name or by the BE-A prefix
    if '[Archive]' in base_name or isil_code.startswith('BE-A'):
        data['institution_type'] = 'Archive'
    return data
def scrape_institution_detail(isil_code: str, name: str, detail_url: str) -> Dict:
    """Fetch and parse one institution's detail page.

    Always returns a record: on fetch/parse failure, scrape_success stays
    False and scrape_error describes what went wrong.
    """
    log_message(f"Scraping {isil_code} - {name[:50]}...")
    record = {
        'isil_code': isil_code,
        'institution_name': name.replace(' [Archive]', ''),
        'detail_url': detail_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'country': 'BE',
        'scrape_success': False,
        'scrape_error': None,
    }
    soup = fetch_detail_page(detail_url)
    if not soup:
        record['scrape_error'] = "Failed to fetch page"
        return record
    try:
        record.update(extract_kbr_metadata(soup, isil_code, name))
    except Exception as exc:
        log_message(f"Error parsing {isil_code}: {exc}")
        record['scrape_error'] = str(exc)
    else:
        record['scrape_success'] = True
    return record
def load_base_institutions() -> List[Dict]:
    """Read the Phase-1 institution list (one dict per row) from the base CSV."""
    with open(BASE_CSV, 'r', encoding='utf-8') as fh:
        # Materialize the reader before the file closes
        return list(csv.DictReader(fh))
def save_data(data: List[Dict]):
    """Write scraped records to the CSV and JSON outputs (no-op when empty)."""
    if not data:
        return
    # CSV: one row per institution, restricted to the declared columns
    with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as fh:
        writer = csv.DictWriter(fh, fieldnames=CSV_COLUMNS, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(
            {col: record.get(col) for col in CSV_COLUMNS} for record in data
        )
    # JSON: full records wrapped with provenance metadata
    succeeded = sum(1 for record in data if record.get('scrape_success'))
    payload = {
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'data_source': 'Belgian ISIL Registry (KBR)',
        'scraper_version': '2.0.0',
        'record_count': len(data),
        'successful_scrapes': succeeded,
        'failed_scrapes': len(data) - succeeded,
        'institutions': data,
    }
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
    log_message(f"Saved {len(data)} institutions (CSV: {OUTPUT_CSV}, JSON: {OUTPUT_JSON})")
def main():
    """Run the full detail scrape: load base list, scrape each page, save output.

    Supports resuming: progress (completed ISIL codes + last index) is
    persisted after every institution and removed on successful completion.
    """
    start_time = datetime.now(timezone.utc)
    log_message("=" * 80)
    log_message("Belgian ISIL Detailed Metadata Scraper (Phase 2)")
    log_message(f"Started at: {start_time.isoformat()}")
    log_message("=" * 80)
    # Load base institutions (Phase 1 output)
    institutions = load_base_institutions()
    log_message(f"Loaded {len(institutions)} institutions from base CSV")
    # Load progress (for resume capability)
    progress = load_progress()
    completed_codes = set(progress.get('completed_isil_codes', []))
    start_index = progress.get('last_index', 0)
    if completed_codes:
        log_message(f"Resuming from index {start_index} ({len(completed_codes)} already completed)")
    # NOTE(review): on resume, records scraped in a previous session are NOT
    # reloaded, so save_data() below overwrites the output files with only
    # this session's records — TODO: merge with existing output on resume.
    all_data = []
    for i, inst in enumerate(institutions):
        if i < start_index:
            continue
        isil_code = inst['isil_code']
        # Skip if already completed in a previous session
        if isil_code in completed_codes:
            log_message(f"[{i+1}/{len(institutions)}] Skipping {isil_code} (already completed)")
            continue
        log_message(f"[{i+1}/{len(institutions)}] Processing {isil_code}...")
        data = scrape_institution_detail(
            isil_code,
            inst['name'],
            inst['detail_url']
        )
        all_data.append(data)
        completed_codes.add(isil_code)
        # Persist progress after every institution so a crash loses nothing
        progress['completed_isil_codes'] = list(completed_codes)
        progress['last_index'] = i + 1
        save_progress(progress)
        # Save incrementally every 25 institutions
        if (i + 1) % 25 == 0:
            log_message(f"Saving progress... ({i+1} institutions scraped)")
            save_data(all_data)
    # Final save
    log_message("Scraping complete. Saving final data...")
    save_data(all_data)
    # Clean up progress file now that the run finished
    if PROGRESS_FILE.exists():
        PROGRESS_FILE.unlink()
    # Summary
    end_time = datetime.now(timezone.utc)
    duration = (end_time - start_time).total_seconds()
    successful = sum(1 for d in all_data if d.get('scrape_success'))
    failed = len(all_data) - successful
    log_message("=" * 80)
    log_message(f"Scraping completed at: {end_time.isoformat()}")
    log_message(f"Duration: {duration:.1f} seconds ({duration/60:.1f} minutes)")
    log_message(f"Total institutions: {len(all_data)}")
    log_message(f"Successful: {successful}")
    log_message(f"Failed: {failed}")
    # BUGFIX: guard against ZeroDivisionError when nothing was scraped this
    # run (e.g. resuming after everything was already completed).
    if all_data:
        log_message(f"Success rate: {successful/len(all_data)*100:.1f}%")
    log_message("=" * 80)
    log_message(f"Output files:")
    log_message(f"  CSV: {OUTPUT_CSV}")
    log_message(f"  JSON: {OUTPUT_JSON}")
    log_message(f"  Log: {LOG_FILE}")
    log_message("=" * 80)
# Script entry point: run the scraper only when executed directly.
if __name__ == "__main__":
    main()