#!/usr/bin/env python3
"""
Belgian ISIL Detailed Metadata Scraper

Extracts comprehensive metadata from KBR ISIL registry detail pages.
Phase 2 enhancement: Scrapes rich metadata from individual institution pages.

Author: GLAM Data Extraction Project
Date: 2025-11-18
License: MIT
"""

import csv
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup

# Configuration
|
||
BASE_CSV = Path(__file__).parent.parent.parent / "data" / "isil" / "belgian_isil_combined.csv"
|
||
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil"
|
||
OUTPUT_CSV = OUTPUT_DIR / "belgian_isil_detailed.csv"
|
||
OUTPUT_JSON = OUTPUT_DIR / "belgian_isil_detailed.json"
|
||
LOG_FILE = OUTPUT_DIR / "belgian_isil_detailed_scrape.log"
|
||
PROGRESS_FILE = OUTPUT_DIR / ".belgian_scrape_progress.json"
|
||
|
||
# Rate limiting (respectful scraping!)
|
||
REQUEST_DELAY = 3.0 # 3 seconds between requests
|
||
MAX_RETRIES = 3
|
||
|
||
# User-Agent
|
||
HEADERS = {
|
||
"User-Agent": "GLAM-Data-Extraction-Bot/1.0 (https://github.com/kempersc/glam; research purposes)",
|
||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||
"Accept-Language": "en-US,en;q=0.9,nl;q=0.8,fr;q=0.7",
|
||
}
|
||
|
||
# CSV output columns
|
||
CSV_COLUMNS = [
|
||
# Basic identification
|
||
"isil_code",
|
||
"institution_name",
|
||
"institution_name_en",
|
||
"alternative_names",
|
||
"acronym",
|
||
|
||
# Address
|
||
"street_address",
|
||
"postal_code",
|
||
"city",
|
||
"country",
|
||
|
||
# Contact
|
||
"email",
|
||
"telephone",
|
||
"fax",
|
||
"website",
|
||
|
||
# Organizational info
|
||
"parent_organization",
|
||
"institution_type",
|
||
"legal_status",
|
||
"founding_date",
|
||
|
||
# Collection metadata
|
||
"collection_description",
|
||
"subjects",
|
||
"date_range_start",
|
||
"date_range_end",
|
||
"collection_size",
|
||
"collection_size_unit",
|
||
"languages",
|
||
|
||
# Access information
|
||
"publicly_accessible",
|
||
"access_conditions",
|
||
"opening_hours",
|
||
"services_provided",
|
||
|
||
# Additional metadata
|
||
"notes",
|
||
"last_updated",
|
||
|
||
# Provenance
|
||
"detail_url",
|
||
"scraped_at",
|
||
"scrape_success",
|
||
"scrape_error",
|
||
]
|
||
|
||
|
||
def log_message(message: str):
|
||
"""Log message to console and file."""
|
||
timestamp = datetime.now(timezone.utc).isoformat()
|
||
log_entry = f"[{timestamp}] {message}"
|
||
print(log_entry)
|
||
|
||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||
with open(LOG_FILE, 'a', encoding='utf-8') as f:
|
||
f.write(log_entry + '\n')
|
||
|
||
|
||
def load_progress() -> Dict:
|
||
"""Load scraping progress from file."""
|
||
if PROGRESS_FILE.exists():
|
||
with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
|
||
return json.load(f)
|
||
return {"completed_isil_codes": [], "last_index": 0}
|
||
|
||
|
||
def save_progress(progress: Dict):
|
||
"""Save scraping progress to file."""
|
||
with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
|
||
json.dump(progress, f, ensure_ascii=False, indent=2)
|
||
|
||
|
||
def fetch_detail_page(url: str) -> Optional[BeautifulSoup]:
|
||
"""Fetch a detail page and return BeautifulSoup object."""
|
||
for attempt in range(MAX_RETRIES):
|
||
try:
|
||
time.sleep(REQUEST_DELAY)
|
||
response = requests.get(url, headers=HEADERS, timeout=30)
|
||
response.raise_for_status()
|
||
response.encoding = 'utf-8'
|
||
return BeautifulSoup(response.text, 'html.parser')
|
||
except requests.RequestException as e:
|
||
log_message(f"Error fetching {url} (attempt {attempt + 1}/{MAX_RETRIES}): {e}")
|
||
if attempt < MAX_RETRIES - 1:
|
||
time.sleep(REQUEST_DELAY * 2)
|
||
else:
|
||
return None
|
||
|
||
|
||
def extract_text_from_element(element) -> Optional[str]:
|
||
"""Extract clean text from BeautifulSoup element."""
|
||
if element:
|
||
text = element.get_text(separator=' ', strip=True)
|
||
return text if text else None
|
||
return None
|
||
|
||
|
||
def extract_table_row_value(soup: BeautifulSoup, label: str, case_sensitive: bool = False) -> Optional[str]:
|
||
"""Extract value from HTML table by row label."""
|
||
flags = 0 if case_sensitive else re.IGNORECASE
|
||
|
||
# Try to find <th> with label
|
||
headers = soup.find_all('th', string=re.compile(label, flags))
|
||
for header in headers:
|
||
row = header.find_parent('tr')
|
||
if row:
|
||
cells = row.find_all('td')
|
||
if cells:
|
||
return extract_text_from_element(cells[0])
|
||
|
||
# Try to find <td> with bold label
|
||
labels = soup.find_all('td', string=re.compile(f'^\\s*{label}\\s*$', flags))
|
||
for label_cell in labels:
|
||
row = label_cell.find_parent('tr')
|
||
if row:
|
||
cells = row.find_all('td')
|
||
if len(cells) > 1:
|
||
return extract_text_from_element(cells[1])
|
||
|
||
return None
|
||
|
||
|
||
def extract_kbr_metadata(soup: BeautifulSoup, isil_code: str, base_name: str) -> Dict:
|
||
"""Extract metadata from KBR detail page."""
|
||
data = {
|
||
'isil_code': isil_code,
|
||
'institution_name': base_name.replace(' [Archive]', ''),
|
||
'country': 'BE',
|
||
'scrape_success': True,
|
||
'scrape_error': None,
|
||
}
|
||
|
||
# Find all table rows
|
||
rows = soup.find_all('tr')
|
||
|
||
# Pattern: Extract fields from table rows
|
||
field_mappings = {
|
||
'institution_name_en': r'Name.*English',
|
||
'alternative_names': r'Alternative.*name|Other.*name|Acronym',
|
||
'acronym': r'Acronym',
|
||
'street_address': r'Address|Street',
|
||
'postal_code': r'Postal.*code|Zip.*code',
|
||
'city': r'City|Town',
|
||
'email': r'E-?mail',
|
||
'telephone': r'Phone|Telephone|Tel\.',
|
||
'fax': r'Fax',
|
||
'website': r'Website|URL|Web.*site',
|
||
'parent_organization': r'Parent.*organization|Main.*organization',
|
||
'institution_type': r'Type.*institution|Institution.*type|Library.*type',
|
||
'legal_status': r'Legal.*status|Status',
|
||
'founding_date': r'Founding.*date|Established|Created|Foundation.*date',
|
||
'collection_description': r'Collection.*description|Holdings|About.*collection',
|
||
'subjects': r'Subject|Theme|Topic',
|
||
'date_range_start': r'Date.*range.*from|Earliest.*date|Start.*date',
|
||
'date_range_end': r'Date.*range.*to|Latest.*date|End.*date',
|
||
'collection_size': r'Collection.*size|Holdings.*size|Extent',
|
||
'languages': r'Language',
|
||
'publicly_accessible': r'Public.*access|Open.*to.*public|Accessible',
|
||
'access_conditions': r'Access.*condition|Admission|Entry.*requirement',
|
||
'opening_hours': r'Opening.*hour|Schedule|Hours',
|
||
'services_provided': r'Service',
|
||
'notes': r'Note|Remark|Comment',
|
||
'last_updated': r'Last.*updated|Modified|Updated',
|
||
}
|
||
|
||
for field, pattern in field_mappings.items():
|
||
value = extract_table_row_value(soup, pattern)
|
||
if value:
|
||
data[field] = value
|
||
|
||
# Special handling for website URLs (extract href)
|
||
website_cells = soup.find_all('a', href=re.compile(r'^https?://'))
|
||
if website_cells:
|
||
websites = [a['href'] for a in website_cells if not a['href'].startswith('https://isil.kbr.be')]
|
||
if websites:
|
||
data['website'] = websites[0] # Take first external URL
|
||
|
||
# Extract city and postal code from combined field if present
|
||
address_full = extract_table_row_value(soup, r'Address.*full|Complete.*address')
|
||
if address_full and not data.get('city'):
|
||
# Pattern: "Street, PostalCode City"
|
||
parts = address_full.split(',')
|
||
if len(parts) >= 2:
|
||
postal_city = parts[-1].strip()
|
||
postal_parts = postal_city.split(maxsplit=1)
|
||
if len(postal_parts) == 2:
|
||
data['postal_code'] = postal_parts[0]
|
||
data['city'] = postal_parts[1]
|
||
|
||
# Parse collection size (extract number and unit)
|
||
if data.get('collection_size'):
|
||
size_text = data['collection_size']
|
||
# Pattern: "1,000 meters" or "500 linear meters" or "25 m"
|
||
size_match = re.search(r'([\d,\.]+)\s*(linear\s+)?(meter|metre|m\b|km|shelf|volume|item|document)', size_text, re.IGNORECASE)
|
||
if size_match:
|
||
data['collection_size'] = size_match.group(1).replace(',', '')
|
||
data['collection_size_unit'] = size_match.group(3).lower()
|
||
|
||
# Parse date range (split if in format "YYYY-YYYY")
|
||
date_range = extract_table_row_value(soup, r'Date.*range|Temporal.*coverage|Period')
|
||
if date_range and not data.get('date_range_start'):
|
||
# Pattern: "1800-1950" or "1800-present"
|
||
date_parts = re.split(r'[-–—to]', date_range)
|
||
if len(date_parts) >= 2:
|
||
data['date_range_start'] = date_parts[0].strip()
|
||
data['date_range_end'] = date_parts[1].strip()
|
||
|
||
# Determine if it's an archive
|
||
if '[Archive]' in base_name or isil_code.startswith('BE-A'):
|
||
data['institution_type'] = 'Archive'
|
||
|
||
return data
|
||
|
||
|
||
def scrape_institution_detail(isil_code: str, name: str, detail_url: str) -> Dict:
|
||
"""Scrape detailed metadata for a single institution."""
|
||
log_message(f"Scraping {isil_code} - {name[:50]}...")
|
||
|
||
data = {
|
||
'isil_code': isil_code,
|
||
'institution_name': name.replace(' [Archive]', ''),
|
||
'detail_url': detail_url,
|
||
'scraped_at': datetime.now(timezone.utc).isoformat(),
|
||
'country': 'BE',
|
||
'scrape_success': False,
|
||
'scrape_error': None,
|
||
}
|
||
|
||
soup = fetch_detail_page(detail_url)
|
||
if not soup:
|
||
data['scrape_error'] = "Failed to fetch page"
|
||
return data
|
||
|
||
try:
|
||
# Extract metadata based on page structure
|
||
metadata = extract_kbr_metadata(soup, isil_code, name)
|
||
data.update(metadata)
|
||
data['scrape_success'] = True
|
||
except Exception as e:
|
||
log_message(f"Error parsing {isil_code}: {e}")
|
||
data['scrape_error'] = str(e)
|
||
|
||
return data
|
||
|
||
|
||
def load_base_institutions() -> List[Dict]:
|
||
"""Load basic institution list from CSV."""
|
||
institutions = []
|
||
with open(BASE_CSV, 'r', encoding='utf-8') as f:
|
||
reader = csv.DictReader(f)
|
||
for row in reader:
|
||
institutions.append(row)
|
||
return institutions
|
||
|
||
|
||
def save_data(data: List[Dict]):
|
||
"""Save scraped data to CSV and JSON."""
|
||
if not data:
|
||
return
|
||
|
||
# Save CSV
|
||
with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
|
||
writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS, extrasaction='ignore')
|
||
writer.writeheader()
|
||
for row in data:
|
||
# Fill missing columns with None
|
||
row_data = {col: row.get(col) for col in CSV_COLUMNS}
|
||
writer.writerow(row_data)
|
||
|
||
# Save JSON
|
||
metadata = {
|
||
'extraction_date': datetime.now(timezone.utc).isoformat(),
|
||
'data_source': 'Belgian ISIL Registry (KBR)',
|
||
'scraper_version': '2.0.0',
|
||
'record_count': len(data),
|
||
'successful_scrapes': sum(1 for d in data if d.get('scrape_success')),
|
||
'failed_scrapes': sum(1 for d in data if not d.get('scrape_success')),
|
||
'institutions': data,
|
||
}
|
||
|
||
with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
|
||
json.dump(metadata, f, ensure_ascii=False, indent=2)
|
||
|
||
log_message(f"Saved {len(data)} institutions (CSV: {OUTPUT_CSV}, JSON: {OUTPUT_JSON})")
|
||
|
||
|
||
def main():
|
||
"""Main scraper execution."""
|
||
start_time = datetime.now(timezone.utc)
|
||
log_message("=" * 80)
|
||
log_message("Belgian ISIL Detailed Metadata Scraper (Phase 2)")
|
||
log_message(f"Started at: {start_time.isoformat()}")
|
||
log_message("=" * 80)
|
||
|
||
# Load base institutions
|
||
institutions = load_base_institutions()
|
||
log_message(f"Loaded {len(institutions)} institutions from base CSV")
|
||
|
||
# Load progress (for resume capability)
|
||
progress = load_progress()
|
||
completed_codes = set(progress.get('completed_isil_codes', []))
|
||
start_index = progress.get('last_index', 0)
|
||
|
||
if completed_codes:
|
||
log_message(f"Resuming from index {start_index} ({len(completed_codes)} already completed)")
|
||
|
||
# Scrape each institution
|
||
all_data = []
|
||
|
||
for i, inst in enumerate(institutions):
|
||
if i < start_index:
|
||
continue
|
||
|
||
isil_code = inst['isil_code']
|
||
|
||
# Skip if already completed
|
||
if isil_code in completed_codes:
|
||
log_message(f"[{i+1}/{len(institutions)}] Skipping {isil_code} (already completed)")
|
||
continue
|
||
|
||
log_message(f"[{i+1}/{len(institutions)}] Processing {isil_code}...")
|
||
|
||
data = scrape_institution_detail(
|
||
isil_code,
|
||
inst['name'],
|
||
inst['detail_url']
|
||
)
|
||
|
||
all_data.append(data)
|
||
completed_codes.add(isil_code)
|
||
|
||
# Update progress
|
||
progress['completed_isil_codes'] = list(completed_codes)
|
||
progress['last_index'] = i + 1
|
||
save_progress(progress)
|
||
|
||
# Save incrementally every 25 institutions
|
||
if (i + 1) % 25 == 0:
|
||
log_message(f"Saving progress... ({i+1} institutions scraped)")
|
||
save_data(all_data)
|
||
|
||
# Final save
|
||
log_message("Scraping complete. Saving final data...")
|
||
save_data(all_data)
|
||
|
||
# Clean up progress file
|
||
if PROGRESS_FILE.exists():
|
||
PROGRESS_FILE.unlink()
|
||
|
||
# Summary
|
||
end_time = datetime.now(timezone.utc)
|
||
duration = (end_time - start_time).total_seconds()
|
||
successful = sum(1 for d in all_data if d.get('scrape_success'))
|
||
failed = len(all_data) - successful
|
||
|
||
log_message("=" * 80)
|
||
log_message(f"Scraping completed at: {end_time.isoformat()}")
|
||
log_message(f"Duration: {duration:.1f} seconds ({duration/60:.1f} minutes)")
|
||
log_message(f"Total institutions: {len(all_data)}")
|
||
log_message(f"Successful: {successful}")
|
||
log_message(f"Failed: {failed}")
|
||
log_message(f"Success rate: {successful/len(all_data)*100:.1f}%")
|
||
log_message("=" * 80)
|
||
log_message(f"Output files:")
|
||
log_message(f" CSV: {OUTPUT_CSV}")
|
||
log_message(f" JSON: {OUTPUT_JSON}")
|
||
log_message(f" Log: {LOG_FILE}")
|
||
log_message("=" * 80)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|