glam/scripts/scrapers/scrape_belarus_isil.py
2025-11-19 23:25:22 +01:00

182 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""
Belarus ISIL Code Web Scraper
Extracts heritage institution data from National Library of Belarus registry.
Author: GLAM Data Extraction Project
Date: 2025-11-17
License: MIT
"""
import requests
from bs4 import BeautifulSoup
import csv
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict
import logging
# Configure logging: timestamped INFO-level messages on the root handler.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger; inherits the basicConfig handler above.
logger = logging.getLogger(__name__)
# Output directory: <repo root>/data/isil/BY (script lives in glam/scripts/scrapers/,
# so three .parent hops reach the repo root). Created at import time if absent.
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "BY"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
class BelarusISILScraper:
    """Scrapes Belarus ISIL registry from National Library of Belarus.

    The registry page presents a single HTML table (class ``info_table``):
    region names appear as rows whose first cell spans both columns, and
    institution rows carry (ISIL code, name) pairs underneath each region.
    """

    def __init__(self):
        # One Session reuses the TCP connection and shares headers across requests.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        self.url = "https://nlb.by/en/for-librarians/international-standard-identifier-for-libraries-and-related-organizations-isil/list-of-libraries-organizations-of-the-republic-of-belarus-and-their-isil-codes"

    def scrape_institutions(self) -> List[Dict]:
        """
        Scrape Belarus ISIL registry from table.

        Returns:
            List of institution dictionaries. An empty list is returned on
            any fetch failure or when the table is missing, so callers can
            always iterate the result.
        """
        logger.info("Starting Belarus ISIL registry scrape...")
        logger.info(f"URL: {self.url}")
        try:
            # timeout added: requests has NO default timeout, so a stalled
            # server would otherwise hang the whole run indefinitely.
            response = self.session.get(self.url, timeout=30)
            response.raise_for_status()
            # Force UTF-8 before .text decodes the body (page mixes Cyrillic/Latin).
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the table with institution data
            table = soup.find('table', class_='info_table')
            if not table:
                logger.error("Could not find institution table on page")
                return []
            institutions = []
            # Carried forward until the next region-header row is seen.
            current_region = None
            # Process each row in the table
            rows = table.find_all('tr')
            logger.info(f"Found {len(rows)} table rows")
            for row in rows:
                cells = row.find_all('td')
                # Skip header rows
                if not cells or len(cells) < 2:
                    continue
                # Check if this is a region header (colspan=2)
                if cells[0].get('colspan') == '2':
                    region_text = cells[0].get_text(strip=True)
                    if region_text and region_text not in ['ISIL-code', 'Library / Organization']:
                        current_region = region_text
                        logger.info(f"Processing region: {current_region}")
                    continue
                # Extract ISIL code and institution name
                isil_code = cells[0].get_text(strip=True)
                name = cells[1].get_text(strip=True)
                # Only rows whose first cell is a Belarus ISIL code ("BY-" prefix)
                # are real institution entries; everything else is layout noise.
                if isil_code and name and isil_code.startswith('BY-'):
                    institution = {
                        'isil_code': isil_code,
                        'name': name,
                        'region': current_region,
                        'country': 'Belarus',
                        'registry': 'National Library of Belarus',
                        'detail_url': self.url
                    }
                    institutions.append(institution)
                    # Lightweight progress indicator every 20 records.
                    if len(institutions) % 20 == 0:
                        logger.info(f"Processed {len(institutions)} institutions...")
            logger.info(f"Successfully scraped {len(institutions)} Belarus institutions")
            return institutions
        except requests.RequestException as e:
            logger.error(f"Error fetching Belarus registry: {e}")
            return []

    def export_to_csv(self, institutions: List[Dict], filename: str):
        """Export scraped data to CSV at OUTPUT_DIR/filename."""
        output_file = OUTPUT_DIR / filename
        if not institutions:
            # Fixed: the message previously printed the literal "(unknown)"
            # instead of the target filename.
            logger.warning(f"No institutions to export to {filename}")
            return
        fieldnames = ['isil_code', 'name', 'region', 'country', 'registry', 'detail_url']
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(institutions)
        logger.info(f"Exported {len(institutions)} records to {output_file}")

    def export_to_json(self, institutions: List[Dict], filename: str):
        """Export scraped data to JSON, wrapping records in provenance metadata."""
        output_file = OUTPUT_DIR / filename
        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'National Library of Belarus ISIL Registry',
            'source_url': self.url,
            'scraper_version': '1.0.0',
            'country': 'Belarus',
            'record_count': len(institutions),
            'institutions': institutions
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        logger.info(f"Exported {len(institutions)} records to {output_file}")
def main():
    """Main execution function: scrape the registry, export CSV/JSON, log a summary."""
    logger.info("=== Belarus ISIL Registry Scraper ===")
    scraper = BelarusISILScraper()
    # Scrape institutions
    institutions = scraper.scrape_institutions()
    if institutions:
        scraper.export_to_csv(institutions, "belarus_isil_all.csv")
        scraper.export_to_json(institutions, "belarus_isil_all.json")
        # Summary by region
        regions = {}
        for inst in institutions:
            # Fixed: every record has a 'region' key (possibly None for rows
            # scraped before the first region header), and dict.get only
            # defaults on MISSING keys — so coalesce None explicitly.
            region = inst.get('region') or 'Unknown'
            regions[region] = regions.get(region, 0) + 1
        logger.info("\n=== Scraping Complete ===")
        logger.info(f"Total institutions extracted: {len(institutions)}")
        logger.info("\nBreakdown by region:")
        # Largest regions first.
        for region, count in sorted(regions.items(), key=lambda x: x[1], reverse=True):
            logger.info(f" - {region}: {count} institutions")
        logger.info(f"\nOutput directory: {OUTPUT_DIR}")
    else:
        logger.error("No institutions were scraped. Check logs for errors.")


if __name__ == "__main__":
    main()