#!/usr/bin/env python3
"""
Belarus ISIL Code Web Scraper

Extracts heritage institution data from National Library of Belarus registry.

Author: GLAM Data Extraction Project
Date: 2025-11-17
License: MIT
"""

import csv
import json
import logging
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Output directory (created eagerly so exports never fail on a missing path)
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "BY"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seconds to wait for the remote registry before giving up. Without a timeout
# a stalled server would hang the scraper indefinitely.
REQUEST_TIMEOUT = 30


class BelarusISILScraper:
    """Scrapes Belarus ISIL registry from National Library of Belarus."""

    def __init__(self):
        # Reuse one session so keep-alive and headers apply to every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        self.url = "https://nlb.by/en/for-librarians/international-standard-identifier-for-libraries-and-related-organizations-isil/list-of-libraries-organizations-of-the-republic-of-belarus-and-their-isil-codes"

    def scrape_institutions(self) -> List[Dict]:
        """
        Scrape Belarus ISIL registry from table.

        Returns:
            List of institution dictionaries (empty list on any fetch/parse
            failure — callers rely on a list, never an exception).
        """
        logger.info("Starting Belarus ISIL registry scrape...")
        logger.info(f"URL: {self.url}")

        try:
            # Timeout prevents the scraper hanging forever on a dead server.
            response = self.session.get(self.url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            # Force UTF-8: the page mixes Cyrillic names and the declared
            # charset is not always reliable.
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')

            # The registry is published as a single table of rows.
            table = soup.find('table', class_='info_table')
            if not table:
                logger.error("Could not find institution table on page")
                return []

            institutions = []
            current_region = None  # carried forward until the next region header row

            rows = table.find_all('tr')
            logger.info(f"Found {len(rows)} table rows")

            for row in rows:
                cells = row.find_all('td')

                # Skip header rows and anything without at least two cells.
                if not cells or len(cells) < 2:
                    continue

                # Region header rows span both columns (colspan=2); remember
                # the region so following data rows can be tagged with it.
                if cells[0].get('colspan') == '2':
                    region_text = cells[0].get_text(strip=True)
                    if region_text and region_text not in ['ISIL-code', 'Library / Organization']:
                        current_region = region_text
                        logger.info(f"Processing region: {current_region}")
                    continue

                # Data row: column 0 is the ISIL code, column 1 the name.
                isil_code = cells[0].get_text(strip=True)
                name = cells[1].get_text(strip=True)

                # Only accept well-formed Belarus codes ('BY-' prefix) to
                # filter out stray layout rows.
                if isil_code and name and isil_code.startswith('BY-'):
                    institutions.append({
                        'isil_code': isil_code,
                        'name': name,
                        'region': current_region,
                        'country': 'Belarus',
                        'registry': 'National Library of Belarus',
                        'detail_url': self.url,
                    })

                    # Progress heartbeat every 20 records.
                    if len(institutions) % 20 == 0:
                        logger.info(f"Processed {len(institutions)} institutions...")

            logger.info(f"Successfully scraped {len(institutions)} Belarus institutions")
            return institutions

        except requests.RequestException as e:
            # Best-effort contract: log and return empty rather than raise.
            logger.error(f"Error fetching Belarus registry: {e}")
            return []

    def export_to_csv(self, institutions: List[Dict], filename: str):
        """Export scraped data to CSV under OUTPUT_DIR.

        Args:
            institutions: Records produced by scrape_institutions().
            filename: Bare file name; the path is resolved against OUTPUT_DIR.
        """
        output_file = OUTPUT_DIR / filename

        if not institutions:
            # Fixed: previously this message contained a literal placeholder
            # instead of the target file name.
            logger.warning(f"No institutions to export to {output_file}")
            return

        fieldnames = ['isil_code', 'name', 'region', 'country', 'registry', 'detail_url']

        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(institutions)

        logger.info(f"Exported {len(institutions)} records to {output_file}")

    def export_to_json(self, institutions: List[Dict], filename: str):
        """Export scraped data to JSON (records wrapped in provenance metadata).

        Args:
            institutions: Records produced by scrape_institutions().
            filename: Bare file name; the path is resolved against OUTPUT_DIR.
        """
        output_file = OUTPUT_DIR / filename

        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'National Library of Belarus ISIL Registry',
            'source_url': self.url,
            'scraper_version': '1.0.0',
            'country': 'Belarus',
            'record_count': len(institutions),
            'institutions': institutions,
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Cyrillic names human-readable.
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        logger.info(f"Exported {len(institutions)} records to {output_file}")


def main():
    """Scrape the registry, export CSV + JSON, and log a per-region summary."""
    logger.info("=== Belarus ISIL Registry Scraper ===")

    scraper = BelarusISILScraper()
    institutions = scraper.scrape_institutions()

    if institutions:
        scraper.export_to_csv(institutions, "belarus_isil_all.csv")
        scraper.export_to_json(institutions, "belarus_isil_all.json")

        # Summary by region. Note: 'region' is always present but may be
        # None, so `or 'Unknown'` (not a dict.get default) supplies the label.
        regions = Counter(inst.get('region') or 'Unknown' for inst in institutions)

        logger.info(f"\n=== Scraping Complete ===")
        logger.info(f"Total institutions extracted: {len(institutions)}")
        logger.info(f"\nBreakdown by region:")
        for region, count in regions.most_common():
            logger.info(f"  - {region}: {count} institutions")

        logger.info(f"\nOutput directory: {OUTPUT_DIR}")
    else:
        logger.error("No institutions were scraped. Check logs for errors.")


if __name__ == "__main__":
    main()