182 lines
6.6 KiB
Python
182 lines
6.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Belarus ISIL Code Web Scraper
|
|
Extracts heritage institution data from National Library of Belarus registry.
|
|
|
|
Author: GLAM Data Extraction Project
|
|
Date: 2025-11-17
|
|
License: MIT
|
|
"""
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import csv
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import List, Dict
|
|
import logging
|
|
|
|
# Configure logging: timestamped INFO-level messages to stderr for progress
# reporting during long scrapes.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Output directory, resolved relative to this file (three levels up, then
# data/isil/BY). Created at import time so exports never fail on a missing
# directory. NOTE(review): assumes this script lives three directories below
# the repository root — confirm if the file is moved.
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "BY"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
class BelarusISILScraper:
    """Scrapes the Belarus ISIL registry from the National Library of Belarus.

    The registry is a single HTML page containing one table (CSS class
    ``info_table``) in which region names appear as ``colspan="2"`` header
    rows and institutions appear as two-cell rows (ISIL code, name).
    """

    # Seconds to wait for the registry server before aborting the request.
    # requests has NO default timeout, so without this a stalled server
    # would hang the scraper forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # One shared session so the headers (and connection pooling) apply
        # to every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        self.url = "https://nlb.by/en/for-librarians/international-standard-identifier-for-libraries-and-related-organizations-isil/list-of-libraries-organizations-of-the-republic-of-belarus-and-their-isil-codes"

    def scrape_institutions(self) -> List[Dict]:
        """
        Scrape the Belarus ISIL registry table.

        Returns:
            List of institution dictionaries with keys: isil_code, name,
            region, country, registry, detail_url. Empty list on any
            network error or if the expected table is missing. ``region``
            is None for rows preceding the first region header.
        """
        logger.info("Starting Belarus ISIL registry scrape...")
        logger.info(f"URL: {self.url}")

        try:
            # FIX: added timeout so a hung connection cannot block forever.
            response = self.session.get(self.url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            # Force UTF-8: the page contains Cyrillic text and the server's
            # declared charset is not relied upon here.
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the table with institution data.
            table = soup.find('table', class_='info_table')
            if not table:
                logger.error("Could not find institution table on page")
                return []

            institutions = []
            # Carried forward across rows until the next region header row.
            current_region = None

            rows = table.find_all('tr')
            logger.info(f"Found {len(rows)} table rows")

            for row in rows:
                cells = row.find_all('td')

                # Skip header rows and rows without the two expected cells.
                if not cells or len(cells) < 2:
                    continue

                # A colspan=2 first cell marks a region header, not an
                # institution; remember it and move on.
                if cells[0].get('colspan') == '2':
                    region_text = cells[0].get_text(strip=True)
                    if region_text and region_text not in ['ISIL-code', 'Library / Organization']:
                        current_region = region_text
                        logger.info(f"Processing region: {current_region}")
                    continue

                # Extract ISIL code and institution name.
                isil_code = cells[0].get_text(strip=True)
                name = cells[1].get_text(strip=True)

                # Only accept rows that look like real Belarus ISIL entries.
                if isil_code and name and isil_code.startswith('BY-'):
                    institutions.append({
                        'isil_code': isil_code,
                        'name': name,
                        'region': current_region,
                        'country': 'Belarus',
                        'registry': 'National Library of Belarus',
                        'detail_url': self.url
                    })

                    # Periodic progress heartbeat for long tables.
                    if len(institutions) % 20 == 0:
                        logger.info(f"Processed {len(institutions)} institutions...")

            logger.info(f"Successfully scraped {len(institutions)} Belarus institutions")
            return institutions

        except requests.RequestException as e:
            # Best-effort scraper: log and return empty rather than crash.
            logger.error(f"Error fetching Belarus registry: {e}")
            return []

    def export_to_csv(self, institutions: List[Dict], filename: str):
        """Export scraped data to CSV under OUTPUT_DIR.

        Args:
            institutions: Records produced by scrape_institutions().
            filename: Bare file name (no directory component).
        """
        output_file = OUTPUT_DIR / filename

        if not institutions:
            # FIX: the original message contained the literal placeholder
            # "(unknown)" instead of naming the target file.
            logger.warning(f"No institutions to export to {output_file}")
            return

        fieldnames = ['isil_code', 'name', 'region', 'country', 'registry', 'detail_url']

        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(institutions)

        logger.info(f"Exported {len(institutions)} records to {output_file}")

    def export_to_json(self, institutions: List[Dict], filename: str):
        """Export scraped data, wrapped in extraction metadata, to JSON
        under OUTPUT_DIR.

        Args:
            institutions: Records produced by scrape_institutions().
            filename: Bare file name (no directory component).
        """
        output_file = OUTPUT_DIR / filename

        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'National Library of Belarus ISIL Registry',
            'source_url': self.url,
            'scraper_version': '1.0.0',
            'country': 'Belarus',
            'record_count': len(institutions),
            'institutions': institutions
        }

        # ensure_ascii=False keeps Cyrillic names human-readable in the file.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        logger.info(f"Exported {len(institutions)} records to {output_file}")
|
|
|
|
|
|
def main():
    """Run the scrape, export CSV/JSON, and log a per-region summary."""
    logger.info("=== Belarus ISIL Registry Scraper ===")

    scraper = BelarusISILScraper()

    # Scrape institutions from the live registry page.
    institutions = scraper.scrape_institutions()

    if institutions:
        scraper.export_to_csv(institutions, "belarus_isil_all.csv")
        scraper.export_to_json(institutions, "belarus_isil_all.json")

        # Summary by region.
        regions = {}
        for inst in institutions:
            # FIX: the 'region' key is always present (set by the scraper),
            # so dict.get's default never applied; its value may be None for
            # rows before the first region header — map that to 'Unknown'.
            region = inst.get('region') or 'Unknown'
            regions[region] = regions.get(region, 0) + 1

        # FIX: removed extraneous f-prefixes from strings with no placeholders.
        logger.info("\n=== Scraping Complete ===")
        logger.info(f"Total institutions extracted: {len(institutions)}")
        logger.info("\nBreakdown by region:")
        for region, count in sorted(regions.items(), key=lambda x: x[1], reverse=True):
            logger.info(f"  - {region}: {count} institutions")
        logger.info(f"\nOutput directory: {OUTPUT_DIR}")
    else:
        logger.error("No institutions were scraped. Check logs for errors.")
|
|
|
|
|
|
# Entry point guard: allows importing this module (e.g. for reuse of the
# scraper class) without triggering a live scrape.
if __name__ == "__main__":
    main()
|