glam/scripts/scrapers/bulgarian_isil_scraper.py
2025-11-19 23:25:22 +01:00

253 lines
8.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Bulgarian ISIL Registry Scraper
Extracts heritage institution data from the Bulgarian National Library's ISIL registry.
Source: http://www.nationallibrary.bg/wp/?page_id=5686
Author: GLAM Data Extraction Project
Date: 2025-11-18
"""
import csv
import json
import re
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List

from bs4 import BeautifulSoup
# Mapping of Bulgarian field labels (as they appear in the registry HTML)
# to English dictionary keys.
#
# NOTE: The source HTML has inconsistent spacing (typos):
#   - 24 tables: "Наименование на организацията" (correct)
#   - 70 tables: "Наименование наорганизацията" (typo - no space)
# We handle BOTH variants to capture all 94 institutions.
# Hoisted to module level so it is built once, not once per table row.
_FIELD_MAPPING = {
    'ISIL': 'isil',
    # Name fields (with and without space typos)
    'Наименование на организацията': 'name_bg',
    'Наименование наорганизацията': 'name_bg',  # Typo variant
    'Наименование на английски език': 'name_en',
    'Наименование наанглийски език': 'name_en',  # Typo variant
    # Name variants (with typo)
    'Варианти на името': 'name_variants',
    'Варианти наимето': 'name_variants',  # Typo variant
    # Other fields
    'Тип': 'library_type',
    'Адрес': 'address',
    'Телефон/факс': 'phone_fax',
    'Е-mail': 'email',
    'Уебсайт': 'website',
    'Е-каталог': 'online_catalog',
    'Достъпност': 'accessibility',
    'Работно време': 'opening_hours',
    'Колекции': 'collections',
    'Обем на фонда': 'collection_size',
    # Interlibrary loan (with typo)
    'Междубиблиотечно заемане': 'interlibrary_loan',
    'Междубиблиотечнозаемане': 'interlibrary_loan',  # Typo variant
}


def extract_table_data(table) -> Dict[str, str]:
    """
    Extract data from a single HTML table representing one institution.

    Each table row is expected to hold a (label, value) pair in two <td>
    cells; rows with any other cell count are skipped.

    Args:
        table: BeautifulSoup table element

    Returns:
        Dictionary with institution data (only fields found in
        ``_FIELD_MAPPING`` are included)
    """
    data: Dict[str, str] = {}
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 2:
            # Not a label/value pair (e.g. header or malformed row).
            continue
        # Clean up field names: the registry wraps some labels in '**'.
        field_name = cells[0].get_text(strip=True).replace('**', '').strip()
        field_value = cells[1].get_text(strip=True)
        english_key = _FIELD_MAPPING.get(field_name)
        if english_key is None:
            continue
        if english_key == 'email':
            # Clean up email addresses (remove mailto:) — fixed prefix,
            # so plain str.replace suffices (no regex needed).
            field_value = field_value.replace('mailto:', '')
        data[english_key] = field_value
    return data
def parse_bulgarian_isil_html(html_content: str) -> List[Dict[str, str]]:
    """
    Parse the Bulgarian ISIL registry HTML page.

    The registry renders one <table> per institution inside a
    ``div.post-content`` container.

    Args:
        html_content: Raw HTML content from the registry page

    Returns:
        List of dictionaries, each representing an institution
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    container = soup.find('div', class_='post-content')
    if container is None:
        print("Warning: Could not find content div")
        return []

    records: List[Dict[str, str]] = []
    # One table == one institution; number tables from 1 for warnings.
    for table_no, table in enumerate(container.find_all('table'), start=1):
        try:
            record = extract_table_data(table)
        except Exception as e:
            print(f"Warning: Error parsing table {table_no}: {e}")
            continue
        # A non-empty ISIL code validates it's a real institution record.
        if record.get('isil'):
            records.append(record)
    return records
def export_to_csv(institutions: List[Dict[str, str]], output_path: Path):
    """
    Export institutions to CSV format.

    Missing fields are written as empty cells; unexpected keys are
    silently dropped rather than aborting the whole export.

    Args:
        institutions: List of institution dictionaries
        output_path: Path to output CSV file
    """
    if not institutions:
        print("No institutions to export")
        return
    # Define CSV columns (order of the output file).
    fieldnames = [
        'isil',
        'name_bg',
        'name_en',
        'name_variants',
        'library_type',
        'address',
        'phone_fax',
        'email',
        'website',
        'online_catalog',
        'accessibility',
        'opening_hours',
        'collections',
        'collection_size',
        'interlibrary_loan'
    ]
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        # extrasaction='ignore': a stray key in one record must not raise
        # ValueError and lose the entire export (default is 'raise').
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(institutions)
    print(f"✓ Exported {len(institutions)} institutions to {output_path}")
def export_to_json(institutions: List[Dict[str, str]], output_path: Path):
    """
    Export institutions to JSON format.

    Wraps the records in a metadata envelope (source, extraction
    timestamp, record count, provenance tier).

    Args:
        institutions: List of institution dictionaries
        output_path: Path to output JSON file
    """
    # Provenance metadata written alongside the records.
    metadata = {
        'source': 'Bulgarian National Library ISIL Registry',
        'source_url': 'http://www.nationallibrary.bg/wp/?page_id=5686',
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'total_institutions': len(institutions),
        'country': 'BG',
        'data_tier': 'TIER_1_AUTHORITATIVE',
        'maintainer': 'National Library "St. Cyril and St. Methodius"',
        'maintainer_isil': 'BG-2200000',
    }
    payload = {'metadata': metadata, 'institutions': institutions}
    with open(output_path, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
    print(f"✓ Exported {len(institutions)} institutions to {output_path}")
def analyze_institution_types(institutions: List[Dict[str, str]]):
    """
    Analyze and print the distribution of institution types.

    Records without a 'library_type' field are counted as 'Unknown'.
    Output is sorted by descending count (ties keep first-seen order).

    Args:
        institutions: List of institution dictionaries
    """
    # Counter replaces the manual dict.get(...) + 1 counting loop.
    type_counts = Counter(
        inst.get('library_type', 'Unknown') for inst in institutions
    )
    print("\n=== Institution Type Distribution ===")
    for lib_type, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {lib_type}: {count}")
def main(
    html_path: Path = Path('/tmp/bulgarian_isil.html'),
    output_dir: Path = Path('/Users/kempersc/apps/glam/data/isil'),
):
    """
    Main execution function: parse the saved registry HTML and export it.

    Paths are parameters (with the original locations as defaults) so the
    script can be reused against other copies of the page without editing
    hard-coded absolute paths.

    Args:
        html_path: Previously saved copy of the registry HTML page
        output_dir: Directory receiving the CSV and JSON exports
    """
    if not html_path.exists():
        print("Error: HTML file not found. Please save the HTML content first.")
        return
    html_content = html_path.read_text(encoding='utf-8')

    print("Parsing Bulgarian ISIL registry...")
    institutions = parse_bulgarian_isil_html(html_content)
    print(f"✓ Extracted {len(institutions)} institutions")

    # Analyze types
    analyze_institution_types(institutions)

    # Export to CSV and JSON
    export_to_csv(institutions, output_dir / 'bulgarian_isil_registry.csv')
    export_to_json(institutions, output_dir / 'bulgarian_isil_registry.json')

    # Print sample records
    print("\n=== Sample Records ===")
    for i, inst in enumerate(institutions[:3], start=1):
        print(f"\n{i}. {inst.get('name_bg', 'N/A')}")
        print(f" ISIL: {inst.get('isil', 'N/A')}")
        print(f" Type: {inst.get('library_type', 'N/A')}")
        print(f" City: {inst.get('address', 'N/A')[:50]}...")


if __name__ == '__main__':
    main()