#!/usr/bin/env python3
"""
Bulgarian ISIL Registry Scraper

Extracts heritage institution data from the Bulgarian National Library's ISIL registry.
Source: http://www.nationallibrary.bg/wp/?page_id=5686

Author: GLAM Data Extraction Project
Date: 2025-11-18
"""
||
import csv
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List

from bs4 import BeautifulSoup
def extract_table_data(table) -> Dict[str, str]:
    """Extract one institution's fields from a single HTML table.

    Each registry table represents one institution and consists of
    rows holding (Bulgarian field label, value) cell pairs.

    Args:
        table: BeautifulSoup ``<table>`` element for one institution.

    Returns:
        Dictionary mapping English keys (e.g. ``isil``, ``name_bg``)
        to string values. Rows with unrecognized labels, or without
        exactly two cells, are skipped.
    """
    # Map Bulgarian field labels to English keys.
    # NOTE: The source HTML has inconsistent spacing (typos):
    #   - 24 tables: "Наименование на организацията" (correct)
    #   - 70 tables: "Наименование наорганизацията" (typo - no space)
    # We handle BOTH variants to capture all 94 institutions.
    # Hoisted out of the row loop — the mapping is loop-invariant and
    # was previously rebuilt on every iteration.
    field_mapping = {
        'ISIL': 'isil',
        # Name fields (with and without space typos)
        'Наименование на организацията': 'name_bg',
        'Наименование наорганизацията': 'name_bg',  # Typo variant
        'Наименование на английски език': 'name_en',
        'Наименование наанглийски език': 'name_en',  # Typo variant
        # Name variants (with typo)
        'Варианти на името': 'name_variants',
        'Варианти наимето': 'name_variants',  # Typo variant
        # Other fields
        'Тип': 'library_type',
        'Адрес': 'address',
        'Телефон/факс': 'phone_fax',
        'Е-mail': 'email',
        'Уебсайт': 'website',
        'Е-каталог': 'online_catalog',
        'Достъпност': 'accessibility',
        'Работно време': 'opening_hours',
        'Колекции': 'collections',
        'Обем на фонда': 'collection_size',
        # Interlibrary loan (with typo)
        'Междубиблиотечно заемане': 'interlibrary_loan',
        'Междубиблиотечнозаемане': 'interlibrary_loan',  # Typo variant
    }

    data: Dict[str, str] = {}

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Only well-formed (label, value) pairs are data rows.
        if len(cells) != 2:
            continue

        field_name = cells[0].get_text(strip=True)
        field_value = cells[1].get_text(strip=True)

        # Labels are sometimes wrapped in '**' markers in the source.
        field_name_cleaned = field_name.replace('**', '').strip()

        english_key = field_mapping.get(field_name_cleaned)
        if english_key is None:
            continue

        # Clean up email addresses (remove any scraped 'mailto:' prefix).
        if english_key == 'email':
            field_value = field_value.replace('mailto:', '')

        data[english_key] = field_value

    return data
def parse_bulgarian_isil_html(html_content: str) -> List[Dict[str, str]]:
    """Parse the Bulgarian ISIL registry HTML page.

    Args:
        html_content: Raw HTML content from the registry page.

    Returns:
        One dictionary per institution that carries an ISIL code.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # The registry renders one <table> per institution inside the
    # post-content div.
    content_div = soup.find('div', class_='post-content')
    if content_div is None:
        print("Warning: Could not find content div")
        return []

    institutions: List[Dict[str, str]] = []

    for index, table in enumerate(content_div.find_all('table'), start=1):
        try:
            record = extract_table_data(table)
        except Exception as exc:
            print(f"Warning: Error parsing table {index}: {exc}")
            continue

        # A record without an ISIL code is not a real institution entry.
        if record.get('isil'):
            institutions.append(record)

    return institutions
def export_to_csv(institutions: List[Dict[str, str]], output_path: Path):
    """Export institutions to CSV format.

    Args:
        institutions: List of institution dictionaries (keys as
            produced by the table extractor).
        output_path: Path to the output CSV file; missing parent
            directories are created.
    """
    if not institutions:
        print("No institutions to export")
        return

    # Column order for the output file.
    fieldnames = [
        'isil',
        'name_bg',
        'name_en',
        'name_variants',
        'library_type',
        'address',
        'phone_fax',
        'email',
        'website',
        'online_catalog',
        'accessibility',
        'opening_hours',
        'collections',
        'collection_size',
        'interlibrary_loan',
    ]

    # Ensure the target directory exists so open() cannot fail on a
    # missing path (the caller uses a hard-coded output location).
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        # extrasaction='ignore' keeps the export robust if a record
        # ever carries a key outside the declared columns (DictWriter
        # raises ValueError by default).
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(institutions)

    print(f"✓ Exported {len(institutions)} institutions to {output_path}")
def export_to_json(institutions: List[Dict[str, str]], output_path: Path):
    """Export institutions to JSON format with extraction metadata.

    Args:
        institutions: List of institution dictionaries.
        output_path: Path to the output JSON file; missing parent
            directories are created.
    """
    # Wrap the records with provenance metadata for downstream use.
    output_data = {
        'metadata': {
            'source': 'Bulgarian National Library ISIL Registry',
            'source_url': 'http://www.nationallibrary.bg/wp/?page_id=5686',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'total_institutions': len(institutions),
            'country': 'BG',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'maintainer': 'National Library "St. Cyril and St. Methodius"',
            'maintainer_isil': 'BG-2200000'
        },
        'institutions': institutions
    }

    # Ensure the target directory exists so open() cannot fail on a
    # missing path (the caller uses a hard-coded output location).
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps Cyrillic text readable in the file.
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"✓ Exported {len(institutions)} institutions to {output_path}")
def analyze_institution_types(institutions: List[Dict[str, str]]):
    """Print a count of institutions per library type, most common first.

    Args:
        institutions: List of institution dictionaries; records without
            a 'library_type' key are counted as 'Unknown'.
    """
    # Local import keeps the module's top-level imports unchanged.
    from collections import Counter

    # Counter replaces the hand-rolled dict-increment loop.
    type_counts = Counter(
        inst.get('library_type', 'Unknown') for inst in institutions
    )

    print("\n=== Institution Type Distribution ===")
    # most_common() sorts by descending count with first-seen order
    # preserved for ties — same order as the previous manual sort.
    for lib_type, count in type_counts.most_common():
        print(f" {lib_type}: {count}")
def main():
    """Run the full scrape: parse the saved HTML, analyze, and export."""

    # The page HTML must already be saved locally before running.
    html_path = Path('/tmp/bulgarian_isil.html')
    if not html_path.exists():
        print("Error: HTML file not found. Please save the HTML content first.")
        return

    html_content = html_path.read_text(encoding='utf-8')

    print("Parsing Bulgarian ISIL registry...")
    institutions = parse_bulgarian_isil_html(html_content)
    print(f"✓ Extracted {len(institutions)} institutions")

    # Summarize institution types.
    analyze_institution_types(institutions)

    # Write both export formats next to each other.
    out_dir = Path('/Users/kempersc/apps/glam/data/isil')
    export_to_csv(institutions, out_dir / 'bulgarian_isil_registry.csv')
    export_to_json(institutions, out_dir / 'bulgarian_isil_registry.json')

    # Show the first few records as a sanity check.
    print("\n=== Sample Records ===")
    for number, inst in enumerate(institutions[:3], start=1):
        print(f"\n{number}. {inst.get('name_bg', 'N/A')}")
        print(f" ISIL: {inst.get('isil', 'N/A')}")
        print(f" Type: {inst.get('library_type', 'N/A')}")
        print(f" City: {inst.get('address', 'N/A')[:50]}...")


if __name__ == '__main__':
    main()
|