#!/usr/bin/env python3
"""
Bulgarian ISIL Registry Scraper

Extracts heritage institution data from the Bulgarian National Library's
ISIL registry.

Source: http://www.nationallibrary.bg/wp/?page_id=5686
Author: GLAM Data Extraction Project
Date: 2025-11-18
"""

import csv
import json
import re
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any

from bs4 import BeautifulSoup

# Map Bulgarian field labels to English keys.
#
# NOTE: The source HTML has inconsistent spacing (typos):
#   - 24 tables: "Наименование на организацията" (correct)
#   - 70 tables: "Наименование наорганизацията" (typo - no space)
# We handle BOTH variants to capture all 94 institutions.
#
# Hoisted to module level so it is built once, not once per table row.
FIELD_MAPPING = {
    'ISIL': 'isil',
    # Name fields (with and without space typos)
    'Наименование на организацията': 'name_bg',
    'Наименование наорганизацията': 'name_bg',  # Typo variant
    'Наименование на английски език': 'name_en',
    'Наименование наанглийски език': 'name_en',  # Typo variant
    # Name variants (with typo)
    'Варианти на името': 'name_variants',
    'Варианти наимето': 'name_variants',  # Typo variant
    # Other fields
    'Тип': 'library_type',
    'Адрес': 'address',
    'Телефон/факс': 'phone_fax',
    'Е-mail': 'email',  # NB: Cyrillic 'Е' — matches the source HTML exactly
    'Уебсайт': 'website',
    'Е-каталог': 'online_catalog',
    'Достъпност': 'accessibility',
    'Работно време': 'opening_hours',
    'Колекции': 'collections',
    'Обем на фонда': 'collection_size',
    # Interlibrary loan (with typo)
    'Междубиблиотечно заемане': 'interlibrary_loan',
    'Междубиблиотечнозаемане': 'interlibrary_loan',  # Typo variant
}

# Column order for CSV export (also the full set of mapped keys above).
CSV_FIELDNAMES = [
    'isil', 'name_bg', 'name_en', 'name_variants', 'library_type',
    'address', 'phone_fax', 'email', 'website', 'online_catalog',
    'accessibility', 'opening_hours', 'collections', 'collection_size',
    'interlibrary_loan',
]


def extract_table_data(table) -> Dict[str, str]:
    """
    Extract data from a single HTML table representing one institution.

    Each row of the table is expected to be a two-cell (label, value) pair;
    rows with any other shape are skipped. Labels are matched against
    FIELD_MAPPING (after stripping the '**' markup the source uses);
    unmapped labels are ignored.

    Args:
        table: BeautifulSoup table element

    Returns:
        Dictionary with institution data (English keys, stripped values)
    """
    data: Dict[str, str] = {}

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 2:
            continue  # not a (label, value) row

        field_name = cells[0].get_text(strip=True)
        field_value = cells[1].get_text(strip=True)

        # Clean up field names ('**' emphasis markers appear in the source)
        field_name_cleaned = field_name.replace('**', '').strip()

        english_key = FIELD_MAPPING.get(field_name_cleaned)
        if english_key:
            if english_key == 'email':
                # Clean up email addresses (remove mailto:)
                field_value = field_value.replace('mailto:', '')
            data[english_key] = field_value

    return data


def parse_bulgarian_isil_html(html_content: str) -> List[Dict[str, str]]:
    """
    Parse the Bulgarian ISIL registry HTML page.

    Args:
        html_content: Raw HTML content from the registry page

    Returns:
        List of dictionaries, each representing an institution. Records
        without an ISIL code are dropped (they are layout tables, not
        institution entries).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # The registry renders one table per institution inside the post body.
    content_div = soup.find('div', class_='post-content')
    if not content_div:
        print("Warning: Could not find content div")
        return []

    tables = content_div.find_all('table')

    institutions: List[Dict[str, str]] = []
    for i, table in enumerate(tables):
        try:
            institution_data = extract_table_data(table)
            # Only add if we have an ISIL code (validates it's a real
            # institution record)
            if institution_data.get('isil'):
                institutions.append(institution_data)
        except Exception as e:
            # Best-effort scrape: a single malformed table must not abort
            # the whole run.
            print(f"Warning: Error parsing table {i+1}: {e}")
            continue

    return institutions


def export_to_csv(institutions: List[Dict[str, str]], output_path: Path):
    """
    Export institutions to CSV format.

    Args:
        institutions: List of institution dictionaries
        output_path: Path to output CSV file (parent dirs created as needed)
    """
    if not institutions:
        print("No institutions to export")
        return

    # Ensure the target directory exists before opening the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(institutions)

    print(f"✓ Exported {len(institutions)} institutions to {output_path}")


def export_to_json(institutions: List[Dict[str, str]], output_path: Path):
    """
    Export institutions to JSON format, wrapped with provenance metadata.

    Args:
        institutions: List of institution dictionaries
        output_path: Path to output JSON file (parent dirs created as needed)
    """
    output_data = {
        'metadata': {
            'source': 'Bulgarian National Library ISIL Registry',
            'source_url': 'http://www.nationallibrary.bg/wp/?page_id=5686',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'total_institutions': len(institutions),
            'country': 'BG',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'maintainer': 'National Library "St. Cyril and St. Methodius"',
            'maintainer_isil': 'BG-2200000',
        },
        'institutions': institutions,
    }

    # Ensure the target directory exists before opening the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Cyrillic text readable in the file.
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"✓ Exported {len(institutions)} institutions to {output_path}")


def analyze_institution_types(institutions: List[Dict[str, str]]):
    """
    Analyze and print the count of each institution type, most common first.

    Args:
        institutions: List of institution dictionaries
    """
    type_counts = Counter(
        inst.get('library_type', 'Unknown') for inst in institutions
    )

    print("\n=== Institution Type Distribution ===")
    # most_common() sorts by count descending; ties keep insertion order,
    # matching the previous sorted(..., key=lambda x: -x[1]) behavior.
    for lib_type, count in type_counts.most_common():
        print(f"  {lib_type}: {count}")


def main():
    """Main execution function."""
    # Read the HTML file (saved manually beforehand — no network access here).
    html_path = Path('/tmp/bulgarian_isil.html')
    if not html_path.exists():
        print("Error: HTML file not found. Please save the HTML content first.")
        return

    html_content = html_path.read_text(encoding='utf-8')

    print("Parsing Bulgarian ISIL registry...")
    institutions = parse_bulgarian_isil_html(html_content)
    print(f"✓ Extracted {len(institutions)} institutions")

    # Analyze types
    analyze_institution_types(institutions)

    # Export to CSV
    csv_path = Path('/Users/kempersc/apps/glam/data/isil/bulgarian_isil_registry.csv')
    export_to_csv(institutions, csv_path)

    # Export to JSON
    json_path = Path('/Users/kempersc/apps/glam/data/isil/bulgarian_isil_registry.json')
    export_to_json(institutions, json_path)

    # Print sample records
    print("\n=== Sample Records ===")
    for i, inst in enumerate(institutions[:3]):
        print(f"\n{i+1}. {inst.get('name_bg', 'N/A')}")
        print(f"   ISIL: {inst.get('isil', 'N/A')}")
        print(f"   Type: {inst.get('library_type', 'N/A')}")
        print(f"   City: {inst.get('address', 'N/A')[:50]}...")


if __name__ == '__main__':
    main()