#!/usr/bin/env python3
"""
Bulgarian ISIL Registry Scraper

Extracts heritage institution data from the Bulgarian National Library's
ISIL registry.

Source: http://www.nationallibrary.bg/wp/?page_id=5686
Author: GLAM Data Extraction Project
Date: 2025-11-18
"""

import csv
import json
import re
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any

from bs4 import BeautifulSoup

# Map Bulgarian field labels to English keys.
#
# NOTE: The source HTML has inconsistent spacing (typos):
#   - 24 tables: "Наименование на организацията" (correct)
#   - 70 tables: "Наименование наорганизацията" (typo - no space)
# We handle BOTH variants to capture all 94 institutions.
#
# Hoisted to module level so it is built once, not once per table row.
FIELD_MAPPING = {
    'ISIL': 'isil',
    # Name fields (with and without space typos)
    'Наименование на организацията': 'name_bg',
    'Наименование наорганизацията': 'name_bg',  # Typo variant
    'Наименование на английски език': 'name_en',
    'Наименование наанглийски език': 'name_en',  # Typo variant
    # Name variants (with typo)
    'Варианти на името': 'name_variants',
    'Варианти наимето': 'name_variants',  # Typo variant
    # Other fields
    'Тип': 'library_type',
    'Адрес': 'address',
    'Телефон/факс': 'phone_fax',
    'Е-mail': 'email',  # NB: Cyrillic 'Е' — matches the source HTML exactly
    'Уебсайт': 'website',
    'Е-каталог': 'online_catalog',
    'Достъпност': 'accessibility',
    'Работно време': 'opening_hours',
    'Колекции': 'collections',
    'Обем на фонда': 'collection_size',
    # Interlibrary loan (with typo)
    'Междубиблиотечно заемане': 'interlibrary_loan',
    'Междубиблиотечнозаемане': 'interlibrary_loan',  # Typo variant
}

# Column order for CSV export (also the full set of mapped keys above).
CSV_FIELDNAMES = [
    'isil', 'name_bg', 'name_en', 'name_variants', 'library_type',
    'address', 'phone_fax', 'email', 'website', 'online_catalog',
    'accessibility', 'opening_hours', 'collections', 'collection_size',
    'interlibrary_loan',
]


def extract_table_data(table) -> Dict[str, str]:
    """
    Extract data from a single HTML table representing one institution.

    Each row of the table is expected to be a two-cell (label, value) pair;
    rows with any other shape are skipped. Labels are matched against
    FIELD_MAPPING (after stripping the '**' markup the source uses);
    unmapped labels are ignored.

    Args:
        table: BeautifulSoup table element

    Returns:
        Dictionary with institution data (English keys, stripped values)
    """
    data: Dict[str, str] = {}

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 2:
            continue  # not a (label, value) row

        field_name = cells[0].get_text(strip=True)
        field_value = cells[1].get_text(strip=True)

        # Clean up field names ('**' emphasis markers appear in the source)
        field_name_cleaned = field_name.replace('**', '').strip()

        english_key = FIELD_MAPPING.get(field_name_cleaned)
        if english_key:
            if english_key == 'email':
                # Clean up email addresses (remove mailto:)
                field_value = field_value.replace('mailto:', '')
            data[english_key] = field_value

    return data


def parse_bulgarian_isil_html(html_content: str) -> List[Dict[str, str]]:
    """
    Parse the Bulgarian ISIL registry HTML page.

    Args:
        html_content: Raw HTML content from the registry page

    Returns:
        List of dictionaries, each representing an institution. Records
        without an ISIL code are dropped (they are layout tables, not
        institution entries).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # The registry renders one table per institution inside the post body.
    content_div = soup.find('div', class_='post-content')
    if not content_div:
        print("Warning: Could not find content div")
        return []

    tables = content_div.find_all('table')

    institutions: List[Dict[str, str]] = []
    for i, table in enumerate(tables):
        try:
            institution_data = extract_table_data(table)
            # Only add if we have an ISIL code (validates it's a real
            # institution record)
            if institution_data.get('isil'):
                institutions.append(institution_data)
        except Exception as e:
            # Best-effort scrape: a single malformed table must not abort
            # the whole run.
            print(f"Warning: Error parsing table {i+1}: {e}")
            continue

    return institutions


def export_to_csv(institutions: List[Dict[str, str]], output_path: Path):
    """
    Export institutions to CSV format.

    Args:
        institutions: List of institution dictionaries
        output_path: Path to output CSV file (parent dirs created as needed)
    """
    if not institutions:
        print("No institutions to export")
        return

    # Ensure the target directory exists before opening the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(institutions)

    print(f"✓ Exported {len(institutions)} institutions to {output_path}")


def export_to_json(institutions: List[Dict[str, str]], output_path: Path):
    """
    Export institutions to JSON format, wrapped with provenance metadata.

    Args:
        institutions: List of institution dictionaries
        output_path: Path to output JSON file (parent dirs created as needed)
    """
    output_data = {
        'metadata': {
            'source': 'Bulgarian National Library ISIL Registry',
            'source_url': 'http://www.nationallibrary.bg/wp/?page_id=5686',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'total_institutions': len(institutions),
            'country': 'BG',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'maintainer': 'National Library "St. Cyril and St. Methodius"',
            'maintainer_isil': 'BG-2200000',
        },
        'institutions': institutions,
    }

    # Ensure the target directory exists before opening the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Cyrillic text readable in the file.
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"✓ Exported {len(institutions)} institutions to {output_path}")


def analyze_institution_types(institutions: List[Dict[str, str]]):
    """
    Analyze and print the count of each institution type, most common first.

    Args:
        institutions: List of institution dictionaries
    """
    type_counts = Counter(
        inst.get('library_type', 'Unknown') for inst in institutions
    )

    print("\n=== Institution Type Distribution ===")
    # most_common() sorts by count descending; ties keep insertion order,
    # matching the previous sorted(..., key=lambda x: -x[1]) behavior.
    for lib_type, count in type_counts.most_common():
        print(f"  {lib_type}: {count}")


def main():
    """Main execution function."""
    # Read the HTML file (saved manually beforehand — no network access here).
    html_path = Path('/tmp/bulgarian_isil.html')
    if not html_path.exists():
        print("Error: HTML file not found. Please save the HTML content first.")
        return

    html_content = html_path.read_text(encoding='utf-8')

    print("Parsing Bulgarian ISIL registry...")
    institutions = parse_bulgarian_isil_html(html_content)
    print(f"✓ Extracted {len(institutions)} institutions")

    # Analyze types
    analyze_institution_types(institutions)

    # Export to CSV
    csv_path = Path('/Users/kempersc/apps/glam/data/isil/bulgarian_isil_registry.csv')
    export_to_csv(institutions, csv_path)

    # Export to JSON
    json_path = Path('/Users/kempersc/apps/glam/data/isil/bulgarian_isil_registry.json')
    export_to_json(institutions, json_path)

    # Print sample records
    print("\n=== Sample Records ===")
    for i, inst in enumerate(institutions[:3]):
        print(f"\n{i+1}. {inst.get('name_bg', 'N/A')}")
        print(f"   ISIL: {inst.get('isil', 'N/A')}")
        print(f"   Type: {inst.get('library_type', 'N/A')}")
        print(f"   City: {inst.get('address', 'N/A')[:50]}...")


if __name__ == '__main__':
    main()