#!/usr/bin/env python3
"""
Bulgarian ISIL Registry Scraper

Extracts heritage institution data from the Bulgarian National Library's ISIL registry.
Source: http://www.nationallibrary.bg/wp/?page_id=5686

Author: GLAM Data Extraction Project
Date: 2025-11-18
"""
||
import csv
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List

from bs4 import BeautifulSoup
def extract_table_data(table) -> Dict[str, str]:
    """Extract one institution's fields from a single HTML table.

    Each registry table represents one institution and consists of
    rows holding (Bulgarian field label, value) cell pairs.

    Args:
        table: BeautifulSoup ``<table>`` element for one institution.

    Returns:
        Dictionary mapping English keys (e.g. ``isil``, ``name_bg``)
        to string values. Rows with unrecognized labels, or without
        exactly two cells, are skipped.
    """
    # Map Bulgarian field labels to English keys.
    # NOTE: The source HTML has inconsistent spacing (typos):
    #   - 24 tables: "Наименование на организацията" (correct)
    #   - 70 tables: "Наименование наорганизацията" (typo - no space)
    # We handle BOTH variants to capture all 94 institutions.
    # Hoisted out of the row loop — the mapping is loop-invariant and
    # was previously rebuilt on every iteration.
    field_mapping = {
        'ISIL': 'isil',
        # Name fields (with and without space typos)
        'Наименование на организацията': 'name_bg',
        'Наименование наорганизацията': 'name_bg',  # Typo variant
        'Наименование на английски език': 'name_en',
        'Наименование наанглийски език': 'name_en',  # Typo variant
        # Name variants (with typo)
        'Варианти на името': 'name_variants',
        'Варианти наимето': 'name_variants',  # Typo variant
        # Other fields
        'Тип': 'library_type',
        'Адрес': 'address',
        'Телефон/факс': 'phone_fax',
        'Е-mail': 'email',
        'Уебсайт': 'website',
        'Е-каталог': 'online_catalog',
        'Достъпност': 'accessibility',
        'Работно време': 'opening_hours',
        'Колекции': 'collections',
        'Обем на фонда': 'collection_size',
        # Interlibrary loan (with typo)
        'Междубиблиотечно заемане': 'interlibrary_loan',
        'Междубиблиотечнозаемане': 'interlibrary_loan',  # Typo variant
    }

    data: Dict[str, str] = {}

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Only well-formed (label, value) pairs are data rows.
        if len(cells) != 2:
            continue

        field_name = cells[0].get_text(strip=True)
        field_value = cells[1].get_text(strip=True)

        # Labels are sometimes wrapped in '**' markers in the source.
        field_name_cleaned = field_name.replace('**', '').strip()

        english_key = field_mapping.get(field_name_cleaned)
        if english_key is None:
            continue

        # Clean up email addresses (remove any scraped 'mailto:' prefix).
        if english_key == 'email':
            field_value = field_value.replace('mailto:', '')

        data[english_key] = field_value

    return data
def parse_bulgarian_isil_html(html_content: str) -> List[Dict[str, str]]:
    """Parse the Bulgarian ISIL registry HTML page.

    Args:
        html_content: Raw HTML content from the registry page.

    Returns:
        One dictionary per institution that carries an ISIL code.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # The registry renders one <table> per institution inside the
    # post-content div.
    content_div = soup.find('div', class_='post-content')
    if content_div is None:
        print("Warning: Could not find content div")
        return []

    institutions: List[Dict[str, str]] = []

    for index, table in enumerate(content_div.find_all('table'), start=1):
        try:
            record = extract_table_data(table)
        except Exception as exc:
            print(f"Warning: Error parsing table {index}: {exc}")
            continue

        # A record without an ISIL code is not a real institution entry.
        if record.get('isil'):
            institutions.append(record)

    return institutions
def export_to_csv(institutions: List[Dict[str, str]], output_path: Path):
    """Export institutions to CSV format.

    Args:
        institutions: List of institution dictionaries (keys as
            produced by the table extractor).
        output_path: Path to the output CSV file; missing parent
            directories are created.
    """
    if not institutions:
        print("No institutions to export")
        return

    # Column order for the output file.
    fieldnames = [
        'isil',
        'name_bg',
        'name_en',
        'name_variants',
        'library_type',
        'address',
        'phone_fax',
        'email',
        'website',
        'online_catalog',
        'accessibility',
        'opening_hours',
        'collections',
        'collection_size',
        'interlibrary_loan',
    ]

    # Ensure the target directory exists so open() cannot fail on a
    # missing path (the caller uses a hard-coded output location).
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        # extrasaction='ignore' keeps the export robust if a record
        # ever carries a key outside the declared columns (DictWriter
        # raises ValueError by default).
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(institutions)

    print(f"✓ Exported {len(institutions)} institutions to {output_path}")
def export_to_json(institutions: List[Dict[str, str]], output_path: Path):
    """Export institutions to JSON format with extraction metadata.

    Args:
        institutions: List of institution dictionaries.
        output_path: Path to the output JSON file; missing parent
            directories are created.
    """
    # Wrap the records with provenance metadata for downstream use.
    output_data = {
        'metadata': {
            'source': 'Bulgarian National Library ISIL Registry',
            'source_url': 'http://www.nationallibrary.bg/wp/?page_id=5686',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'total_institutions': len(institutions),
            'country': 'BG',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'maintainer': 'National Library "St. Cyril and St. Methodius"',
            'maintainer_isil': 'BG-2200000'
        },
        'institutions': institutions
    }

    # Ensure the target directory exists so open() cannot fail on a
    # missing path (the caller uses a hard-coded output location).
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps Cyrillic text readable in the file.
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"✓ Exported {len(institutions)} institutions to {output_path}")
def analyze_institution_types(institutions: List[Dict[str, str]]):
    """Print a count of institutions per library type, most common first.

    Args:
        institutions: List of institution dictionaries; records without
            a 'library_type' key are counted as 'Unknown'.
    """
    # Local import keeps the module's top-level imports unchanged.
    from collections import Counter

    # Counter replaces the hand-rolled dict-increment loop.
    type_counts = Counter(
        inst.get('library_type', 'Unknown') for inst in institutions
    )

    print("\n=== Institution Type Distribution ===")
    # most_common() sorts by descending count with first-seen order
    # preserved for ties — same order as the previous manual sort.
    for lib_type, count in type_counts.most_common():
        print(f" {lib_type}: {count}")
def main():
    """Run the full scrape: parse the saved HTML, analyze, and export."""

    # The page HTML must already be saved locally before running.
    html_path = Path('/tmp/bulgarian_isil.html')
    if not html_path.exists():
        print("Error: HTML file not found. Please save the HTML content first.")
        return

    html_content = html_path.read_text(encoding='utf-8')

    print("Parsing Bulgarian ISIL registry...")
    institutions = parse_bulgarian_isil_html(html_content)
    print(f"✓ Extracted {len(institutions)} institutions")

    # Summarize institution types.
    analyze_institution_types(institutions)

    # Write both export formats next to each other.
    out_dir = Path('/Users/kempersc/apps/glam/data/isil')
    export_to_csv(institutions, out_dir / 'bulgarian_isil_registry.csv')
    export_to_json(institutions, out_dir / 'bulgarian_isil_registry.json')

    # Show the first few records as a sanity check.
    print("\n=== Sample Records ===")
    for number, inst in enumerate(institutions[:3], start=1):
        print(f"\n{number}. {inst.get('name_bg', 'N/A')}")
        print(f" ISIL: {inst.get('isil', 'N/A')}")
        print(f" Type: {inst.get('library_type', 'N/A')}")
        print(f" City: {inst.get('address', 'N/A')[:50]}...")


if __name__ == '__main__':
    main()
|