312 lines
9.4 KiB
Python
Executable file
312 lines
9.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Swiss ISIL Database Harvester
|
|
|
|
This script harvests all Swiss and Liechtenstein ISIL records by scraping
|
|
the Swiss National Library's ISIL directory web interface.
|
|
|
|
Source: https://www.isil.nb.admin.ch/en/
|
|
Records: ~2,379 institutions (Switzerland + Liechtenstein)
|
|
|
|
Author: OpenCode + MCP Tools
|
|
Date: 2025-11-19
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional
|
|
from datetime import datetime
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Configuration
# Landing page of the Swiss National Library's ISIL directory (English UI).
BASE_URL = "https://www.isil.nb.admin.ch/en/"
# NOTE(review): hard-coded absolute user path — consider making this
# configurable (env var / CLI arg) before running on another machine.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
RESULTS_PER_PAGE = 25  # Default pagination (site's results-per-page; used to derive page count)
REQUEST_DELAY = 2.0  # Seconds between requests (be polite)
MAX_RETRIES = 3  # Per-page fetch attempts before giving up

# Create output directory (idempotent; parents created as needed)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def fetch_page(page_num: int) -> Optional[BeautifulSoup]:
    """
    Download and parse one listing page of the ISIL directory.

    Retries up to MAX_RETRIES times on network errors, sleeping
    REQUEST_DELAY * (attempts so far) between tries (linear backoff).

    Args:
        page_num: Page number (1-indexed)

    Returns:
        BeautifulSoup object or None on error
    """
    url = f"{BASE_URL}?page={page_num}"

    attempt = 0
    while attempt < MAX_RETRIES:
        print(f"Fetching page {page_num}...", end=' ')
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as exc:
            print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed: {exc}")
            attempt += 1
            if attempt >= MAX_RETRIES:
                return None
            # Back off a little longer after each failure.
            time.sleep(REQUEST_DELAY * attempt)
        else:
            parsed = BeautifulSoup(response.content, 'html.parser')
            print("OK")
            return parsed

    return None
|
|
|
|
|
|
def extract_institution_links(soup: BeautifulSoup) -> List[str]:
    """
    Extract institution detail page URLs from a listing page.

    An anchor is kept when its href contains '/institutions/' and ends
    with '/'; relative hrefs are made absolute against the site host.
    Duplicates are removed while preserving first-seen document order
    (the previous ``list(set(...))`` dedup returned the links in a
    nondeterministic order, which made harvest runs non-reproducible).

    Args:
        soup: BeautifulSoup object of the listing page

    Returns:
        List of unique institution detail URLs, in document order
    """
    links = []

    # Find all institution links
    # Pattern: /en/institutions/[slug]/
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/institutions/' in href and href.endswith('/'):
            if not href.startswith('http'):
                href = f"https://www.isil.nb.admin.ch{href}"
            links.append(href)

    # dict.fromkeys de-duplicates while keeping insertion order,
    # unlike set(), whose iteration order is arbitrary.
    return list(dict.fromkeys(links))
|
|
|
|
|
|
def fetch_institution_detail(url: str) -> Optional[Dict]:
    """
    Fetch detailed information for a single institution.

    Scrapes the detail page with heuristics: the ISIL code is parsed out
    of the URL slug, the name from the first heading, and type/status
    from keyword matches in the page text.

    Args:
        url: Institution detail page URL

    Returns:
        Dictionary with institution data or None on error
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Initialize record with every field present so downstream
        # consumers can rely on a fixed schema.
        record = {
            'source_url': url,
            'isil': None,
            'name': None,
            'alternative_names': [],
            'institution_type': None,
            'address': {},
            'contact': {},
            'urls': [],
            'parent_org': None,
            'status': 'active',  # default; flipped below on keyword match
            'notes': None
        }

        # Extract ISIL code from URL (e.g., /institutions/name-ch-123456-x/)
        # The URL is lowercased first, so the pattern matches 'ch-' and a
        # digit-or-'x' check character; the captured code is re-uppercased.
        isil_match = re.search(r'-(ch-\d{6}-[\dx])', url.lower())
        if isil_match:
            record['isil'] = isil_match.group(1).upper()

        # Extract name: first h1/h2/h3 on the page, whichever comes first.
        # NOTE(review): assumes the institution name is the first heading —
        # a site banner heading would be picked up instead; verify on live pages.
        name_elem = soup.find(['h1', 'h2', 'h3'])
        if name_elem:
            record['name'] = name_elem.get_text(strip=True)

        # Extract description/notes: first div whose class matches any of
        # 'description', 'content', or 'detail'.
        description = soup.find('div', class_=re.compile('description|content|detail'))
        if description:
            record['notes'] = description.get_text(strip=True)

        # Extract metadata (region, canton, type, etc.)
        # Look for metadata sections. Heuristic keyword scan over all
        # span/div/p elements; later matches overwrite earlier ones, so the
        # LAST matching element wins — large wrapper divs containing a
        # keyword can clobber a more specific value. TODO(review): narrow
        # the element selection once the page structure is confirmed.
        for elem in soup.find_all(['span', 'div', 'p']):
            text = elem.get_text(strip=True)

            # Check for institution type (English + German keywords)
            if any(keyword in text.lower() for keyword in [
                'library', 'archive', 'museum', 'documentation',
                'bibliothek', 'archiv', 'dokumentation'
            ]):
                record['institution_type'] = text

            # Check for status ('nicht mehr' ~ German "no longer")
            if 'inactive' in text.lower() or 'nicht mehr' in text.lower():
                record['status'] = 'inactive'

        return record

    # NOTE(review): broad catch — parse errors are silently reported and
    # skipped along with network errors; the harvest continues either way.
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None
|
|
|
|
|
|
def get_total_pages() -> int:
    """
    Get the total number of pages by fetching the first page.

    Tries, in order:
      1. The "<N> Search results" banner, converted to a page count
         via ceiling division by RESULTS_PER_PAGE.
      2. The highest page number appearing in pagination links.
      3. Falls back to 1 (assume a single page).

    Returns:
        Total number of pages, or 0 if the first page could not be fetched
    """
    soup = fetch_page(1)
    if not soup:
        return 0

    # Look for total results count
    # Pattern: "2379 Search results"
    # bs4 deprecated the `text=` keyword in favour of `string=`.
    results_text = soup.find(string=re.compile(r'\d+\s+Search results'))
    if results_text:
        match = re.search(r'(\d+)\s+Search results', str(results_text))
        if match:
            total_results = int(match.group(1))
            # Ceiling division: round partial pages up.
            total_pages = (total_results + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE
            print(f"Found {total_results} total results across {total_pages} pages")
            return total_pages

    # Fallback: take the maximum page number from pagination links.
    # (The href regex filter already guarantees 'page=' is present.)
    pagination = soup.find_all('a', href=re.compile(r'\?page=\d+'))
    if pagination:
        page_nums = []
        for link in pagination:
            match = re.search(r'page=(\d+)', link['href'])
            if match:
                page_nums.append(int(match.group(1)))
        return max(page_nums) if page_nums else 1

    return 1
|
|
|
|
|
|
def harvest_all_institutions() -> List[Dict]:
    """
    Harvest all ISIL records from the Swiss directory.

    Two passes: (1) walk every listing page collecting detail-page URLs,
    (2) fetch each detail page. REQUEST_DELAY is honoured between all
    requests to stay polite to the server.

    Returns:
        List of institution records
    """
    print("=" * 60)
    print("Swiss ISIL Database Harvester")
    print("=" * 60)
    print()

    total_pages = get_total_pages()
    print(f"Total pages to harvest: {total_pages}")
    print()

    all_institution_urls = []

    # Step 1: Collect all institution URLs from listing pages
    print("Step 1: Collecting institution URLs from listing pages...")
    for page_num in range(1, total_pages + 1):
        soup = fetch_page(page_num)
        if soup:
            links = extract_institution_links(soup)
            all_institution_urls.extend(links)
            print(f"  Found {len(links)} institutions on page {page_num}")

        time.sleep(REQUEST_DELAY)

    # De-duplication inside extract_institution_links is per page only, so
    # the same URL can appear on several pages. Dedupe globally here
    # (order-preserving) so step 2 doesn't fetch duplicates — previously
    # the "unique" claim below was not actually guaranteed.
    all_institution_urls = list(dict.fromkeys(all_institution_urls))

    print(f"\nCollected {len(all_institution_urls)} unique institution URLs")
    print()

    # Step 2: Fetch detailed information for each institution
    print("Step 2: Fetching detailed information...")
    institutions = []

    for idx, url in enumerate(all_institution_urls, 1):
        print(f"[{idx}/{len(all_institution_urls)}] ", end='')
        record = fetch_institution_detail(url)
        if record:
            institutions.append(record)

        time.sleep(REQUEST_DELAY)

    print()
    print(f"Successfully harvested {len(institutions)} institutions")

    return institutions
|
|
|
|
|
|
def save_results(institutions: List[Dict], timestamp: str):
    """
    Persist harvested records as JSON, JSONL, and a statistics file.

    Args:
        institutions: List of institution records
        timestamp: Timestamp string for filenames
    """
    # Full dataset as one pretty-printed JSON array.
    json_path = OUTPUT_DIR / f"swiss_isil_complete_{timestamp}.json"
    with open(json_path, 'w', encoding='utf-8') as handle:
        json.dump(institutions, handle, indent=2, ensure_ascii=False)
    print(f"\nSaved JSON: {json_path}")

    # Same dataset as JSONL: one compact JSON object per line.
    jsonl_path = OUTPUT_DIR / f"swiss_isil_complete_{timestamp}.jsonl"
    with open(jsonl_path, 'w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in institutions
        )
    print(f"Saved JSONL: {jsonl_path}")

    # Small summary file: counts by status plus provenance.
    stats = {
        'total_institutions': len(institutions),
        'active_institutions': sum(
            record.get('status') == 'active' for record in institutions
        ),
        'inactive_institutions': sum(
            record.get('status') == 'inactive' for record in institutions
        ),
        'harvest_date': datetime.now().isoformat(),
        'source': 'https://www.isil.nb.admin.ch/en/'
    }

    stats_path = OUTPUT_DIR / f"swiss_isil_stats_{timestamp}.json"
    with open(stats_path, 'w', encoding='utf-8') as handle:
        json.dump(stats, handle, indent=2)
    print(f"Saved stats: {stats_path}")
|
|
|
|
|
|
def main():
    """Entry point: run the harvest, save the output, print a summary."""
    started = time.time()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    try:
        # Harvest, persist, then summarize.
        institutions = harvest_all_institutions()
        save_results(institutions, timestamp)

        elapsed = time.time() - started
        summary = [
            "",
            "=" * 60,
            "Harvest Complete!",
            "=" * 60,
            f"Total institutions: {len(institutions)}",
            f"Time elapsed: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)",
            f"Output directory: {OUTPUT_DIR}",
        ]
        for line in summary:
            print(line)

    except KeyboardInterrupt:
        # Ctrl-C is an expected way to stop a long harvest early.
        print("\n\nHarvest interrupted by user")
    except Exception as e:
        # Report, then re-raise so the process exits non-zero.
        print(f"\n\nError during harvest: {e}")
        raise
|
|
|
|
|
|
# Script entry point: run the harvest only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|