# glam/scripts/scrapers/harvest_swiss_isil.py
# 2025-11-19 23:25:22 +01:00
# 312 lines · 9.4 KiB · Python · executable file
#!/usr/bin/env python3
"""
Swiss ISIL Database Harvester
This script harvests all Swiss and Liechtenstein ISIL records by scraping
the Swiss National Library's ISIL directory web interface.
Source: https://www.isil.nb.admin.ch/en/
Records: ~2,379 institutions (Switzerland + Liechtenstein)
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import time
import re
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
import requests
from bs4 import BeautifulSoup
# Configuration
BASE_URL = "https://www.isil.nb.admin.ch/en/"  # English entry point of the Swiss ISIL directory
# NOTE(review): hard-coded absolute path — only valid on the author's machine;
# consider deriving from an env var or CLI argument.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
RESULTS_PER_PAGE = 25 # Default pagination (used to compute total page count)
REQUEST_DELAY = 2.0 # Seconds between requests (be polite)
MAX_RETRIES = 3  # Attempts per page fetch before giving up
# Create output directory
# NOTE: runs at import time as a module side effect.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def fetch_page(page_num: int) -> Optional[BeautifulSoup]:
    """
    Fetch one listing page of ISIL records, retrying on network errors.

    Args:
        page_num: Page number (1-indexed)
    Returns:
        Parsed BeautifulSoup document, or None once all retries are exhausted
    """
    url = f"{BASE_URL}?page={page_num}"
    tries = 0
    while tries < MAX_RETRIES:
        print(f"Fetching page {page_num}...", end=' ')
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Attempt {tries + 1}/{MAX_RETRIES} failed: {e}")
            tries += 1
            if tries < MAX_RETRIES:
                # Linear backoff: wait a little longer after each failure.
                time.sleep(REQUEST_DELAY * tries)
            continue
        parsed = BeautifulSoup(response.content, 'html.parser')
        print("OK")
        return parsed
    return None
def extract_institution_links(soup: "BeautifulSoup") -> List[str]:
    """
    Extract institution detail page URLs from a listing page.

    Args:
        soup: BeautifulSoup object of the listing page (anything providing
            ``find_all`` works)
    Returns:
        List of absolute institution detail URLs, de-duplicated while
        preserving first-seen page order

    BUGFIX: the previous ``list(set(links))`` returned the URLs in a
    nondeterministic order, which made harvest runs non-reproducible;
    ``dict.fromkeys`` de-duplicates while keeping insertion order.
    (The annotation is a forward-ref string so importing this module does not
    require bs4 to be loaded first.)
    """
    links = []
    # Institution detail pages match the pattern /en/institutions/[slug]/
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/institutions/' in href and href.endswith('/'):
            # Make relative links absolute.
            if not href.startswith('http'):
                href = f"https://www.isil.nb.admin.ch{href}"
            links.append(href)
    # Ordered de-duplication (dicts preserve insertion order in Python 3.7+).
    return list(dict.fromkeys(links))
def fetch_institution_detail(url: str) -> Optional[Dict]:
    """
    Fetch and parse the detail page for a single institution.

    Args:
        url: Institution detail page URL
    Returns:
        Dictionary with institution data, or None if the request or
        parsing fails
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Skeleton record; fields not found on the page keep these defaults.
        record: Dict = {
            'source_url': url,
            'isil': None,
            'name': None,
            'alternative_names': [],
            'institution_type': None,
            'address': {},
            'contact': {},
            'urls': [],
            'parent_org': None,
            'status': 'active',
            'notes': None,
        }

        # The ISIL code is embedded in the URL slug,
        # e.g. /institutions/name-ch-123456-x/
        isil_match = re.search(r'-(ch-\d{6}-[\dx])', url.lower())
        if isil_match is not None:
            record['isil'] = isil_match.group(1).upper()

        # The first heading on the page is taken as the institution name.
        heading = soup.find(['h1', 'h2', 'h3'])
        if heading is not None:
            record['name'] = heading.get_text(strip=True)

        # Free-text description block, if the page has one.
        blurb = soup.find('div', class_=re.compile('description|content|detail'))
        if blurb is not None:
            record['notes'] = blurb.get_text(strip=True)

        # Scan generic text elements for institution-type and status hints
        # (English and German keywords).
        type_keywords = (
            'library', 'archive', 'museum', 'documentation',
            'bibliothek', 'archiv', 'dokumentation',
        )
        for elem in soup.find_all(['span', 'div', 'p']):
            text = elem.get_text(strip=True)
            lowered = text.lower()
            # NOTE: a later matching element overwrites an earlier one,
            # following the page's reading order.
            if any(keyword in lowered for keyword in type_keywords):
                record['institution_type'] = text
            if 'inactive' in lowered or 'nicht mehr' in lowered:
                record['status'] = 'inactive'

        return record
    except Exception as e:
        # Best-effort scraping: report the failure and skip this institution.
        print(f"Error fetching {url}: {e}")
        return None
def get_total_pages() -> int:
    """
    Determine how many listing pages exist by inspecting the first page.

    Returns:
        Total number of pages; 0 if the first page could not be fetched,
        1 if no count or pagination could be found
    """
    soup = fetch_page(1)
    if soup is None:
        return 0

    # Preferred source: the "<N> Search results" summary line.
    results_text = soup.find(text=re.compile(r'\d+\s+Search results'))
    if results_text:
        match = re.search(r'(\d+)\s+Search results', str(results_text))
        if match:
            total_results = int(match.group(1))
            # Ceiling division: round up to a whole number of pages.
            total_pages = -(-total_results // RESULTS_PER_PAGE)
            print(f"Found {total_results} total results across {total_pages} pages")
            return total_pages

    # Fallback: the highest page number referenced by pagination links.
    page_nums = []
    for link in soup.find_all('a', href=re.compile(r'\?page=\d+')):
        num_match = re.search(r'page=(\d+)', link['href'])
        if num_match:
            page_nums.append(int(num_match.group(1)))
    return max(page_nums, default=1)
def harvest_all_institutions() -> List[Dict]:
    """
    Harvest all ISIL records from the Swiss directory.

    Walks every listing page to collect institution detail URLs, then
    fetches and parses each detail page, pausing REQUEST_DELAY seconds
    between requests.

    Returns:
        List of institution records (failed detail fetches are skipped)
    """
    print("=" * 60)
    print("Swiss ISIL Database Harvester")
    print("=" * 60)
    print()
    total_pages = get_total_pages()
    print(f"Total pages to harvest: {total_pages}")
    print()
    all_institution_urls = []
    # Step 1: Collect all institution URLs from listing pages
    print("Step 1: Collecting institution URLs from listing pages...")
    for page_num in range(1, total_pages + 1):
        soup = fetch_page(page_num)
        if soup:
            links = extract_institution_links(soup)
            all_institution_urls.extend(links)
            print(f" Found {len(links)} institutions on page {page_num}")
        time.sleep(REQUEST_DELAY)
    # BUGFIX: links were only de-duplicated within a single page, so an
    # institution appearing on several listing pages was fetched repeatedly
    # (and the "unique" claim below was wrong). De-duplicate across pages
    # while preserving first-seen order.
    all_institution_urls = list(dict.fromkeys(all_institution_urls))
    print(f"\nCollected {len(all_institution_urls)} unique institution URLs")
    print()
    # Step 2: Fetch detailed information for each institution
    print("Step 2: Fetching detailed information...")
    institutions = []
    for idx, url in enumerate(all_institution_urls, 1):
        print(f"[{idx}/{len(all_institution_urls)}] ", end='')
        record = fetch_institution_detail(url)
        if record:
            institutions.append(record)
        time.sleep(REQUEST_DELAY)
    print()
    print(f"Successfully harvested {len(institutions)} institutions")
    return institutions
def save_results(institutions: List[Dict], timestamp: str):
    """
    Persist harvested data as JSON, JSONL, and a statistics summary.

    Args:
        institutions: List of institution records
        timestamp: Timestamp string used in output filenames
    """
    # Full dataset as a single pretty-printed JSON array.
    json_path = OUTPUT_DIR / f"swiss_isil_complete_{timestamp}.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, indent=2, ensure_ascii=False)
    print(f"\nSaved JSON: {json_path}")

    # Same dataset as JSON Lines (one record per line, easier to stream).
    jsonl_path = OUTPUT_DIR / f"swiss_isil_complete_{timestamp}.jsonl"
    with open(jsonl_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(inst, ensure_ascii=False) + '\n'
                     for inst in institutions)
    print(f"Saved JSONL: {jsonl_path}")

    # Headline statistics for this harvest run.
    status_tally = {'active': 0, 'inactive': 0}
    for inst in institutions:
        status = inst.get('status')
        if status in status_tally:
            status_tally[status] += 1
    stats = {
        'total_institutions': len(institutions),
        'active_institutions': status_tally['active'],
        'inactive_institutions': status_tally['inactive'],
        'harvest_date': datetime.now().isoformat(),
        'source': 'https://www.isil.nb.admin.ch/en/'
    }
    stats_path = OUTPUT_DIR / f"swiss_isil_stats_{timestamp}.json"
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)
    print(f"Saved stats: {stats_path}")
def main():
    """Entry point: harvest all records, save them, and print a summary."""
    started = time.time()
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    try:
        # Harvest everything, then persist to the output directory.
        records = harvest_all_institutions()
        save_results(records, run_stamp)
        # Summary banner (kept inside the try so Ctrl-C here is handled too).
        elapsed = time.time() - started
        print()
        print("=" * 60)
        print("Harvest Complete!")
        print("=" * 60)
        print(f"Total institutions: {len(records)}")
        print(f"Time elapsed: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")
        print(f"Output directory: {OUTPUT_DIR}")
    except KeyboardInterrupt:
        # User chose to stop; partial results are not saved.
        print("\n\nHarvest interrupted by user")
    except Exception as e:
        # Report, then re-raise so the failure stays visible to the caller.
        print(f"\n\nError during harvest: {e}")
        raise


if __name__ == "__main__":
    main()