# glam/scripts/scrapers/harvest_german_isil.py
# Snapshot: 2025-11-19 23:25:22 +01:00 — 264 lines, 8 KiB, Python
#!/usr/bin/env python3
"""
German ISIL Database Harvester
This script harvests all German ISIL (International Standard Identifier for
Libraries and Related Organizations) records from the Staatsbibliothek zu Berlin
database.
Data source: https://sigel.staatsbibliothek-berlin.de/
API documentation: https://sigel.staatsbibliothek-berlin.de/schnittstellen/api/json-api
The database contains ~17,000 records covering:
- Libraries (Bibliotheken)
- Archives (Archive)
- Museums (Museen)
- Related organizations
APIs available:
1. JSON-API (used here): https://isil.staatsbibliothek-berlin.de/api/org.jsonld
2. SRU: http://services.dnb.de/sru/bib
3. Linked Data: https://ld.zdb-services.de/resource/organisations/<ISIL>
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import quote

import requests
# Configuration
BASE_URL = "https://isil.staatsbibliothek-berlin.de/api/org.jsonld"
# Output directory: overridable via ISIL_OUTPUT_DIR so the script is not tied
# to one developer's machine; default preserves the original hard-coded path.
OUTPUT_DIR = Path(os.environ.get("ISIL_OUTPUT_DIR",
                                 "/Users/kempersc/apps/glam/data/isil"))
PAGE_SIZE = 100  # Max records per page accepted by the API
REQUEST_DELAY = 0.5  # Seconds between requests (be respectful to the server)
# Create output directory (import-time side effect, kept for compatibility)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def fetch_page(query: str, page: int, size: int) -> Optional[Dict]:
    """
    Fetch a single page of results from the German ISIL API.

    Args:
        query: Search query (CQL format)
        page: Page number (1-indexed)
        size: Number of results per page

    Returns:
        Parsed JSON response dict, or None on transport/API/parse error.
    """
    params = {'q': query, 'page': page, 'size': size}
    try:
        print(f"Fetching page {page} (size={size})...", end=' ')
        # Let requests build and percent-encode the query string instead of
        # hand-assembling the URL with quote() — fixes inconsistent encoding
        # of characters like '=' and '*' in the CQL query.
        response = requests.get(BASE_URL, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        # The API reports some errors inside an HTTP-200 body.
        if data.get('type') == 'Error':
            print(f"API Error: {data.get('description')}")
            return None
        total_items = data.get('totalItems', 0)
        page_items = data.get('view', {}).get('totalItems', 0)
        print(f"OK ({page_items} records, {total_items} total)")
        return data
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        return None
def harvest_all_isil(query: str = "isil=DE-*") -> List[Dict]:
    """
    Harvest every ISIL record matching *query*, page by page.

    Args:
        query: CQL query string (default: all German ISIL records)

    Returns:
        List of all harvested record dicts (empty on first-page failure).
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print("Harvesting German ISIL Database")
    print(f"Query: {query}")
    print(f"{banner}\n")

    # The first page also tells us the overall result size and page count.
    first_page = fetch_page(query, 1, PAGE_SIZE)
    if not first_page:
        print("Failed to fetch first page. Aborting.")
        return []

    total_items = first_page.get('totalItems', 0)
    total_pages = first_page.get('view', {}).get('numberOfPages', 0)
    print(f"\nTotal records: {total_items}")
    print(f"Total pages: {total_pages}")
    print(f"Records per page: {PAGE_SIZE}\n")

    collected: List[Dict] = list(first_page.get('member', []))
    print(f"Progress: {len(collected)}/{total_items} records")

    # Walk the remaining pages, tolerating individual page failures.
    for page_num in range(2, total_pages + 1):
        time.sleep(REQUEST_DELAY)  # be respectful to the server
        page_data = fetch_page(query, page_num, PAGE_SIZE)
        if not page_data:
            print(f"Warning: Failed to fetch page {page_num}. Continuing...")
            continue
        collected.extend(page_data.get('member', []))
        print(f"Progress: {len(collected)}/{total_items} records")

    print(f"\n{banner}")
    print(f"Harvest complete: {len(collected)} records retrieved")
    print(f"{banner}\n")
    return collected
def save_records(records: List[Dict], output_file: Path) -> None:
    """Save harvested records to a JSON file wrapped with harvest metadata.

    Args:
        records: List of JSON-LD record dicts to persist.
        output_file: Destination path (parent directory must already exist).
    """
    output = {
        'metadata': {
            'source': 'German ISIL Database (Staatsbibliothek zu Berlin)',
            'source_url': 'https://sigel.staatsbibliothek-berlin.de/',
            'api_url': BASE_URL,
            # datetime.utcnow() is deprecated since Python 3.12; use an
            # aware UTC timestamp and keep the original trailing-'Z' format.
            'harvest_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'total_records': len(records),
            'format': 'JSON-LD',
            'license': 'CC0 1.0 Universal (Public Domain)',
            'notes': 'Data covers German libraries, archives, museums, and related organizations with ISIL identifiers'
        },
        'records': records
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps German umlauts readable in the output file.
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"Records saved to: {output_file}")
    print(f"File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")
def extract_summary_stats(records: List[Dict]) -> Dict:
    """Extract summary statistics from harvested records.

    Returns a dict with the total count, per-type counts ('by_type'),
    aggregate library/archive/museum/other counts (exactly one bucket per
    record, priority Library > Archive > Museum), and counts of records
    exposing a URL (PICA field 009Q) or an email address (field 032P).
    'by_state' is reserved for future use and left empty.
    """
    stats = {
        'total_records': len(records),
        'by_type': {},
        'by_state': {},
        'with_url': 0,
        'with_email': 0,
        'archives': 0,
        'libraries': 0,
        'museums': 0,
        'other': 0
    }
    for record in records:
        types = record.get('type', [])
        if isinstance(types, str):
            # Tolerate a single type given as a bare string.
            types = [types]
        category = None
        if isinstance(types, list):
            for t in types:
                stats['by_type'][t] = stats['by_type'].get(t, 0) + 1
                # First matching type decides the aggregate bucket, so a
                # record is counted once even when several types match
                # (the original double-counted multi-typed records).
                if category is None:
                    if 'Library' in t:
                        category = 'libraries'
                    elif 'Archive' in t:
                        category = 'archives'
                    elif 'Museum' in t:
                        category = 'museums'
        # Bug fix: 'other' was initialized and reported but never
        # incremented in the original implementation.
        stats[category or 'other'] += 1
        # Count records with URLs/emails (PICA3 fields 009Q / 032P).
        data = record.get('data', {})
        if any('009Q' in key for key in data.keys()):
            stats['with_url'] += 1
        if any('032P' in key for key in data.keys()):
            stats['with_email'] += 1
    return stats
def main():
    """Main execution function.

    Harvests all German ISIL records, writes the full dump and a
    summary-statistics file (both timestamped) into OUTPUT_DIR, and
    exits with status 1 if nothing could be harvested.
    """
    print(f"\n{'#'*70}")
    print(f"# German ISIL Database Harvester")
    print(f"# Staatsbibliothek zu Berlin")
    print(f"{'#'*70}\n")
    # Harvest all German ISIL records (CQL wildcard over the DE- prefix)
    records = harvest_all_isil(query="isil=DE-*")
    if not records:
        print("No records harvested. Exiting.")
        sys.exit(1)
    # Save to JSON file; timestamp keeps repeated runs from overwriting
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"german_isil_complete_{timestamp}.json"
    save_records(records, output_file)
    # Generate summary statistics
    print("Generating summary statistics...")
    stats = extract_summary_stats(records)
    print(f"\n{'='*70}")
    print("Summary Statistics:")
    print(f"{'='*70}")
    print(f"Total records: {stats['total_records']}")
    print(f" - Libraries: {stats['libraries']}")
    print(f" - Archives: {stats['archives']}")
    print(f" - Museums: {stats['museums']}")
    print(f" - Other: {stats['other']}")
    print(f"\nRecords with URLs: {stats['with_url']}")
    print(f"Records with email: {stats['with_email']}")
    print(f"\nTop institution types:")
    # Ten most frequent institution types, most common first
    for type_name, count in sorted(stats['by_type'].items(),
                                   key=lambda x: x[1],
                                   reverse=True)[:10]:
        print(f" - {type_name}: {count}")
    print(f"\n{'='*70}\n")
    # Save summary stats alongside the full dump
    stats_file = OUTPUT_DIR / f"german_isil_summary_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"Summary statistics saved to: {stats_file}\n")
    print("✓ Harvest complete!\n")
if __name__ == "__main__":
    main()