#!/usr/bin/env python3
"""
German ISIL Database Harvester

This script harvests all German ISIL (International Standard Identifier for
Libraries and Related Organizations) records from the Staatsbibliothek zu Berlin
database.

Data source: https://sigel.staatsbibliothek-berlin.de/
API documentation: https://sigel.staatsbibliothek-berlin.de/schnittstellen/api/json-api

The database contains ~17,000 records covering:
- Libraries (Bibliotheken)
- Archives (Archive)
- Museums (Museen)
- Related organizations

APIs available:
1. JSON-API (used here): https://isil.staatsbibliothek-berlin.de/api/org.jsonld
2. SRU: http://services.dnb.de/sru/bib
3. Linked Data: https://ld.zdb-services.de/resource/organisations/<ISIL>

Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
|
|
|
|
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import quote

import requests
|
|
|
# Configuration
# JSON-LD endpoint of the German ISIL registry (see module docstring, API #1).
BASE_URL = "https://isil.staatsbibliothek-berlin.de/api/org.jsonld"
# NOTE(review): hard-coded absolute path — assumes one specific machine's
# layout; consider making this configurable (env var / CLI arg).
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil")
PAGE_SIZE = 100  # Max records per page
REQUEST_DELAY = 0.5  # Seconds between requests (be respectful to the server)

# Create output directory at import time so later writes cannot fail on a
# missing directory (side effect of importing this module).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
def fetch_page(query: str, page: int, size: int) -> Optional[Dict]:
    """
    Fetch a single page of results from the German ISIL API.

    Args:
        query: Search query (CQL format)
        page: Page number (1-indexed)
        size: Number of results per page

    Returns:
        Parsed JSON response dict, or None on any request/parse/API error.
    """
    params = {
        'q': query,
        'page': page,
        'size': size
    }

    try:
        print(f"Fetching page {page} (size={size})...", end=' ')
        # Let requests build and percent-encode the query string instead of
        # hand-rolling it with urllib.parse.quote — same URL, less code, and
        # correct encoding for every parameter value.
        response = requests.get(BASE_URL, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()

        # The API signals some failures in-band with a JSON-LD 'Error' type
        # rather than an HTTP error status.
        if data.get('type') == 'Error':
            print(f"API Error: {data.get('description')}")
            return None

        total_items = data.get('totalItems', 0)
        page_items = data.get('view', {}).get('totalItems', 0)
        print(f"OK ({page_items} records, {total_items} total)")

        return data

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
    except json.JSONDecodeError as e:
        # requests' JSONDecodeError subclasses json.JSONDecodeError, so this
        # also catches response.json() failures.
        print(f"JSON decode error: {e}")
        return None
|
|
|
|
|
|
def harvest_all_isil(query: str = "isil=DE-*") -> List[Dict]:
    """
    Harvest all ISIL records matching the query.

    Fetches the first page to learn the total record/page counts, then pages
    through the rest with a politeness delay, skipping (with a warning) any
    page that fails.

    Args:
        query: CQL query string (default: all German ISIL records)

    Returns:
        List of all harvested records (possibly partial if pages failed);
        empty list if even the first page could not be fetched.
    """
    all_records = []
    page = 1

    print(f"\n{'='*70}")
    print(f"Harvesting German ISIL Database")
    print(f"Query: {query}")
    print(f"{'='*70}\n")

    # Get first page to determine total
    first_page = fetch_page(query, page, PAGE_SIZE)
    if not first_page:
        print("Failed to fetch first page. Aborting.")
        return []

    total_items = first_page.get('totalItems', 0)
    total_pages = first_page.get('view', {}).get('numberOfPages', 0)
    # Bug fix: if the response omits 'view.numberOfPages', total_pages was 0
    # and the harvest silently stopped after page 1 despite totalItems
    # indicating more data. Fall back to ceiling division.
    if not total_pages and total_items:
        total_pages = -(-total_items // PAGE_SIZE)

    print(f"\nTotal records: {total_items}")
    print(f"Total pages: {total_pages}")
    print(f"Records per page: {PAGE_SIZE}\n")

    # Extract records from first page
    records = first_page.get('member', [])
    all_records.extend(records)
    print(f"Progress: {len(all_records)}/{total_items} records")

    # Fetch remaining pages
    for page in range(2, total_pages + 1):
        time.sleep(REQUEST_DELAY)  # Be respectful to server

        page_data = fetch_page(query, page, PAGE_SIZE)
        if not page_data:
            # Best-effort harvest: skip failed pages rather than aborting.
            print(f"Warning: Failed to fetch page {page}. Continuing...")
            continue

        records = page_data.get('member', [])
        all_records.extend(records)
        print(f"Progress: {len(all_records)}/{total_items} records")

    print(f"\n{'='*70}")
    print(f"Harvest complete: {len(all_records)} records retrieved")
    print(f"{'='*70}\n")

    return all_records
|
|
|
|
|
|
def save_records(records: List[Dict], output_file: Path):
    """Save records to JSON file with metadata.

    Args:
        records: Harvested ISIL records.
        output_file: Destination path; overwritten if it exists.
    """
    output = {
        'metadata': {
            'source': 'German ISIL Database (Staatsbibliothek zu Berlin)',
            'source_url': 'https://sigel.staatsbibliothek-berlin.de/',
            'api_url': BASE_URL,
            # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated
            # (Python 3.12+) and returns a naive datetime; this produces the
            # same '...Z'-suffixed ISO-8601 string.
            'harvest_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'total_records': len(records),
            'format': 'JSON-LD',
            'license': 'CC0 1.0 Universal (Public Domain)',
            'notes': 'Data covers German libraries, archives, museums, and related organizations with ISIL identifiers'
        },
        'records': records
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"Records saved to: {output_file}")
    print(f"File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")
|
|
|
|
|
|
def extract_summary_stats(records: List[Dict]) -> Dict:
    """Extract summary statistics from records.

    Args:
        records: ISIL records as returned by the JSON API; each may carry a
            'type' list and a 'data' dict of PICA fields.

    Returns:
        Dict with the total count, per-type counts ('by_type'), category
        aggregates ('libraries'/'archives'/'museums'/'other'), and counts of
        records exposing a URL (field key containing '009Q') or an email
        address (field key containing '032P'). 'by_state' is reserved and
        left empty, matching the original output schema.
    """
    stats = {
        'total_records': len(records),
        'by_type': {},
        'by_state': {},
        'with_url': 0,
        'with_email': 0,
        'archives': 0,
        'libraries': 0,
        'museums': 0,
        'other': 0
    }

    for record in records:
        # Count by type. A record may carry several types; each matching
        # type bumps its aggregate (preserving the original per-type logic).
        types = record.get('type', [])
        matched = False
        if isinstance(types, list):
            for t in types:
                stats['by_type'][t] = stats['by_type'].get(t, 0) + 1

                if 'Library' in t:
                    stats['libraries'] += 1
                    matched = True
                elif 'Archive' in t:
                    stats['archives'] += 1
                    matched = True
                elif 'Museum' in t:
                    stats['museums'] += 1
                    matched = True

        # Bug fix: 'other' was declared (and printed by main) but never
        # incremented. Records with no Library/Archive/Museum type — or with
        # a missing/non-list 'type' — now land here.
        if not matched:
            stats['other'] += 1

        # Count records with URLs/emails (substring match on PICA field keys)
        data = record.get('data', {})
        if any('009Q' in key for key in data):
            stats['with_url'] += 1
        if any('032P' in key for key in data):
            stats['with_email'] += 1

    return stats
|
|
|
|
|
|
def main():
    """Run the full pipeline: harvest, persist, and summarize ISIL records."""
    banner = '#' * 70
    print(f"\n{banner}")
    print("# German ISIL Database Harvester")
    print("# Staatsbibliothek zu Berlin")
    print(f"{banner}\n")

    # Harvest every German ISIL record.
    records = harvest_all_isil(query="isil=DE-*")
    if not records:
        print("No records harvested. Exiting.")
        sys.exit(1)

    # Persist the complete dump under a timestamped filename.
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    dump_path = OUTPUT_DIR / f"german_isil_complete_{run_stamp}.json"
    save_records(records, dump_path)

    # Derive and display summary statistics.
    print("Generating summary statistics...")
    stats = extract_summary_stats(records)

    rule = '=' * 70
    print(f"\n{rule}")
    print("Summary Statistics:")
    print(rule)
    print(f"Total records: {stats['total_records']}")
    for label, key in (('Libraries', 'libraries'), ('Archives', 'archives'),
                       ('Museums', 'museums'), ('Other', 'other')):
        print(f"  - {label}: {stats[key]}")
    print(f"\nRecords with URLs: {stats['with_url']}")
    print(f"Records with email: {stats['with_email']}")

    print("\nTop institution types:")
    ranked = sorted(stats['by_type'].items(), key=lambda kv: kv[1], reverse=True)
    for type_name, count in ranked[:10]:
        print(f"  - {type_name}: {count}")

    print(f"\n{rule}\n")

    # Persist the summary alongside the dump, sharing the run timestamp.
    stats_path = OUTPUT_DIR / f"german_isil_summary_{run_stamp}.json"
    with open(stats_path, 'w', encoding='utf-8') as fh:
        json.dump(stats, fh, ensure_ascii=False, indent=2)
    print(f"Summary statistics saved to: {stats_path}\n")

    print("✓ Harvest complete!\n")
|
|
|
|
|
|
# Entry-point guard: run the harvest only when executed as a script,
# not when imported.
if __name__ == "__main__":
    main()
|