# glam/scripts/scrapers/harvest_german_isil.py
# Snapshot: 2025-11-19 23:25:22 +01:00 — 264 lines, 8 KiB, Python
#!/usr/bin/env python3
"""
German ISIL Database Harvester
This script harvests all German ISIL (International Standard Identifier for
Libraries and Related Organizations) records from the Staatsbibliothek zu Berlin
database.
Data source: https://sigel.staatsbibliothek-berlin.de/
API documentation: https://sigel.staatsbibliothek-berlin.de/schnittstellen/api/json-api
The database contains ~17,000 records covering:
- Libraries (Bibliotheken)
- Archives (Archive)
- Museums (Museen)
- Related organizations
APIs available:
1. JSON-API (used here): https://isil.staatsbibliothek-berlin.de/api/org.jsonld
2. SRU: http://services.dnb.de/sru/bib
3. Linked Data: https://ld.zdb-services.de/resource/organisations/<ISIL>
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import quote

import requests
# Configuration
BASE_URL = "https://isil.staatsbibliothek-berlin.de/api/org.jsonld"
# Output directory: overridable via ISIL_OUTPUT_DIR so the script is not tied
# to one developer's machine; default preserves the original hard-coded path.
OUTPUT_DIR = Path(os.environ.get("ISIL_OUTPUT_DIR",
                                 "/Users/kempersc/apps/glam/data/isil"))
PAGE_SIZE = 100  # Max records per page accepted by the API
REQUEST_DELAY = 0.5  # Seconds between requests (be respectful to the server)
# Create output directory (import-time side effect, kept for compatibility)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def fetch_page(query: str, page: int, size: int) -> Optional[Dict]:
    """
    Fetch a single page of results from the German ISIL API.

    Args:
        query: Search query (CQL format)
        page: Page number (1-indexed)
        size: Number of results per page

    Returns:
        Parsed JSON response dict, or None on transport/API/parse error.
    """
    params = {'q': query, 'page': page, 'size': size}
    try:
        print(f"Fetching page {page} (size={size})...", end=' ')
        # Let requests build and percent-encode the query string instead of
        # hand-assembling the URL with quote() — fixes inconsistent encoding
        # of characters like '=' and '*' in the CQL query.
        response = requests.get(BASE_URL, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        # The API reports some errors inside an HTTP-200 body.
        if data.get('type') == 'Error':
            print(f"API Error: {data.get('description')}")
            return None
        total_items = data.get('totalItems', 0)
        page_items = data.get('view', {}).get('totalItems', 0)
        print(f"OK ({page_items} records, {total_items} total)")
        return data
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        return None
def harvest_all_isil(query: str = "isil=DE-*") -> List[Dict]:
    """
    Harvest every ISIL record matching *query*, page by page.

    Args:
        query: CQL query string (default: all German ISIL records)

    Returns:
        List of all harvested record dicts (empty on first-page failure).
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print("Harvesting German ISIL Database")
    print(f"Query: {query}")
    print(f"{banner}\n")

    # The first page also tells us the overall result size and page count.
    first_page = fetch_page(query, 1, PAGE_SIZE)
    if not first_page:
        print("Failed to fetch first page. Aborting.")
        return []

    total_items = first_page.get('totalItems', 0)
    total_pages = first_page.get('view', {}).get('numberOfPages', 0)
    print(f"\nTotal records: {total_items}")
    print(f"Total pages: {total_pages}")
    print(f"Records per page: {PAGE_SIZE}\n")

    collected: List[Dict] = list(first_page.get('member', []))
    print(f"Progress: {len(collected)}/{total_items} records")

    # Walk the remaining pages, tolerating individual page failures.
    for page_num in range(2, total_pages + 1):
        time.sleep(REQUEST_DELAY)  # be respectful to the server
        page_data = fetch_page(query, page_num, PAGE_SIZE)
        if not page_data:
            print(f"Warning: Failed to fetch page {page_num}. Continuing...")
            continue
        collected.extend(page_data.get('member', []))
        print(f"Progress: {len(collected)}/{total_items} records")

    print(f"\n{banner}")
    print(f"Harvest complete: {len(collected)} records retrieved")
    print(f"{banner}\n")
    return collected
def save_records(records: List[Dict], output_file: Path) -> None:
    """Save harvested records to a JSON file wrapped with harvest metadata.

    Args:
        records: List of JSON-LD record dicts to persist.
        output_file: Destination path (parent directory must already exist).
    """
    output = {
        'metadata': {
            'source': 'German ISIL Database (Staatsbibliothek zu Berlin)',
            'source_url': 'https://sigel.staatsbibliothek-berlin.de/',
            'api_url': BASE_URL,
            # datetime.utcnow() is deprecated since Python 3.12; use an
            # aware UTC timestamp and keep the original trailing-'Z' format.
            'harvest_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'total_records': len(records),
            'format': 'JSON-LD',
            'license': 'CC0 1.0 Universal (Public Domain)',
            'notes': 'Data covers German libraries, archives, museums, and related organizations with ISIL identifiers'
        },
        'records': records
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps German umlauts readable in the output file.
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"Records saved to: {output_file}")
    print(f"File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")
def extract_summary_stats(records: List[Dict]) -> Dict:
    """Extract summary statistics from harvested records.

    Returns a dict with the total count, per-type counts ('by_type'),
    aggregate library/archive/museum/other counts (exactly one bucket per
    record, priority Library > Archive > Museum), and counts of records
    exposing a URL (PICA field 009Q) or an email address (field 032P).
    'by_state' is reserved for future use and left empty.
    """
    stats = {
        'total_records': len(records),
        'by_type': {},
        'by_state': {},
        'with_url': 0,
        'with_email': 0,
        'archives': 0,
        'libraries': 0,
        'museums': 0,
        'other': 0
    }
    for record in records:
        types = record.get('type', [])
        if isinstance(types, str):
            # Tolerate a single type given as a bare string.
            types = [types]
        category = None
        if isinstance(types, list):
            for t in types:
                stats['by_type'][t] = stats['by_type'].get(t, 0) + 1
                # First matching type decides the aggregate bucket, so a
                # record is counted once even when several types match
                # (the original double-counted multi-typed records).
                if category is None:
                    if 'Library' in t:
                        category = 'libraries'
                    elif 'Archive' in t:
                        category = 'archives'
                    elif 'Museum' in t:
                        category = 'museums'
        # Bug fix: 'other' was initialized and reported but never
        # incremented in the original implementation.
        stats[category or 'other'] += 1
        # Count records with URLs/emails (PICA3 fields 009Q / 032P).
        data = record.get('data', {})
        if any('009Q' in key for key in data.keys()):
            stats['with_url'] += 1
        if any('032P' in key for key in data.keys()):
            stats['with_email'] += 1
    return stats
def main():
    """Main execution function.

    Harvests all German ISIL records, writes the full dump and a
    summary-statistics file (both timestamped) into OUTPUT_DIR, and
    exits with status 1 if nothing could be harvested.
    """
    print(f"\n{'#'*70}")
    print(f"# German ISIL Database Harvester")
    print(f"# Staatsbibliothek zu Berlin")
    print(f"{'#'*70}\n")
    # Harvest all German ISIL records (CQL wildcard over the DE- prefix)
    records = harvest_all_isil(query="isil=DE-*")
    if not records:
        print("No records harvested. Exiting.")
        sys.exit(1)
    # Save to JSON file; timestamp keeps repeated runs from overwriting
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"german_isil_complete_{timestamp}.json"
    save_records(records, output_file)
    # Generate summary statistics
    print("Generating summary statistics...")
    stats = extract_summary_stats(records)
    print(f"\n{'='*70}")
    print("Summary Statistics:")
    print(f"{'='*70}")
    print(f"Total records: {stats['total_records']}")
    print(f" - Libraries: {stats['libraries']}")
    print(f" - Archives: {stats['archives']}")
    print(f" - Museums: {stats['museums']}")
    print(f" - Other: {stats['other']}")
    print(f"\nRecords with URLs: {stats['with_url']}")
    print(f"Records with email: {stats['with_email']}")
    print(f"\nTop institution types:")
    # Ten most frequent institution types, most common first
    for type_name, count in sorted(stats['by_type'].items(),
                                   key=lambda x: x[1],
                                   reverse=True)[:10]:
        print(f" - {type_name}: {count}")
    print(f"\n{'='*70}\n")
    # Save summary stats alongside the full dump
    stats_file = OUTPUT_DIR / f"german_isil_summary_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"Summary statistics saved to: {stats_file}\n")
    print("✓ Harvest complete!\n")
if __name__ == "__main__":
    main()