glam/scripts/scrapers/harvest_archivportal_d_api.py
2025-11-19 23:25:22 +01:00

277 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Archivportal-D API Harvester
Fetches all German archives via Deutsche Digitale Bibliothek REST API
This script harvests complete German archive data from the DDB API, which
aggregates archives from all 16 federal states and 9 archive sectors.
Portal: https://www.archivportal-d.de/
API: https://api.deutsche-digitale-bibliothek.de/
Operator: Deutsche Digitale Bibliothek (DDB)
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import requests
from dotenv import load_dotenv
# Load environment variables from .env file
# NOTE(review): absolute, user-specific path — breaks on any other machine;
# consider deriving it from __file__ instead.
env_path = Path("/Users/kempersc/apps/glam/data/isil/germany/.env")
load_dotenv(env_path)
# Configuration
API_BASE_URL = "https://api.deutsche-digitale-bibliothek.de"
# Falls back to a placeholder; main() checks for it and aborts with instructions.
API_KEY = os.getenv("DDB_API_KEY", "YOUR_API_KEY_HERE")
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
BATCH_SIZE = 100 # Archives per request (page size for the search API)
REQUEST_DELAY = 0.5 # Seconds between requests (politeness delay)
MAX_RETRIES = 3
# Side effect at import time: ensure the output directory exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def fetch_archives_batch(offset: int = 0, rows: int = 100) -> Optional[Dict]:
    """
    Fetch one page of archive records from the DDB search API, with retries.

    Args:
        offset: Starting record number (0-based).
        rows: Number of records to fetch in this batch.

    Returns:
        Parsed API response dict, or None if all MAX_RETRIES attempts failed.
    """
    headers = {
        "Accept": "application/json"
    }
    params = {
        "query": "*",  # All archives
        "sector": "sec_01",  # Archives sector (sec_01 per OpenAPI spec)
        "rows": rows,
        "offset": offset,
        "oauth_consumer_key": API_KEY  # API key as query parameter
    }
    for attempt in range(MAX_RETRIES):
        try:
            print(f"Fetching archives {offset}-{offset+rows-1}...", end=' ')
            response = requests.get(
                f"{API_BASE_URL}/search",
                headers=headers,
                params=params,
                timeout=30
            )
            response.raise_for_status()
            data = response.json()
            total = data.get('numberOfResults', 0)
            print(f"OK (total: {total})")
            return data
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError also covers json.JSONDecodeError from response.json():
            # a 200 response with a malformed body must retry, not crash.
            print(f"Attempt {attempt+1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                # Linear backoff: wait longer after each failed attempt.
                time.sleep(REQUEST_DELAY * (attempt + 1))
    return None  # all retries exhausted
def parse_archive_record(record: Dict) -> Dict:
    """
    Convert a raw DDB API record into the simplified archive schema.

    Args:
        record: Raw API record as returned by the search endpoint.

    Returns:
        Dict with normalized keys; missing source fields become None.
    """
    archive_id = record.get('id')
    parsed = {
        'id': archive_id,
        'name': record.get('title'),
        'location': record.get('place'),
        'federal_state': record.get('federalState'),
        'archive_type': record.get('label'),
        'isil': record.get('isil'),
        'latitude': record.get('latitude'),
        'longitude': record.get('longitude'),
        'thumbnail': record.get('thumbnail'),
    }
    # Only records with an id get a portal deep link.
    parsed['profile_url'] = (
        f"https://www.archivportal-d.de/item/{archive_id}" if archive_id else None
    )
    return parsed
def harvest_all_archives() -> List[Dict]:
    """
    Page through the DDB search API and collect every archive record.

    Returns:
        List of parsed archive dictionaries (may be partial if a batch
        failed after retries, or empty if the very first fetch failed).
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"Harvesting Archivportal-D via DDB API")
    print(f"Endpoint: {API_BASE_URL}/search")
    print(f"{banner}\n")
    collected: List[Dict] = []
    offset = 0
    while True:
        batch = fetch_archives_batch(offset, BATCH_SIZE)
        if not batch:
            print(f"Warning: Failed to fetch batch at offset {offset}. Stopping.")
            break
        records = batch.get('results', [])
        collected.extend(parse_archive_record(rec) for rec in records)
        print(f"Progress: {len(collected)} archives collected")
        # Stop when everything is collected or the API returned a short page.
        reported_total = batch.get('numberOfResults', 0)
        if len(collected) >= reported_total or len(records) < BATCH_SIZE:
            break
        offset += BATCH_SIZE
        time.sleep(REQUEST_DELAY)  # politeness delay between pages
    print(f"\n{banner}")
    print(f"Harvest complete: {len(collected)} archives")
    print(f"{banner}\n")
    return collected
def save_archives(archives: List[Dict]):
    """
    Save harvested archives plus provenance metadata to a timestamped JSON file.

    Args:
        archives: Parsed archive dictionaries from harvest_all_archives().

    Returns:
        Path of the written output file.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"archivportal_d_api_{timestamp}.json"
    output = {
        'metadata': {
            'source': 'Archivportal-D via DDB API',
            'source_url': 'https://www.archivportal-d.de',
            'api_endpoint': f'{API_BASE_URL}/search',
            'operator': 'Deutsche Digitale Bibliothek',
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # (Python 3.12+) and naive. Output format is unchanged: "...Z".
            'harvest_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'total_archives': len(archives),
            'method': 'REST API',
            'license': 'CC0 1.0 Universal (Public Domain)'
        },
        'archives': archives
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved to: {output_file}")
    print(f"  File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")
    return output_file
def generate_statistics(archives: List[Dict]):
    """
    Compute and print summary statistics for harvested archives.

    Args:
        archives: Parsed archive dictionaries.

    Returns:
        Stats dict: total, by_state, by_type, with_isil, with_coordinates.
    """
    stats = {
        'total': len(archives),
        'by_state': {},
        'by_type': {},
        'with_isil': 0,
        'with_coordinates': 0
    }
    for archive in archives:
        # Parsed records always contain these keys (possibly None), so a
        # .get(key, 'Unknown') default never fires; use `or` so None/empty
        # values are actually counted under 'Unknown'.
        state = archive.get('federal_state') or 'Unknown'
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1
        arch_type = archive.get('archive_type') or 'Unknown'
        stats['by_type'][arch_type] = stats['by_type'].get(arch_type, 0) + 1
        # Completeness counters
        if archive.get('isil'):
            stats['with_isil'] += 1
        if archive.get('latitude'):
            stats['with_coordinates'] += 1
    print(f"\n{'='*70}")
    print("Statistics:")
    print(f"{'='*70}")
    print(f"Total archives: {stats['total']}")
    if stats['total']:  # guard against division by zero on an empty harvest
        print(f"With ISIL: {stats['with_isil']} ({stats['with_isil']/stats['total']*100:.1f}%)")
        print(f"With coordinates: {stats['with_coordinates']} ({stats['with_coordinates']/stats['total']*100:.1f}%)")
    print(f"\nTop 10 federal states:")
    for state, count in sorted(stats['by_state'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {state}: {count}")
    print(f"\nTop 10 archive types:")
    for arch_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {arch_type}: {count}")
    print(f"{'='*70}\n")
    return stats
def main():
    """Main execution: validate API key, harvest, save, report statistics."""
    print(f"\n{'#'*70}")
    print(f"# Archivportal-D API Harvester")
    print(f"# Deutsche Digitale Bibliothek REST API")
    print(f"{'#'*70}\n")
    # API_KEY is loaded from the DDB_API_KEY environment variable (via .env);
    # the placeholder means it was never configured. The old message wrongly
    # told users to edit the script at "line 21".
    if API_KEY == "YOUR_API_KEY_HERE":
        print("ERROR: DDB API key not configured!")
        print(f"Set DDB_API_KEY in {env_path} (or export it in your shell).")
        print("\nTo get an API key:")
        print(" 1. Visit: https://www.deutsche-digitale-bibliothek.de/")
        print(" 2. Register for an account (10 minutes)")
        print(" 3. Log in and navigate to 'Meine DDB'")
        print(" 4. Generate API key in the API section")
        print(" 5. Add DDB_API_KEY=<your-key> to the .env file\n")
        return
    # Harvest all archive records from the API
    archives = harvest_all_archives()
    if not archives:
        print("No archives harvested. Exiting.")
        return
    # Save raw harvest with metadata
    output_file = save_archives(archives)
    # Compute and print summary statistics
    stats = generate_statistics(archives)
    # Persist statistics alongside the harvest
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"archivportal_d_api_stats_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Statistics saved to: {stats_file}\n")
    print("✓ Harvest complete!\n")
    print("Next steps:")
    print(f"  1. Review data: {output_file}")
    print("  2. Run merge script to cross-reference with ISIL data")
    print("  3. Create unified German dataset\n")
# Run the harvester only when executed as a script, not when imported.
if __name__ == "__main__":
    main()