glam/scripts/scrapers/harvest_archivportal_d_api.py
2025-11-19 23:25:22 +01:00

277 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Archivportal-D API Harvester
Fetches all German archives via Deutsche Digitale Bibliothek REST API
This script harvests complete German archive data from the DDB API, which
aggregates archives from all 16 federal states and 9 archive sectors.
Portal: https://www.archivportal-d.de/
API: https://api.deutsche-digitale-bibliothek.de/
Operator: Deutsche Digitale Bibliothek (DDB)
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import requests
from dotenv import load_dotenv
# Load environment variables from .env file
# NOTE(review): absolute, user-specific path — breaks on any other machine;
# consider deriving it from __file__ instead.
env_path = Path("/Users/kempersc/apps/glam/data/isil/germany/.env")
load_dotenv(env_path)
# Configuration
API_BASE_URL = "https://api.deutsche-digitale-bibliothek.de"
# Falls back to a placeholder; main() checks for it and aborts with instructions.
API_KEY = os.getenv("DDB_API_KEY", "YOUR_API_KEY_HERE")
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
BATCH_SIZE = 100 # Archives per request (page size for the search API)
REQUEST_DELAY = 0.5 # Seconds between requests (politeness delay)
MAX_RETRIES = 3
# Side effect at import time: ensure the output directory exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def fetch_archives_batch(offset: int = 0, rows: int = 100) -> Optional[Dict]:
    """
    Fetch one page of archive records from the DDB search API, with retries.

    Args:
        offset: Starting record number (0-based).
        rows: Number of records to fetch in this batch.

    Returns:
        Parsed API response dict, or None if all MAX_RETRIES attempts failed.
    """
    headers = {
        "Accept": "application/json"
    }
    params = {
        "query": "*",  # All archives
        "sector": "sec_01",  # Archives sector (sec_01 per OpenAPI spec)
        "rows": rows,
        "offset": offset,
        "oauth_consumer_key": API_KEY  # API key as query parameter
    }
    for attempt in range(MAX_RETRIES):
        try:
            print(f"Fetching archives {offset}-{offset+rows-1}...", end=' ')
            response = requests.get(
                f"{API_BASE_URL}/search",
                headers=headers,
                params=params,
                timeout=30
            )
            response.raise_for_status()
            data = response.json()
            total = data.get('numberOfResults', 0)
            print(f"OK (total: {total})")
            return data
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError also covers json.JSONDecodeError from response.json():
            # a 200 response with a malformed body must retry, not crash.
            print(f"Attempt {attempt+1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                # Linear backoff: wait longer after each failed attempt.
                time.sleep(REQUEST_DELAY * (attempt + 1))
    return None  # all retries exhausted
def parse_archive_record(record: Dict) -> Dict:
    """
    Convert a raw DDB API record into the simplified archive schema.

    Args:
        record: Raw API record as returned by the search endpoint.

    Returns:
        Dict with normalized keys; missing source fields become None.
    """
    archive_id = record.get('id')
    parsed = {
        'id': archive_id,
        'name': record.get('title'),
        'location': record.get('place'),
        'federal_state': record.get('federalState'),
        'archive_type': record.get('label'),
        'isil': record.get('isil'),
        'latitude': record.get('latitude'),
        'longitude': record.get('longitude'),
        'thumbnail': record.get('thumbnail'),
    }
    # Only records with an id get a portal deep link.
    parsed['profile_url'] = (
        f"https://www.archivportal-d.de/item/{archive_id}" if archive_id else None
    )
    return parsed
def harvest_all_archives() -> List[Dict]:
    """
    Page through the DDB search API and collect every archive record.

    Returns:
        List of parsed archive dictionaries (may be partial if a batch
        failed after retries, or empty if the very first fetch failed).
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"Harvesting Archivportal-D via DDB API")
    print(f"Endpoint: {API_BASE_URL}/search")
    print(f"{banner}\n")
    collected: List[Dict] = []
    offset = 0
    while True:
        batch = fetch_archives_batch(offset, BATCH_SIZE)
        if not batch:
            print(f"Warning: Failed to fetch batch at offset {offset}. Stopping.")
            break
        records = batch.get('results', [])
        collected.extend(parse_archive_record(rec) for rec in records)
        print(f"Progress: {len(collected)} archives collected")
        # Stop when everything is collected or the API returned a short page.
        reported_total = batch.get('numberOfResults', 0)
        if len(collected) >= reported_total or len(records) < BATCH_SIZE:
            break
        offset += BATCH_SIZE
        time.sleep(REQUEST_DELAY)  # politeness delay between pages
    print(f"\n{banner}")
    print(f"Harvest complete: {len(collected)} archives")
    print(f"{banner}\n")
    return collected
def save_archives(archives: List[Dict]):
    """
    Save harvested archives plus provenance metadata to a timestamped JSON file.

    Args:
        archives: Parsed archive dictionaries from harvest_all_archives().

    Returns:
        Path of the written output file.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"archivportal_d_api_{timestamp}.json"
    output = {
        'metadata': {
            'source': 'Archivportal-D via DDB API',
            'source_url': 'https://www.archivportal-d.de',
            'api_endpoint': f'{API_BASE_URL}/search',
            'operator': 'Deutsche Digitale Bibliothek',
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # (Python 3.12+) and naive. Output format is unchanged: "...Z".
            'harvest_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'total_archives': len(archives),
            'method': 'REST API',
            'license': 'CC0 1.0 Universal (Public Domain)'
        },
        'archives': archives
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved to: {output_file}")
    print(f"  File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")
    return output_file
def generate_statistics(archives: List[Dict]):
    """
    Compute and print summary statistics for harvested archives.

    Args:
        archives: Parsed archive dictionaries.

    Returns:
        Stats dict: total, by_state, by_type, with_isil, with_coordinates.
    """
    stats = {
        'total': len(archives),
        'by_state': {},
        'by_type': {},
        'with_isil': 0,
        'with_coordinates': 0
    }
    for archive in archives:
        # Parsed records always contain these keys (possibly None), so a
        # .get(key, 'Unknown') default never fires; use `or` so None/empty
        # values are actually counted under 'Unknown'.
        state = archive.get('federal_state') or 'Unknown'
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1
        arch_type = archive.get('archive_type') or 'Unknown'
        stats['by_type'][arch_type] = stats['by_type'].get(arch_type, 0) + 1
        # Completeness counters
        if archive.get('isil'):
            stats['with_isil'] += 1
        if archive.get('latitude'):
            stats['with_coordinates'] += 1
    print(f"\n{'='*70}")
    print("Statistics:")
    print(f"{'='*70}")
    print(f"Total archives: {stats['total']}")
    if stats['total']:  # guard against division by zero on an empty harvest
        print(f"With ISIL: {stats['with_isil']} ({stats['with_isil']/stats['total']*100:.1f}%)")
        print(f"With coordinates: {stats['with_coordinates']} ({stats['with_coordinates']/stats['total']*100:.1f}%)")
    print(f"\nTop 10 federal states:")
    for state, count in sorted(stats['by_state'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {state}: {count}")
    print(f"\nTop 10 archive types:")
    for arch_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {arch_type}: {count}")
    print(f"{'='*70}\n")
    return stats
def main():
    """Main execution: validate API key, harvest, save, report statistics."""
    print(f"\n{'#'*70}")
    print(f"# Archivportal-D API Harvester")
    print(f"# Deutsche Digitale Bibliothek REST API")
    print(f"{'#'*70}\n")
    # API_KEY is loaded from the DDB_API_KEY environment variable (via .env);
    # the placeholder means it was never configured. The old message wrongly
    # told users to edit the script at "line 21".
    if API_KEY == "YOUR_API_KEY_HERE":
        print("ERROR: DDB API key not configured!")
        print(f"Set DDB_API_KEY in {env_path} (or export it in your shell).")
        print("\nTo get an API key:")
        print(" 1. Visit: https://www.deutsche-digitale-bibliothek.de/")
        print(" 2. Register for an account (10 minutes)")
        print(" 3. Log in and navigate to 'Meine DDB'")
        print(" 4. Generate API key in the API section")
        print(" 5. Add DDB_API_KEY=<your-key> to the .env file\n")
        return
    # Harvest all archive records from the API
    archives = harvest_all_archives()
    if not archives:
        print("No archives harvested. Exiting.")
        return
    # Save raw harvest with metadata
    output_file = save_archives(archives)
    # Compute and print summary statistics
    stats = generate_statistics(archives)
    # Persist statistics alongside the harvest
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"archivportal_d_api_stats_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Statistics saved to: {stats_file}\n")
    print("✓ Harvest complete!\n")
    print("Next steps:")
    print(f"  1. Review data: {output_file}")
    print("  2. Run merge script to cross-reference with ISIL data")
    print("  3. Create unified German dataset\n")
# Run the harvester only when executed as a script, not when imported.
if __name__ == "__main__":
    main()