#!/usr/bin/env python3
"""
Archivportal-D API Harvester
Fetches all German archives via the Deutsche Digitale Bibliothek REST API

This script harvests complete German archive data from the DDB API, which
aggregates archives from all 16 federal states and 9 archive sectors.

Portal: https://www.archivportal-d.de/
API: https://api.deutsche-digitale-bibliothek.de/
Operator: Deutsche Digitale Bibliothek (DDB)

Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
|
|
|
|
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional

import requests
from dotenv import load_dotenv
|
|
|
|
# --- Filesystem locations ---------------------------------------------
# Harvest output directory; the .env file sits in the same directory.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
env_path = OUTPUT_DIR / ".env"

# Load the .env file first so that DDB_API_KEY can be read from the
# environment below.
load_dotenv(env_path)

# --- DDB API settings -------------------------------------------------
API_BASE_URL = "https://api.deutsche-digitale-bibliothek.de"
# Falls back to a placeholder that main() checks for before harvesting.
API_KEY = os.environ.get("DDB_API_KEY", "YOUR_API_KEY_HERE")

# --- Request pacing ---------------------------------------------------
BATCH_SIZE = 100  # Archives per request
REQUEST_DELAY = 0.5  # Seconds between requests
MAX_RETRIES = 3

# Make sure the output directory exists before anything is written to it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def fetch_archives_batch(offset: int = 0, rows: int = 100) -> Optional[Dict]:
    """
    Fetch a batch of archives via DDB API.

    The request is attempted up to MAX_RETRIES times. After a failed attempt
    (other than the last) the function sleeps REQUEST_DELAY * attempt number
    seconds before trying again.

    Args:
        offset: Starting record number
        rows: Number of records to fetch

    Returns:
        API response dict or None on error
    """
    headers = {"Accept": "application/json"}
    params = {
        "query": "*",  # All archives
        "sector": "sec_01",  # Archives sector (sec_01 per OpenAPI spec)
        "rows": rows,
        "offset": offset,
        "oauth_consumer_key": API_KEY,  # API key as query parameter
    }

    for attempt in range(MAX_RETRIES):
        try:
            print(f"Fetching archives {offset}-{offset+rows-1}...", end=' ')
            response = requests.get(
                f"{API_BASE_URL}/search",
                headers=headers,
                params=params,
                timeout=30,
            )
            response.raise_for_status()

            data = response.json()
            # Callers use dict methods on the result; treat any other JSON
            # payload as a failed attempt instead of crashing later.
            if not isinstance(data, dict):
                raise ValueError(
                    f"unexpected JSON payload type: {type(data).__name__}"
                )
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON (or not a JSON
            # object), so one bad response cannot abort the whole harvest.
            message = str(e)
            # HTTP error messages embed the request URL, which carries the
            # API key as a query parameter; keep the key out of the output.
            if API_KEY:
                message = message.replace(API_KEY, "***")
            print(f"Attempt {attempt+1}/{MAX_RETRIES} failed: {message}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(REQUEST_DELAY * (attempt + 1))
            continue

        total = data.get('numberOfResults', 0)
        print(f"OK (total: {total})")
        return data

    # Every attempt failed (or MAX_RETRIES is not positive).
    return None
|
|
|
|
|
|
def parse_archive_record(record: Dict) -> Dict:
    """
    Convert a raw DDB API record into the harvester's flat archive dict.

    Args:
        record: Raw API record

    Returns:
        Parsed archive dictionary; fields missing from the record are None
    """
    # (output key, key in the raw API record), listed in output order.
    field_map = (
        ('id', 'id'),
        ('name', 'title'),
        ('location', 'place'),
        ('federal_state', 'federalState'),
        ('archive_type', 'label'),
        ('isil', 'isil'),
        ('latitude', 'latitude'),
        ('longitude', 'longitude'),
        ('thumbnail', 'thumbnail'),
    )
    archive = {target: record.get(source) for target, source in field_map}

    # A portal link is only built when the record has a non-empty id.
    record_id = archive['id']
    if record_id:
        archive['profile_url'] = f"https://www.archivportal-d.de/item/{record_id}"
    else:
        archive['profile_url'] = None

    return archive
|
|
|
|
|
|
def harvest_all_archives() -> List[Dict]:
    """
    Page through the DDB search endpoint and collect every archive record.

    Batches of BATCH_SIZE records are requested until the reported total is
    reached, a short batch is returned, or a batch cannot be fetched. In the
    last case the records collected so far are still returned.

    Returns:
        List of parsed archive records
    """
    rule = '=' * 70
    print(f"\n{rule}")
    print("Harvesting Archivportal-D via DDB API")
    print(f"Endpoint: {API_BASE_URL}/search")
    print(f"{rule}\n")

    collected = []
    position = 0
    finished = False

    while not finished:
        page = fetch_archives_batch(position, BATCH_SIZE)
        if not page:
            print(f"Warning: Failed to fetch batch at offset {position}. Stopping.")
            break

        # NOTE(review): assumes 'results' is a flat list of archive records;
        # confirm against the DDB search response schema.
        records = page.get('results', [])
        collected.extend(parse_archive_record(item) for item in records)

        print(f"Progress: {len(collected)} archives collected")

        # Done once the reported total is reached or the batch came back short.
        expected = page.get('numberOfResults', 0)
        finished = len(collected) >= expected or len(records) < BATCH_SIZE

        if not finished:
            position += BATCH_SIZE
            time.sleep(REQUEST_DELAY)  # pause between consecutive requests

    print(f"\n{rule}")
    print(f"Harvest complete: {len(collected)} archives")
    print(f"{rule}\n")

    return collected
|
|
|
|
|
|
def save_archives(archives: List[Dict]) -> Path:
    """
    Save archives to a timestamped JSON file in OUTPUT_DIR.

    The file contains a 'metadata' object (source, endpoint, harvest date,
    record count, licence) and the 'archives' list itself.

    Args:
        archives: Parsed archive records to write

    Returns:
        Path of the JSON file that was written
    """
    # The file name uses local wall-clock time.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"archivportal_d_api_{timestamp}.json"

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop tzinfo so isoformat() yields the same "...Z" string.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)

    output = {
        'metadata': {
            'source': 'Archivportal-D via DDB API',
            'source_url': 'https://www.archivportal-d.de',
            'api_endpoint': f'{API_BASE_URL}/search',
            'operator': 'Deutsche Digitale Bibliothek',
            'harvest_date': utc_now.isoformat() + 'Z',
            'total_archives': len(archives),
            'method': 'REST API',
            'license': 'CC0 1.0 Universal (Public Domain)',
        },
        'archives': archives,
    }

    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"✓ Saved to: {output_file}")
    print(f"  File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")

    return output_file
|
|
|
|
|
|
def generate_statistics(archives: List[Dict]) -> Dict:
    """
    Compute and print summary statistics for the harvested archives.

    Args:
        archives: Parsed archive records

    Returns:
        Dict with 'total', per-state counts ('by_state'), per-type counts
        ('by_type'), and the number of records 'with_isil' and
        'with_coordinates'
    """
    stats = {
        'total': len(archives),
        'by_state': {},
        'by_type': {},
        'with_isil': 0,
        'with_coordinates': 0,
    }

    for archive in archives:
        # Parsed records carry every key, with None for missing values, so a
        # .get() default would never apply; fall back on any empty value.
        state = archive.get('federal_state') or 'Unknown'
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1

        arch_type = archive.get('archive_type') or 'Unknown'
        stats['by_type'][arch_type] = stats['by_type'].get(arch_type, 0) + 1

        # Completeness
        if archive.get('isil'):
            stats['with_isil'] += 1
        # A coordinate needs both parts; compare against None/'' explicitly
        # so that a numeric 0 still counts as a value.
        has_latitude = archive.get('latitude') not in (None, '')
        has_longitude = archive.get('longitude') not in (None, '')
        if has_latitude and has_longitude:
            stats['with_coordinates'] += 1

    total = stats['total']

    def percent(count):
        # Avoid ZeroDivisionError when no archives were harvested.
        return count / total * 100 if total else 0.0

    def by_count(item):
        return item[1]

    print(f"\n{'='*70}")
    print("Statistics:")
    print(f"{'='*70}")
    print(f"Total archives: {total}")
    print(f"With ISIL: {stats['with_isil']} ({percent(stats['with_isil']):.1f}%)")
    print(
        f"With coordinates: {stats['with_coordinates']} "
        f"({percent(stats['with_coordinates']):.1f}%)"
    )

    print("\nTop 10 federal states:")
    for state, count in sorted(stats['by_state'].items(), key=by_count, reverse=True)[:10]:
        print(f"  {state}: {count}")

    print("\nTop 10 archive types:")
    for arch_type, count in sorted(stats['by_type'].items(), key=by_count, reverse=True)[:10]:
        print(f"  {arch_type}: {count}")

    print(f"{'='*70}\n")

    return stats
|
|
|
|
|
|
def main():
    """
    Main execution.

    Checks that an API key is configured, harvests all archives, writes them
    to a JSON file, prints statistics and writes those to a second JSON file.
    Returns early (without writing anything) if the key is missing or no
    archives were harvested.
    """
    print(f"\n{'#'*70}")
    print("# Archivportal-D API Harvester")
    print("# Deutsche Digitale Bibliothek REST API")
    print(f"{'#'*70}\n")

    # API_KEY still holds the placeholder default when DDB_API_KEY was found
    # neither in the environment nor in the .env file.
    if API_KEY == "YOUR_API_KEY_HERE":
        print("ERROR: DDB API key is not configured!")
        print("Set the DDB_API_KEY environment variable, or add the line")
        print("  DDB_API_KEY=your-actual-api-key")
        print(f"to {env_path}")
        print("\nTo get an API key:")
        print("  1. Visit: https://www.deutsche-digitale-bibliothek.de/")
        print("  2. Register for an account (10 minutes)")
        print("  3. Log in and navigate to 'Meine DDB'")
        print("  4. Generate API key in the API section")
        print("  5. Copy the key into DDB_API_KEY as described above\n")
        return

    # Harvest
    archives = harvest_all_archives()

    if not archives:
        print("No archives harvested. Exiting.")
        return

    # Save
    output_file = save_archives(archives)

    # Statistics
    stats = generate_statistics(archives)

    # Save stats
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"archivportal_d_api_stats_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Statistics saved to: {stats_file}\n")

    print("✓ Harvest complete!\n")
    print("Next steps:")
    print(f"  1. Review data: {output_file}")
    print("  2. Run merge script to cross-reference with ISIL data")
    print("  3. Create unified German dataset\n")
|
|
|
|
|
|
# Run the harvest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|