glam/scripts/scrapers/harvest_sachsen_anhalt_ddb_api.py
2025-11-21 22:12:33 +01:00

76 lines
2.3 KiB
Python

#!/usr/bin/env python3
"""
Sachsen-Anhalt GLAM Institutions - DDB API Harvest
Extracts museums, libraries, and archives via DDB Search API
"""
import requests
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any
import time
def query_ddb_search_api(query: str, rows: int = 100) -> Dict[str, Any]:
    """Query the DDB (Deutsche Digitale Bibliothek) Search API.

    Args:
        query: Free-text search query passed to the API's ``query`` parameter.
        rows: Maximum number of result rows to request (API default page size).

    Returns:
        The parsed JSON response as a dict on success, or an empty dict ``{}``
        if the HTTP request fails for any reason.

    Note:
        The declared return type was previously ``List[Dict[str, Any]]``,
        but the function has always returned the raw JSON object (a dict)
        or ``{}`` on failure; the annotation is corrected to match.
    """
    # DDB Search API endpoint
    base_url = "https://api.deutsche-digitale-bibliothek.de/search"
    # API key (public key from DDB documentation)
    api_key = "YOUR_API_KEY"  # Note: DDB requires registration for API key
    params = {
        'query': query,
        'rows': rows,
        'offset': 0,
        'facet': 'sector_fct',  # Filter by sector (archives, libraries, museums)
        'oauth_consumer_key': api_key
    }
    print(f"Querying DDB Search API: {query}")
    print(f"URL: {base_url}")
    try:
        response = requests.get(base_url, params=params, timeout=30)
        # Raise on 4xx/5xx so auth failures surface as RequestException below.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Best-effort: report and return an empty result rather than crash.
        print(f"❌ API request failed: {e}")
        return {}
def scrape_ddb_web_interface() -> List[Dict[str, Any]]:
    """
    Alternative: Scrape DDB web interface for Sachsen-Anhalt institutions.
    Uses the public search interface without requiring API key.
    """
    messages = (
        "DDB API requires authentication. Switching to Archivportal-D approach...",
        "",
    )
    for message in messages:
        print(message)
    # Archivportal-D has better coverage and no API key requirement
    return []
def main() -> None:
    """Main execution: print status report and suggested harvest strategy."""
    divider = "=" * 80
    report = [
        divider,
        "Sachsen-Anhalt GLAM Institutions - DDB API Harvest",
        divider,
        "",
        "⚠️ DDB SPARQL endpoint is unavailable (404 Not Found)",
        "⚠️ DDB Search API requires authentication key",
        "",
        "Alternative approach: Use Archivportal-D for archive coverage",
        " + Direct website scraping for museums/libraries",
        "",
        "Next steps:",
        " 1. Harvest from Archivportal-D (Sachsen-Anhalt filter)",
        " 2. Scrape Museumsverband Sachsen-Anhalt website",
        " 3. Scrape regional library networks",
        "",
        divider,
    ]
    print("\n".join(report))


if __name__ == '__main__':
    main()