glam/scripts/scrapers/harvest_ddb_institutions.py
2025-11-19 23:25:22 +01:00

240 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""
DDB Institutions Harvester

Fetches German heritage institutions from the Deutsche Digitale Bibliothek (DDB) API.

This script harvests archive institutions from the DDB /institutions endpoint,
which provides structured hierarchical data with geocoding.

API: https://api.deutsche-digitale-bibliothek.de/institutions
Operator: Deutsche Digitale Bibliothek (DDB)
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import requests
import json
import os
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict
from dotenv import load_dotenv
# Load environment variables
# NOTE(review): this is an absolute, machine-specific path; the .env file is
# only found on a host that has exactly this directory layout.
env_path = Path("/Users/kempersc/apps/glam/data/isil/germany/.env")
load_dotenv(env_path)
# Configuration
API_BASE_URL = "https://api.deutsche-digitale-bibliothek.de"
# Read after load_dotenv() so a key defined in the .env file is visible here.
# The placeholder default is what main() compares against to detect a
# missing key.
API_KEY = os.getenv("DDB_API_KEY", "YOUR_API_KEY_HERE")
# Directory that receives both the harvested data and the statistics files.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Created at import time (including parents); a no-op when it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Sector codes from OpenAPI spec
# Maps the sector code sent to the API to the display name used in console
# output, output file names and the saved metadata.
SECTORS = {
    "sec_01": "Archive",
    "sec_02": "Library",
    "sec_03": "Monument protection",
    "sec_04": "Research",
    "sec_05": "Media",
    "sec_06": "Museum",
    "sec_07": "Other"
}
def fetch_institutions(sector: str = "sec_01") -> List[Dict]:
    """
    Download the institution list of one DDB sector.

    Sends a single GET request to ``{API_BASE_URL}/institutions`` with the
    sector code and the API key as query parameters (60 second timeout) and
    returns the decoded JSON body as received.

    Args:
        sector: Sector code, e.g. ``sec_01`` for archives. Codes missing from
            SECTORS are still sent; they are just printed verbatim.

    Returns:
        The parsed JSON response (annotated as a list of institution dicts).

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` when the API
            answers with an error status.
    """
    sector_label = SECTORS.get(sector, sector)
    print(f"Fetching {sector_label} institutions...")

    query = {"sector": sector, "oauth_consumer_key": API_KEY}
    reply = requests.get(f"{API_BASE_URL}/institutions", params=query, timeout=60)
    reply.raise_for_status()

    payload = reply.json()
    print(f" ✓ Received {len(payload)} institutions")
    return payload
def flatten_hierarchy(institutions: List[Dict]) -> List[Dict]:
    """
    Flatten a hierarchical institution structure into one list.

    The tree is walked depth-first in pre-order: every institution is emitted
    before its children, and siblings keep their original order. Each emitted
    record is a shallow copy of the source dict without its ``children`` key;
    records that have a parent additionally carry a ``parent_id`` key holding
    the parent's ``id``.

    The input is never modified, so the caller's nested structure stays
    intact after the call.

    Args:
        institutions: List of institutions (may have nested ``children``).
            A ``children`` value of ``None`` or a missing key is treated as
            "no children".

    Returns:
        Flattened list with all institutions and children. A child whose
        parent has no ``id`` is emitted without ``parent_id``.
    """
    flat_list: List[Dict] = []

    def process_institution(inst: Dict, parent_id=None) -> None:
        # Copy instead of popping so the caller's dict is left untouched.
        record = {key: value for key, value in inst.items() if key != 'children'}
        # `is not None` rather than truthiness: a falsy id such as 0 is still
        # a valid parent reference.
        if parent_id is not None:
            record['parent_id'] = parent_id
        flat_list.append(record)

        own_id = inst.get('id')
        # `or []` also covers an explicit JSON null for "children".
        for child in inst.get('children') or []:
            process_institution(child, own_id)

    for inst in institutions:
        process_institution(inst)
    return flat_list
def save_institutions(institutions: List[Dict], sector: str):
    """
    Flatten the harvested institutions and write them to a JSON file.

    The file is created in OUTPUT_DIR and named
    ``ddb_institutions_<sector name>_<timestamp>.json``. It holds a
    ``metadata`` object describing the harvest plus the flattened
    ``institutions`` list. A short summary is printed afterwards.

    NOTE(review): the file-name timestamp comes from a naive
    ``datetime.now()``, while ``harvest_date`` in the metadata is UTC.

    Args:
        institutions: Institutions as returned by the API (possibly nested).
        sector: Sector code; its display name is looked up in SECTORS.

    Returns:
        Tuple ``(output_file, flat_institutions)``: the written path and the
        flattened list.
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    sector_label = SECTORS.get(sector, sector)
    slug = sector_label.lower().replace(" ", "_")
    output_file = OUTPUT_DIR / f"ddb_institutions_{slug}_{stamp}.json"

    # Flatten first so the metadata can report the flat record count.
    flat_institutions = flatten_hierarchy(institutions)
    record_count = len(flat_institutions)

    metadata = {
        'source': 'Deutsche Digitale Bibliothek (DDB)',
        'source_url': 'https://www.deutsche-digitale-bibliothek.de',
        'api_endpoint': f'{API_BASE_URL}/institutions',
        'operator': 'Deutsche Digitale Bibliothek',
        'harvest_date': datetime.now(timezone.utc).isoformat(),
        'sector': sector,
        'sector_name': sector_label,
        'total_institutions': record_count,
        'method': 'REST API /institutions',
        'license': 'CC0 1.0 Universal (Public Domain)',
    }
    document = {'metadata': metadata, 'institutions': flat_institutions}

    with output_file.open('w', encoding='utf-8') as handle:
        json.dump(document, handle, ensure_ascii=False, indent=2)

    size_mb = output_file.stat().st_size / 1024 / 1024
    print(f"✓ Saved to: {output_file}")
    print(f" File size: {size_mb:.2f} MB")
    print(f" Total institutions (flat): {record_count}\n")
    return output_file, flat_institutions
def generate_statistics(institutions: List[Dict], sector: str):
    """
    Compute, print and return summary statistics for a flat institution list.

    Every institution key that is read (``hasItems``, ``latitude``,
    ``locationDisplayName``, ``numberOfItems``, ``name``) is treated as
    optional, and an empty list yields all-zero statistics instead of a
    division error.

    Args:
        institutions: Flat list of institution dicts.
        sector: Sector code; its display name is looked up in SECTORS.

    Returns:
        Dict with the keys ``sector``, ``sector_name``, ``total``,
        ``with_items``, ``with_coordinates``, ``by_state`` (state name ->
        count) and ``top_institutions`` (up to ten ``name``/``items``/
        ``location`` records, ordered by item count, highest first).
    """
    # Local import keeps this block self-contained; the module itself does
    # not import Counter.
    from collections import Counter

    total = len(institutions)

    def _item_count(inst: Dict) -> int:
        # A missing key and an explicit null both count as "no items".
        return inst.get('numberOfItems') or 0

    def _state_of(inst: Dict) -> str:
        # State is typically second-to-last in comma-separated location.
        location = inst.get('locationDisplayName') or ''
        parts = [part.strip() for part in location.split(',')]
        return parts[-2] if len(parts) >= 2 else 'Unknown'

    def _share(count: int) -> float:
        # Guard against an empty harvest: report 0% rather than dividing by 0.
        return count / total * 100 if total else 0.0

    by_state = Counter(_state_of(inst) for inst in institutions)

    ranked = sorted(
        (inst for inst in institutions if _item_count(inst) > 0),
        key=_item_count,
        reverse=True,
    )

    stats = {
        'sector': sector,
        'sector_name': SECTORS.get(sector, sector),
        'total': total,
        'with_items': sum(1 for inst in institutions if inst.get('hasItems')),
        # Counts truthy latitude values only.
        'with_coordinates': sum(1 for inst in institutions if inst.get('latitude')),
        # Plain dict so the result stays a plain JSON-friendly mapping.
        'by_state': dict(by_state),
        'top_institutions': [
            {
                'name': inst.get('name', 'Unknown'),
                'items': _item_count(inst),
                'location': inst.get('locationDisplayName', 'Unknown'),
            }
            for inst in ranked[:10]
        ],
    }

    rule = '=' * 70
    print(f"\n{rule}")
    print("Statistics:")
    print(rule)
    print(f"Sector: {stats['sector_name']} ({stats['sector']})")
    print(f"Total institutions: {stats['total']}")
    print(f"With items: {stats['with_items']} ({_share(stats['with_items']):.1f}%)")
    print(f"With coordinates: {stats['with_coordinates']} ({_share(stats['with_coordinates']):.1f}%)")
    print("\nTop 10 states:")
    # most_common() keeps first-seen order among equal counts.
    for state, count in by_state.most_common(10):
        print(f" {state}: {count}")
    print("\nTop 10 institutions by items:")
    for rank, entry in enumerate(stats['top_institutions'], 1):
        print(f" {rank}. {entry['name']} ({entry['items']:,} items)")
    print(f"{rule}\n")
    return stats
def main():
    """
    Run the archive harvest end to end.

    Steps: print a banner, abort if API_KEY is still the placeholder value,
    fetch the ``sec_01`` (archive) institutions, abort if nothing came back,
    save the flattened data, compute and print statistics, write the
    statistics to their own timestamped JSON file in OUTPUT_DIR, and print
    suggested next steps. Returns ``None`` on every path.
    """
    banner_rule = '#' * 70
    print(f"\n{banner_rule}")
    print("# DDB Institutions Harvester")
    print("# Deutsche Digitale Bibliothek /institutions API")
    print(f"{banner_rule}\n")

    # Guard: the placeholder means no real key was found in the environment.
    if API_KEY == "YOUR_API_KEY_HERE":
        for line in (
            "ERROR: Please set your DDB API key in .env file!",
            "Location: /Users/kempersc/apps/glam/data/isil/germany/.env",
            "Format: DDB_API_KEY=your-actual-api-key\n",
        ):
            print(line)
        return

    # Harvest archives (sec_01)
    print("Harvesting ARCHIVES (sec_01)...\n")
    harvested = fetch_institutions("sec_01")
    if not harvested:
        print("No institutions harvested. Exiting.")
        return

    # Persist the flattened data, then summarise the flattened records.
    output_file, flat_institutions = save_institutions(harvested, "sec_01")
    summary = generate_statistics(flat_institutions, "sec_01")

    # The statistics go to a separate file; its name hard-codes "archive".
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"ddb_institutions_archive_stats_{stamp}.json"
    with stats_file.open('w', encoding='utf-8') as handle:
        json.dump(summary, handle, ensure_ascii=False, indent=2)

    print(f"✓ Statistics saved to: {stats_file}\n")
    print("✓ Harvest complete!\n")
    for line in (
        "Next steps:",
        f" 1. Review data: {output_file}",
        " 2. Optional: Harvest other sectors (libraries, museums)",
        " 3. Cross-reference with ISIL data (german_isil_complete_*.json)",
        " 4. Create unified German heritage dataset\n",
    ):
        print(line)
# Run the harvest only when this file is executed directly as a script.
if __name__ == "__main__":
    main()