#!/usr/bin/env python3
"""
DDB Institutions Harvester
Fetches German heritage institutions from Deutsche Digitale Bibliothek API

This script harvests archive institutions from the DDB /institutions endpoint,
which provides structured hierarchical data with geocoding.

API: https://api.deutsche-digitale-bibliothek.de/institutions
Operator: Deutsche Digitale Bibliothek (DDB)

Author: OpenCode + MCP Tools
Date: 2025-11-19
"""

import json
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Third-party imports are guarded so that the pure helpers in this module
# (flatten_hierarchy, generate_statistics) stay importable without them.
try:
    import requests
except ImportError:  # reported with a clear message in fetch_institutions()
    requests = None

try:
    from dotenv import load_dotenv
except ImportError:  # optional: DDB_API_KEY may come from the real environment
    load_dotenv = None

# Load environment variables
env_path = Path("/Users/kempersc/apps/glam/data/isil/germany/.env")
if load_dotenv is not None:
    load_dotenv(env_path)

# Configuration
API_BASE_URL = "https://api.deutsche-digitale-bibliothek.de"
API_KEY = os.getenv("DDB_API_KEY", "YOUR_API_KEY_HERE")
# NOTE: the directory is created lazily, right before files are written,
# so that merely importing this module does not touch the filesystem.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")

# Sector codes from OpenAPI spec
SECTORS = {
    "sec_01": "Archive",
    "sec_02": "Library",
    "sec_03": "Monument protection",
    "sec_04": "Research",
    "sec_05": "Media",
    "sec_06": "Museum",
    "sec_07": "Other",
}


def fetch_institutions(sector: str = "sec_01") -> List[Dict]:
    """
    Fetch institutions from DDB API.

    Sends a GET request to ``{API_BASE_URL}/institutions`` with the sector
    code and the API key passed as query parameters.

    Args:
        sector: Sector code (sec_01 for archives, etc.)

    Returns:
        The decoded JSON body of the response (expected to be a list of
        institution dictionaries).

    Raises:
        RuntimeError: If the ``requests`` package is not installed.
        requests.HTTPError: If the API answers with an error status.
    """
    if requests is None:
        raise RuntimeError(
            "The 'requests' package is required to query the DDB API "
            "(pip install requests)."
        )

    url = f"{API_BASE_URL}/institutions"
    params = {
        "sector": sector,
        "oauth_consumer_key": API_KEY,
    }

    print(f"Fetching {SECTORS.get(sector, sector)} institutions...")
    response = requests.get(url, params=params, timeout=60)
    response.raise_for_status()

    institutions = response.json()
    print(f"  ✓ Received {len(institutions)} institutions")
    return institutions


def flatten_hierarchy(institutions: List[Dict]) -> List[Dict]:
    """
    Flatten hierarchical institution structure.

    Each institution is emitted before its children (depth-first, pre-order).
    The returned dictionaries are shallow copies without the ``children`` key;
    every non-root entry additionally carries a ``parent_id`` key holding the
    ``id`` of its parent. The input structure is left untouched.

    Args:
        institutions: List of institutions (may have nested children)

    Returns:
        Flattened list with all institutions and children
    """
    flat_list: List[Dict] = []

    def process_institution(inst: Dict, parent_id: Optional[str] = None) -> None:
        # Copy instead of pop()/assign on the original so the caller's data
        # is not modified as a side effect of flattening.
        record = {key: value for key, value in inst.items() if key != 'children'}
        if parent_id is not None:
            record['parent_id'] = parent_id
        flat_list.append(record)

        # 'children' may be absent or explicitly null; both mean "no children".
        for child in inst.get('children') or []:
            process_institution(child, inst.get('id'))

    for inst in institutions:
        process_institution(inst)

    return flat_list


def save_institutions(institutions: List[Dict], sector: str) -> Tuple[Path, List[Dict]]:
    """
    Save institutions to JSON file.

    The hierarchy is flattened first, then written together with a metadata
    header to a timestamped file inside OUTPUT_DIR (created if missing).

    Args:
        institutions: Hierarchical institution list as returned by the API.
        sector: Sector code used for the file name and the metadata.

    Returns:
        Tuple of (path of the written file, flattened institution list).
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    sector_name = SECTORS.get(sector, sector).lower().replace(" ", "_")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_file = OUTPUT_DIR / f"ddb_institutions_{sector_name}_{timestamp}.json"

    # Flatten hierarchy
    flat_institutions = flatten_hierarchy(institutions)

    output = {
        'metadata': {
            'source': 'Deutsche Digitale Bibliothek (DDB)',
            'source_url': 'https://www.deutsche-digitale-bibliothek.de',
            'api_endpoint': f'{API_BASE_URL}/institutions',
            'operator': 'Deutsche Digitale Bibliothek',
            'harvest_date': datetime.now(timezone.utc).isoformat(),
            'sector': sector,
            'sector_name': SECTORS.get(sector, sector),
            'total_institutions': len(flat_institutions),
            'method': 'REST API /institutions',
            'license': 'CC0 1.0 Universal (Public Domain)',
        },
        'institutions': flat_institutions,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"✓ Saved to: {output_file}")
    print(f"  File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
    print(f"  Total institutions (flat): {len(flat_institutions)}\n")

    return output_file, flat_institutions


def _percentage(part: int, total: int) -> float:
    """Return part/total as a percentage, or 0.0 when total is zero."""
    return part / total * 100 if total else 0.0


def generate_statistics(institutions: List[Dict], sector: str) -> Dict:
    """
    Generate statistics.

    Counts institutions with items and with a latitude, groups them by the
    second-to-last comma-separated component of ``locationDisplayName``
    ('Unknown' when there are fewer than two components), and lists the ten
    institutions with the highest ``numberOfItems``. A summary is printed.

    Missing or null ``locationDisplayName`` / ``numberOfItems`` values and an
    empty institution list are tolerated.

    Args:
        institutions: Flat list of institution dictionaries.
        sector: Sector code the list belongs to.

    Returns:
        Dictionary with the keys 'sector', 'sector_name', 'total',
        'with_items', 'with_coordinates', 'by_state', 'top_institutions'.
    """
    by_state: Dict[str, int] = {}
    stats = {
        'sector': sector,
        'sector_name': SECTORS.get(sector, sector),
        'total': len(institutions),
        'with_items': sum(1 for i in institutions if i.get('hasItems')),
        'with_coordinates': sum(1 for i in institutions if i.get('latitude')),
        'by_state': by_state,
        'top_institutions': [],
    }

    # Extract state from location
    for inst in institutions:
        # `or ''` also covers an explicit null, which .get()'s default does not.
        loc = inst.get('locationDisplayName') or ''
        # State is typically second-to-last in comma-separated location
        parts = [p.strip() for p in loc.split(',')]
        state = parts[-2] if len(parts) >= 2 else 'Unknown'
        by_state[state] = by_state.get(state, 0) + 1

    def item_count(inst: Dict) -> int:
        # Treat a missing or null item count as zero.
        return inst.get('numberOfItems') or 0

    # Top 10 by item count
    sorted_insts = sorted(
        (i for i in institutions if item_count(i) > 0),
        key=item_count,
        reverse=True,
    )
    stats['top_institutions'] = [
        {
            'name': i.get('name', 'Unknown'),
            'items': i['numberOfItems'],
            'location': i.get('locationDisplayName', 'Unknown'),
        }
        for i in sorted_insts[:10]
    ]

    total = stats['total']
    print(f"\n{'='*70}")
    print("Statistics:")
    print(f"{'='*70}")
    print(f"Sector: {stats['sector_name']} ({stats['sector']})")
    print(f"Total institutions: {total}")
    print(f"With items: {stats['with_items']} ({_percentage(stats['with_items'], total):.1f}%)")
    print(f"With coordinates: {stats['with_coordinates']} ({_percentage(stats['with_coordinates'], total):.1f}%)")
    print("\nTop 10 states:")
    for state, count in sorted(by_state.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {state}: {count}")
    print("\nTop 10 institutions by items:")
    for rank, inst in enumerate(stats['top_institutions'], 1):
        print(f"  {rank}. {inst['name']} ({inst['items']:,} items)")
    print(f"{'='*70}\n")

    return stats


def main() -> None:
    """
    Main execution.

    Checks that an API key is configured, harvests the archive sector
    (sec_01), writes the flattened institutions and their statistics to
    OUTPUT_DIR and prints suggested next steps.
    """
    print(f"\n{'#'*70}")
    print("# DDB Institutions Harvester")
    print("# Deutsche Digitale Bibliothek /institutions API")
    print(f"{'#'*70}\n")

    if API_KEY == "YOUR_API_KEY_HERE":
        print("ERROR: Please set your DDB API key in .env file!")
        print("Location: /Users/kempersc/apps/glam/data/isil/germany/.env")
        print("Format: DDB_API_KEY=your-actual-api-key\n")
        return

    # Harvest archives (sec_01)
    print("Harvesting ARCHIVES (sec_01)...\n")
    institutions = fetch_institutions("sec_01")

    if not institutions:
        print("No institutions harvested. Exiting.")
        return

    # Save
    output_file, flat_institutions = save_institutions(institutions, "sec_01")

    # Statistics
    stats = generate_statistics(flat_institutions, "sec_01")

    # Save stats
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"ddb_institutions_archive_stats_{timestamp}.json"
    stats_file.parent.mkdir(parents=True, exist_ok=True)
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Statistics saved to: {stats_file}\n")

    print("✓ Harvest complete!\n")
    print("Next steps:")
    print(f"  1. Review data: {output_file}")
    print("  2. Optional: Harvest other sectors (libraries, museums)")
    print("  3. Cross-reference with ISIL data (german_isil_complete_*.json)")
    print("  4. Create unified German heritage dataset\n")


if __name__ == "__main__":
    main()