240 lines
7.6 KiB
Python
240 lines
7.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
DDB Institutions Harvester
|
|
Fetches German heritage institutions from Deutsche Digitale Bibliothek API
|
|
|
|
This script harvests archive institutions from the DDB /institutions endpoint,
|
|
which provides structured hierarchical data with geocoding.
|
|
|
|
API: https://api.deutsche-digitale-bibliothek.de/institutions
|
|
Operator: Deutsche Digitale Bibliothek (DDB)
|
|
|
|
Author: OpenCode + MCP Tools
|
|
Date: 2025-11-19
|
|
"""
|
|
|
|
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import requests
from dotenv import load_dotenv
|
|
|
|
# Load environment variables (DDB_API_KEY) from the project's .env file.
# NOTE(review): hard-coded absolute path — assumes this exact machine layout.
env_path = Path("/Users/kempersc/apps/glam/data/isil/germany/.env")
load_dotenv(env_path)

# Configuration
API_BASE_URL = "https://api.deutsche-digitale-bibliothek.de"
# Falls back to a placeholder value; main() checks for it and aborts
# with instructions rather than issuing unauthenticated requests.
API_KEY = os.getenv("DDB_API_KEY", "YOUR_API_KEY_HERE")
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")

# Ensure the output directory exists before any harvest writes to it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Sector codes from OpenAPI spec, mapped to human-readable names.
# Used both for request parameters and for output file naming.
SECTORS = {
    "sec_01": "Archive",
    "sec_02": "Library",
    "sec_03": "Monument protection",
    "sec_04": "Research",
    "sec_05": "Media",
    "sec_06": "Museum",
    "sec_07": "Other"
}
|
|
|
|
|
|
def fetch_institutions(sector: str = "sec_01") -> List[Dict]:
    """
    Fetch institutions from DDB API.

    Args:
        sector: Sector code (sec_01 for archives, etc.)

    Returns:
        List of institution dictionaries

    Raises:
        requests.HTTPError: if the API responds with a non-2xx status.
    """
    endpoint = f"{API_BASE_URL}/institutions"
    query = {
        "sector": sector,
        "oauth_consumer_key": API_KEY,
    }

    print(f"Fetching {SECTORS.get(sector, sector)} institutions...")

    # Single request; the endpoint returns the whole sector in one payload.
    resp = requests.get(endpoint, params=query, timeout=60)
    resp.raise_for_status()
    data = resp.json()

    print(f" ✓ Received {len(data)} institutions")
    return data
|
|
|
|
|
|
def flatten_hierarchy(institutions: List[Dict]) -> List[Dict]:
    """
    Flatten hierarchical institution structure.

    Every nested 'children' entry is lifted into the top-level list, and
    each non-root institution gets a 'parent_id' key pointing at its
    parent's 'id' so the hierarchy can be reconstructed later.

    Note: the input dictionaries are modified in place ('children' keys
    are removed, 'parent_id' keys may be added); the returned list
    contains those same dict objects.

    Args:
        institutions: List of institutions (may have nested children)

    Returns:
        Flattened list with all institutions and children
    """
    flat_list: List[Dict] = []

    def process_institution(inst: Dict, parent_id: Optional[str] = None) -> None:
        # Explicit None check (not truthiness) so an empty-string parent
        # id would still be recorded rather than silently dropped.
        if parent_id is not None:
            inst['parent_id'] = parent_id

        # Detach children before appending so the flat list carries no
        # duplicated nested subtrees.
        children = inst.pop('children', [])
        flat_list.append(inst)

        # Depth-first descent; each child is linked to this node's id.
        for child in children:
            process_institution(child, inst['id'])

    for inst in institutions:
        process_institution(inst)

    return flat_list
|
|
|
|
|
|
def save_institutions(institutions: List[Dict], sector: str):
    """Save institutions to JSON file.

    Flattens the hierarchy, wraps it with harvest metadata, and writes a
    timestamped JSON file into OUTPUT_DIR.

    Returns:
        Tuple of (output file path, flattened institution list).
    """
    sector_name = SECTORS.get(sector, sector).lower().replace(" ", "_")
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"ddb_institutions_{sector_name}_{timestamp}.json"

    # Flatten hierarchy before persisting.
    flat_institutions = flatten_hierarchy(institutions)

    metadata = {
        'source': 'Deutsche Digitale Bibliothek (DDB)',
        'source_url': 'https://www.deutsche-digitale-bibliothek.de',
        'api_endpoint': f'{API_BASE_URL}/institutions',
        'operator': 'Deutsche Digitale Bibliothek',
        'harvest_date': datetime.now(timezone.utc).isoformat(),
        'sector': sector,
        'sector_name': SECTORS.get(sector, sector),
        'total_institutions': len(flat_institutions),
        'method': 'REST API /institutions',
        'license': 'CC0 1.0 Universal (Public Domain)',
    }
    output = {'metadata': metadata, 'institutions': flat_institutions}

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    size_mb = output_file.stat().st_size / 1024 / 1024
    print(f"✓ Saved to: {output_file}")
    print(f" File size: {size_mb:.2f} MB")
    print(f" Total institutions (flat): {len(flat_institutions)}\n")

    return output_file, flat_institutions
|
|
|
|
|
|
def generate_statistics(institutions: List[Dict], sector: str):
    """Generate and print summary statistics for a harvested sector.

    Args:
        institutions: Flattened list of institution dicts.
        sector: Sector code (e.g. "sec_01").

    Returns:
        Stats dict with totals, per-state counts and the top-10
        institutions by item count.
    """
    total = len(institutions)
    stats = {
        'sector': sector,
        'sector_name': SECTORS.get(sector, sector),
        'total': total,
        'with_items': sum(1 for i in institutions if i.get('hasItems')),
        'with_coordinates': sum(1 for i in institutions if i.get('latitude')),
        'by_state': {},
        'top_institutions': []
    }

    # Extract state from location.
    # State is typically second-to-last in the comma-separated display name.
    for inst in institutions:
        loc = inst.get('locationDisplayName', '')
        parts = [p.strip() for p in loc.split(',')]
        state = parts[-2] if len(parts) >= 2 else 'Unknown'
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1

    # Top 10 by item count; institutions with zero items are excluded.
    sorted_insts = sorted(
        [i for i in institutions if i.get('numberOfItems', 0) > 0],
        key=lambda x: x.get('numberOfItems', 0),
        reverse=True
    )
    stats['top_institutions'] = [
        {
            'name': i['name'],
            'items': i['numberOfItems'],
            'location': i.get('locationDisplayName', 'Unknown')
        }
        for i in sorted_insts[:10]
    ]

    # Guard against ZeroDivisionError when the institution list is empty.
    pct_items = stats['with_items'] / total * 100 if total else 0.0
    pct_coords = stats['with_coordinates'] / total * 100 if total else 0.0

    print(f"\n{'='*70}")
    print("Statistics:")
    print(f"{'='*70}")
    print(f"Sector: {stats['sector_name']} ({stats['sector']})")
    print(f"Total institutions: {stats['total']}")
    print(f"With items: {stats['with_items']} ({pct_items:.1f}%)")
    print(f"With coordinates: {stats['with_coordinates']} ({pct_coords:.1f}%)")

    print(f"\nTop 10 states:")
    for state, count in sorted(stats['by_state'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f" {state}: {count}")

    print(f"\nTop 10 institutions by items:")
    for rank, inst in enumerate(stats['top_institutions'], 1):
        print(f" {rank}. {inst['name']} ({inst['items']:,} items)")

    print(f"{'='*70}\n")

    return stats
|
|
|
|
|
|
def main():
    """Main execution: harvest sec_01 archives, save them, print stats."""
    banner = '#' * 70
    print(f"\n{banner}")
    print("# DDB Institutions Harvester")
    print("# Deutsche Digitale Bibliothek /institutions API")
    print(f"{banner}\n")

    # Refuse to run while the placeholder API key is still configured.
    if API_KEY == "YOUR_API_KEY_HERE":
        print("ERROR: Please set your DDB API key in .env file!")
        print("Location: /Users/kempersc/apps/glam/data/isil/germany/.env")
        print("Format: DDB_API_KEY=your-actual-api-key\n")
        return

    # Harvest archives (sec_01).
    print("Harvesting ARCHIVES (sec_01)...\n")
    institutions = fetch_institutions("sec_01")
    if not institutions:
        print("No institutions harvested. Exiting.")
        return

    # Persist the harvest and derive summary statistics from it.
    output_file, flat_institutions = save_institutions(institutions, "sec_01")
    stats = generate_statistics(flat_institutions, "sec_01")

    # Save the statistics alongside the data, with their own timestamp.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"ddb_institutions_archive_stats_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Statistics saved to: {stats_file}\n")

    print("✓ Harvest complete!\n")
    print("Next steps:")
    print(f" 1. Review data: {output_file}")
    print(" 2. Optional: Harvest other sectors (libraries, museums)")
    print(" 3. Cross-reference with ISIL data (german_isil_complete_*.json)")
    print(" 4. Create unified German heritage dataset\n")
|
|
|
|
|
|
# Script entry point: run the harvest only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|