glam/scripts/scrapers/harvest_nrw_archives.py
2025-11-19 23:25:22 +01:00

271 lines
9.2 KiB
Python

#!/usr/bin/env python3
"""
NRW Archives Harvester
Extracts archive institutions from archive.nrw.de portal
This script harvests all archives listed on the Nordrhein-Westfalen archive portal,
which uses a JavaScript-rendered hierarchical navigation interface (Drupal-based).
Portal: https://www.archive.nrw.de/archivsuche
Operator: Landesarchiv Nordrhein-Westfalen
Data: 523+ archives across 7 archive types (Archivsparten)
Uses Playwright for JavaScript rendering.
Author: OpenCode + AI Agent
Date: 2025-11-19
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive.nrw.de"  # portal root operated by Landesarchiv NRW
SEARCH_URL = f"{BASE_URL}/archivsuche"  # JS-rendered search page harvested below
# NOTE(review): hard-coded absolute user path — consider an env var or CLI flag
# so the script is runnable outside this one machine.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Created eagerly at import time (module-level side effect) so the JSON export
# in main() cannot fail on a missing directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Map German archive types to the GLAM taxonomy.  Keys are substrings tested
# against the institution name in declaration order; the first hit wins.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",  # State archive
    "Stadtarchiv": "ARCHIVE",                # City archive
    "Gemeindearchiv": "ARCHIVE",             # Municipal archive
    "Kreisarchiv": "ARCHIVE",                # District archive
    "Stiftsarchiv": "ARCHIVE",               # Foundation archive
    "Kommunalarchiv": "ARCHIVE",             # Local archive
    "Stadt- und": "ARCHIVE",                 # Combined city/district archive
    "Institut für": "RESEARCH_CENTER",       # Research institute
    "Archiv des": "ARCHIVE",                 # Archive of (organization)
    "Historisches": "RESEARCH_CENTER",       # Historical center/archive
}


def infer_institution_type(name: str) -> str:
    """Return the GLAM institution type inferred from a German archive name.

    The first mapping keyword found as a substring of *name* decides the
    type; names matching no keyword fall back to the generic "ARCHIVE".
    """
    hits = (kind for keyword, kind in ARCHIVE_TYPE_MAPPING.items() if keyword in name)
    return next(hits, "ARCHIVE")
# Ordered, precompiled patterns for pulling a city out of a German archive
# name; compiled once at import so the per-name loop does no regex parsing.
# Order matters: the first pattern that matches anywhere in the name wins.
_CITY_PATTERNS = [re.compile(p) for p in (
    r'Stadtarchiv\s+(.+)',
    r'Gemeindearchiv\s+(.+)',
    r'Kreisarchiv\s+(.+)',
    r'Kommunalarchiv\s+(.+)',
    r'Stadt-\s+und\s+\w+\s+(.+)',  # Stadt- und Kreisarchiv X
    r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
    r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
    r'Historisches\s+Zentrum\s+(.+)',
    r'Stiftsarchiv\s+(.+)',
)]

# Strips one trailing parenthesised qualifier, e.g. "(Westf.)" or "(Ruhr)".
_TRAILING_QUALIFIER = re.compile(r'\s+\(.*\)$')


def extract_city_from_name(name: str) -> Optional[str]:
    """
    Extract the city name from a German archive name, or None if no
    known naming pattern applies.

    Examples:
    - Stadtarchiv München → München
    - Gemeindearchiv Bedburg-Hau → Bedburg-Hau
    - Kreisarchiv Viersen → Viersen
    - Archiv der Stadt Gummersbach → Gummersbach
    """
    for pattern in _CITY_PATTERNS:
        hit = pattern.search(name)
        if hit is None:
            continue
        # Drop trailing qualifiers such as "(Westf.)" before returning.
        return _TRAILING_QUALIFIER.sub('', hit.group(1).strip())
    return None
def harvest_archives_with_playwright() -> List[Dict]:
    """
    Harvest archives using Playwright to render JavaScript.

    Strategy:
    1. Navigate to the search page
    2. Click on the "Navigierende Suche" tab
    3. Select "Kommunale Archive" (Municipal Archives) - largest category
    4. Extract all archive names from the rendered button list

    Returns:
        List of archive record dicts with keys: name, city, country, region,
        institution_type, url, source, harvest_date.

    Note:
        Each navigation step is best-effort: a missing UI element is logged
        and skipped rather than aborting the harvest, since the Drupal
        front-end layout changes occasionally.
    """
    archives: List[Dict] = []
    # O(1) duplicate detection; replaces the original O(n) scan over
    # `archives` for every button, which made the loop quadratic.
    seen_names: set = set()

    with sync_playwright() as p:
        print("Launching browser...")
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            print(f"Navigating to {SEARCH_URL}...")
            page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)

            # Accept cookies if present
            try:
                page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
                print(" ✓ Accepted cookies")
            except PlaywrightTimeout:
                pass  # No cookie banner

            # Click on "Navigierende Suche" tab
            try:
                page.get_by_text("Navigierende Suche").click(timeout=5000)
                time.sleep(2)  # Wait for content to load
                print(" ✓ Switched to Navigierende Suche")
            except PlaywrightTimeout:
                print(" ⚠ Could not find Navigierende Suche tab")

            # Click on Archivsparte dropdown
            try:
                page.get_by_text("Bitte Archivsparte auswählen").click(timeout=5000)
                time.sleep(1)
                print(" ✓ Opened Archivsparte dropdown")
            except PlaywrightTimeout:
                print(" ⚠ Could not open Archivsparte dropdown")

            # Select "Kommunale Archive" (largest category)
            try:
                page.get_by_text("Kommunale Archive", exact=False).click(timeout=5000)
                time.sleep(3)  # Wait for archive list to populate
                print(" ✓ Selected Kommunale Archive")
            except PlaywrightTimeout:
                print(" ⚠ Could not select Kommunale Archive")

            # Extract all archive buttons from the list.
            # Based on browser inspection, archives are rendered as
            # role="button" elements within role="listitem".
            print("\nExtracting archive names...")
            archive_buttons = page.get_by_role("button").all()
            print(f"Found {len(archive_buttons)} buttons to process...")

            for button in archive_buttons:
                try:
                    text = button.text_content()
                    if not text:
                        continue
                    text = text.strip()
                    # Skip buttons that are clearly not archive names
                    if not any(keyword in text for keyword in ['archiv', 'Archiv', 'Institut', 'Zentrum']):
                        continue
                    # Skip internal collection names (start with *)
                    if text.startswith('*'):
                        continue
                    # Skip if already collected
                    if text in seen_names:
                        continue
                    seen_names.add(text)

                    city = extract_city_from_name(text)
                    inst_type = infer_institution_type(text)
                    record = {
                        "name": text,
                        "city": city,
                        "country": "DE",
                        "region": "Nordrhein-Westfalen",
                        "institution_type": inst_type,
                        "url": SEARCH_URL,
                        "source": "archive.nrw.de",
                        "harvest_date": datetime.now(timezone.utc).isoformat()
                    }
                    archives.append(record)
                    print(f"{text} ({city or 'unknown city'})")
                except Exception:
                    continue  # Skip problematic buttons (e.g. detached nodes)
        except Exception as e:
            print(f"❌ Error during harvest: {e}")
        finally:
            # Always release the browser, even on hard failure.
            browser.close()
    return archives
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
    """Drop later entries whose name (trimmed, case-insensitive) repeats.

    The first occurrence of each name is kept; input order is preserved.
    """
    unique: List[Dict] = []
    seen_keys: set = set()
    for record in archives:
        normalized = record['name'].strip().lower()
        if normalized in seen_keys:
            continue
        seen_keys.add(normalized)
        unique.append(record)
    return unique
def main():
    """Main harvest workflow: scrape, dedupe, report stats, export JSON."""
    banner = "=" * 70
    print(banner)
    print("NRW Archives Harvester (Playwright Edition)")
    print(banner)
    print()

    start_time = time.time()

    # Harvest archives using Playwright
    archives = harvest_archives_with_playwright()
    if not archives:
        print("❌ No archives found. The page structure may have changed.")
        return

    # Deduplicate
    archives = deduplicate_archives(archives)
    print()
    print(f"✅ Harvested {len(archives)} unique NRW archives")
    print()

    # Aggregate simple statistics
    cities = {record['city'] for record in archives if record['city']}
    type_counts: Dict[str, int] = {}
    for record in archives:
        kind = record['institution_type']
        type_counts[kind] = type_counts.get(kind, 0) + 1

    print("Statistics:")
    print(f" Cities covered: {len(cities)}")
    print(f" Institution types:")
    # Most frequent type first; ties keep insertion order (sort is stable).
    for inst_type, count in sorted(type_counts.items(), key=lambda item: item[1], reverse=True):
        print(f" {inst_type}: {count}")

    # Export to JSON, timestamped so repeated runs never clobber each other
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"nrw_archives_{stamp}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(archives, f, ensure_ascii=False, indent=2)

    print()
    print(f"📁 Output: {output_file}")
    print(f"⏱️ Time: {time.time() - start_time:.1f}s")
    print()

    # Show sample records
    print("Sample records:")
    for i, archive in enumerate(archives[:5], 1):
        print(f"\n{i}. {archive['name']}")
        print(f" City: {archive['city'] or 'Unknown'}")
        print(f" Type: {archive['institution_type']}")


if __name__ == "__main__":
    main()