#!/usr/bin/env python3
|
|
"""
|
|
NRW Archives Harvester
|
|
Extracts archive institutions from archive.nrw.de portal
|
|
|
|
This script harvests all archives listed on the Nordrhein-Westfalen archive portal,
|
|
which uses a JavaScript-rendered hierarchical navigation interface (Drupal-based).
|
|
|
|
Portal: https://www.archive.nrw.de/archivsuche
|
|
Operator: Landesarchiv Nordrhein-Westfalen
|
|
Data: 523+ archives across 7 archive types (Archivsparten)
|
|
|
|
Uses Playwright for JavaScript rendering.
|
|
|
|
Author: OpenCode + AI Agent
|
|
Date: 2025-11-19
|
|
"""
|
|
|
|
import json
import os
import re
import time
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
|
|
|
|
# Configuration
BASE_URL = "https://www.archive.nrw.de"
SEARCH_URL = f"{BASE_URL}/archivsuche"

# Output directory. The original machine-specific path is kept as the
# default so existing setups are unaffected; set NRW_HARVEST_OUTPUT_DIR
# to relocate output on other machines.
OUTPUT_DIR = Path(
    os.environ.get("NRW_HARVEST_OUTPUT_DIR", "/Users/kempersc/apps/glam/data/isil/germany")
)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Map German archive-name keywords to the GLAM taxonomy. Matching is
# substring-based and performed in insertion order by
# infer_institution_type().
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",  # State archive
    "Stadtarchiv": "ARCHIVE",                # City archive
    "Gemeindearchiv": "ARCHIVE",             # Municipal archive
    "Kreisarchiv": "ARCHIVE",                # District archive
    "Stiftsarchiv": "ARCHIVE",               # Foundation archive
    "Kommunalarchiv": "ARCHIVE",             # Local archive
    "Stadt- und": "ARCHIVE",                 # Combined city/district archive
    "Institut für": "RESEARCH_CENTER",       # Research institute
    "Archiv des": "ARCHIVE",                 # Archive of (organization)
    "Historisches": "RESEARCH_CENTER"        # Historical center/archive
}
|
|
|
|
|
|
def infer_institution_type(name: str) -> str:
    """Map a German archive name to a GLAM institution type.

    The first ARCHIVE_TYPE_MAPPING keyword found as a substring of
    *name* (checked in mapping insertion order) determines the type;
    unmatched names fall back to the generic "ARCHIVE" type.
    """
    return next(
        (inst_type
         for keyword, inst_type in ARCHIVE_TYPE_MAPPING.items()
         if keyword in name),
        "ARCHIVE",  # default: generic archive
    )
|
|
|
|
|
|
def extract_city_from_name(name: str) -> Optional[str]:
    """Extract the city from a German archive name, or return None.

    Recognized forms (tried in order, first match wins):
      - Stadtarchiv München → München
      - Gemeindearchiv Bedburg-Hau → Bedburg-Hau
      - Kreisarchiv Viersen → Viersen
      - Archiv der Stadt Gummersbach → Gummersbach

    A trailing parenthesized qualifier such as "(Westf.)" or "(Ruhr)"
    is stripped from the extracted city.
    """
    city_patterns = (
        r'Stadtarchiv\s+(.+)',
        r'Gemeindearchiv\s+(.+)',
        r'Kreisarchiv\s+(.+)',
        r'Kommunalarchiv\s+(.+)',
        r'Stadt-\s+und\s+\w+\s+(.+)',  # Stadt- und Kreisarchiv X
        r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
        r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
        r'Historisches\s+Zentrum\s+(.+)',
        r'Stiftsarchiv\s+(.+)',
    )

    hit = next(
        (m for m in (re.search(p, name) for p in city_patterns) if m),
        None,
    )
    if hit is None:
        return None

    # Remove trailing qualifiers like (Westf.), (Ruhr), etc.
    return re.sub(r'\s+\(.*\)$', '', hit.group(1).strip())
|
|
|
|
|
|
def harvest_archives_with_playwright() -> List[Dict]:
    """
    Harvest archives using Playwright to render JavaScript.

    Strategy:
    1. Navigate to search page
    2. Dismiss the cookie-consent banner if present
    3. Click on "Navigierende Suche" tab
    4. Select "Kommunale Archive" (Municipal Archives) - largest category
    5. Extract all archive names from the rendered button list

    Returns:
        List of archive record dicts (name, city, country, region,
        institution_type, url, source, harvest_date). Empty if the
        harvest fails.
    """
    archives: List[Dict] = []
    # Names already collected. Set membership is O(1); scanning the
    # `archives` list for each button would make the loop quadratic.
    seen_names = set()

    with sync_playwright() as p:
        print("Launching browser...")
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        try:
            print(f"Navigating to {SEARCH_URL}...")
            page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)

            # Accept cookies if present
            try:
                page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
                print(" ✓ Accepted cookies")
            except PlaywrightTimeout:
                pass  # No cookie banner

            # Click on "Navigierende Suche" (navigating search) tab
            try:
                page.get_by_text("Navigierende Suche").click(timeout=5000)
                time.sleep(2)  # Wait for content to load
                print(" ✓ Switched to Navigierende Suche")
            except PlaywrightTimeout:
                print(" ⚠ Could not find Navigierende Suche tab")

            # Open the Archivsparte (archive category) dropdown
            try:
                page.get_by_text("Bitte Archivsparte auswählen").click(timeout=5000)
                time.sleep(1)
                print(" ✓ Opened Archivsparte dropdown")
            except PlaywrightTimeout:
                print(" ⚠ Could not open Archivsparte dropdown")

            # Select "Kommunale Archive" (largest category)
            try:
                page.get_by_text("Kommunale Archive", exact=False).click(timeout=5000)
                time.sleep(3)  # Wait for archive list to populate
                print(" ✓ Selected Kommunale Archive")
            except PlaywrightTimeout:
                print(" ⚠ Could not select Kommunale Archive")

            # Extract all archive buttons from the list.
            # Based on browser inspection, archives are in role="button"
            # elements within role="listitem".
            print("\nExtracting archive names...")
            archive_buttons = page.get_by_role("button").all()
            print(f"Found {len(archive_buttons)} buttons to process...")

            for button in archive_buttons:
                try:
                    text = button.text_content()
                    if not text:
                        continue
                    text = text.strip()

                    # Skip if not an archive name
                    if not any(keyword in text for keyword in ['archiv', 'Archiv', 'Institut', 'Zentrum']):
                        continue

                    # Skip internal collection names (start with *)
                    if text.startswith('*'):
                        continue

                    # Skip if already recorded
                    if text in seen_names:
                        continue
                    seen_names.add(text)

                    city = extract_city_from_name(text)
                    inst_type = infer_institution_type(text)

                    record = {
                        "name": text,
                        "city": city,
                        "country": "DE",
                        "region": "Nordrhein-Westfalen",
                        "institution_type": inst_type,
                        "url": SEARCH_URL,
                        "source": "archive.nrw.de",
                        "harvest_date": datetime.now(timezone.utc).isoformat()
                    }

                    archives.append(record)
                    print(f" ✓ {text} ({city or 'unknown city'})")

                except Exception:
                    continue  # Skip problematic buttons (e.g. detached nodes)

        except Exception as e:
            print(f"❌ Error during harvest: {e}")

        finally:
            browser.close()

    return archives
|
|
|
|
|
|
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
    """Drop later duplicates, comparing archive names case-insensitively.

    The first record seen for each normalized (lowercased, stripped)
    name is kept; the relative order of survivors is preserved.
    """
    # Insertion-ordered dict keyed by normalized name: setdefault keeps
    # the first record and ignores later duplicates.
    kept: Dict[str, Dict] = {}
    for record in archives:
        kept.setdefault(record['name'].lower().strip(), record)
    return list(kept.values())
|
|
|
|
|
|
def main():
    """Main harvest workflow: scrape, deduplicate, report stats, export JSON."""
    print("=" * 70)
    print("NRW Archives Harvester (Playwright Edition)")
    print("=" * 70)
    print()

    start_time = time.time()

    # Harvest archives using Playwright
    archives = harvest_archives_with_playwright()

    if not archives:
        print("❌ No archives found. The page structure may have changed.")
        return

    # Deduplicate (case-insensitive on name)
    archives = deduplicate_archives(archives)

    print()
    print(f"✅ Harvested {len(archives)} unique NRW archives")
    print()

    # Statistics: distinct cities and a per-type tally.
    cities = {a['city'] for a in archives if a['city']}
    types = Counter(a['institution_type'] for a in archives)

    print("Statistics:")
    print(f" Cities covered: {len(cities)}")
    print(" Institution types:")
    # most_common() sorts descending by count (stable for ties), matching
    # a manual reverse sort on the tally values.
    for inst_type, count in types.most_common():
        print(f" {inst_type}: {count}")

    # Export to JSON with a UTC timestamp in the filename.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"nrw_archives_{timestamp}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(archives, f, ensure_ascii=False, indent=2)

    print()
    print(f"📁 Output: {output_file}")
    print(f"⏱️ Time: {time.time() - start_time:.1f}s")
    print()

    # Show sample records
    print("Sample records:")
    for i, archive in enumerate(archives[:5], 1):
        print(f"\n{i}. {archive['name']}")
        print(f" City: {archive['city'] or 'Unknown'}")
        print(f" Type: {archive['institution_type']}")
|
|
|
|
|
|
# Script entry point: run the harvest workflow when executed directly.
if __name__ == "__main__":
    main()
|