glam/scripts/scrapers/harvest_nrw_archives_complete.py
2025-11-19 23:25:22 +01:00

323 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
NRW Archives Complete Harvester
Extracts ALL 523+ archives from archive.nrw.de portal with complete metadata
This script harvests ALL archives (not just one category) and extracts:
- Archive names
- ISIL codes (from persistent links)
- City/location information
- Institution types
- Archive creation dates (when available)
Portal: https://www.archive.nrw.de/archivsuche
Operator: Landesarchiv Nordrhein-Westfalen
Data: 523+ archives across ALL archive types (Archivsparten)
Uses Playwright for JavaScript rendering.
Author: OpenCode + AI Agent
Date: 2025-11-19
Version: 2.0 (Complete Harvest)
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive.nrw.de"  # Portal root for the NRW archive portal
SEARCH_URL = f"{BASE_URL}/archivsuche"  # "Archivsuche" search page (harvest entry point)
# NOTE(review): machine-specific absolute path — consider an env-var override before reuse.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Import-time side effect: ensure the output directory exists before any harvest runs.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Map German archive-name keywords to the GLAM institution taxonomy.
# Insertion order matters: the first keyword found in a name wins.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",  # State archive
    "Stadtarchiv": "ARCHIVE",  # City archive
    "Gemeindearchiv": "ARCHIVE",  # Municipal archive
    "Kreisarchiv": "ARCHIVE",  # District archive
    "Stiftsarchiv": "ARCHIVE",  # Foundation/monastic archive
    "Kommunalarchiv": "ARCHIVE",  # Local archive
    "Stadt- und": "ARCHIVE",  # Combined city/district archive
    "Institut für": "RESEARCH_CENTER",  # Research institute
    "Archiv des": "ARCHIVE",  # Archive of (organization)
    "Archiv der": "ARCHIVE",  # Archive of (institution)
    "Historisches": "RESEARCH_CENTER",  # Historical center/archive
    "Universitätsarchiv": "EDUCATION_PROVIDER",  # University archive
    "Hochschularchiv": "EDUCATION_PROVIDER",  # University archive
    "Bistumsarchiv": "HOLY_SITES",  # Diocese archive
    "Erzbistumsarchiv": "HOLY_SITES",  # Archdiocese archive
    "Diözesanarchiv": "HOLY_SITES",  # Diocesan archive
    "Landeskirchliches": "HOLY_SITES",  # Regional church archive
    "Kirchenkreises": "HOLY_SITES",  # Church district archive
    "Unternehmensarchiv": "CORPORATION",  # Corporate archive
    "Konzernarchiv": "CORPORATION",  # Corporate group archive
    "Wirtschaftsarchiv": "CORPORATION",  # Business archive
}


def infer_institution_type(name: str) -> str:
    """Map a German archive name onto the GLAM taxonomy via keyword lookup.

    The first mapping key found as a substring of *name* wins (dict
    insertion order); names with no known keyword fall back to "ARCHIVE".
    """
    return next(
        (category for keyword, category in ARCHIVE_TYPE_MAPPING.items() if keyword in name),
        "ARCHIVE",
    )
def extract_city_from_name(name: str) -> Optional[str]:
    """Pull the city name out of a German archive name, if recognizable.

    Recognized patterns (first match wins):
    - Stadtarchiv München → München
    - Gemeindearchiv Bedburg-Hau → Bedburg-Hau
    - Kreisarchiv Viersen → Viersen
    - Archiv der Stadt Gummersbach → Gummersbach

    Trailing parenthesized qualifiers such as "(Westf.)" or "(Ruhr)" are
    stripped from the result. Returns None when no pattern matches.
    """
    city_patterns = (
        r'Stadtarchiv\s+(.+)',
        r'Gemeindearchiv\s+(.+)',
        r'Kreisarchiv\s+(.+)',
        r'Kommunalarchiv\s+(.+)',
        r'Stadt-\s+und\s+\w+\s+(.+)',  # Stadt- und Kreisarchiv X
        r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
        r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
        r'Historisches\s+Zentrum\s+(.+)',
        r'Stiftsarchiv\s+(.+)',
    )
    for pattern in city_patterns:
        found = re.search(pattern, name)
        if found is None:
            continue
        # Drop trailing qualifiers like (Westf.) or (Ruhr) before returning.
        return re.sub(r'\s+\(.*\)$', '', found.group(1).strip())
    return None
def extract_isil_from_link(link_url: str) -> Optional[str]:
    """Extract the ISIL code embedded in a persistent-link URL.

    Example: https://www.archive.nrw.de/ms/search?link=ARCHIV-DE-Due75
    → Extracts: DE-Due75

    Returns None when the URL carries no ARCHIV- link parameter.
    """
    isil_match = re.search(r'link=ARCHIV-([A-Z]{2}-[A-Za-z0-9]+)', link_url)
    return isil_match.group(1) if isil_match else None
def _extract_isil_from_detail_panel(page) -> Optional[str]:
    """Best-effort: pull the ISIL code from the currently open archive detail panel.

    Tries the "Beständigen Verweis öffnen" link first, then falls back to any
    anchor whose href contains "ARCHIV-". Returns None when neither yields a code.
    """
    # Primary method: the labelled persistent-link element.
    try:
        link_element = page.get_by_text("Beständigen Verweis öffnen").first
        persistent_link = link_element.get_attribute('href', timeout=3000)
        if persistent_link:
            isil = extract_isil_from_link(persistent_link)
            if isil:
                return isil
    except Exception:
        # Was a bare `except:` — narrowed so Ctrl-C / SystemExit still propagate.
        pass
    # Fallback: any link containing "ARCHIV-" in its href.
    try:
        all_links = page.locator('a[href*="ARCHIV-"]').all()
        if all_links:
            persistent_link = all_links[0].get_attribute('href')
            if persistent_link:
                return extract_isil_from_link(persistent_link)
    except Exception:
        pass  # No persistent link available
    return None


def harvest_archives_complete() -> List[Dict]:
    """Harvest ALL 523+ archives from archive.nrw.de using Playwright.

    Strategy:
    1. Navigate to search page
    2. Click on "Navigierende Suche" tab
    3. Do NOT select any category filter (get all archives)
    4. Extract all archive button names
    5. Click each archive to get persistent link
    6. Extract ISIL code from link

    Returns:
        List of metadata records (name, city, type, ISIL code, provenance).
        Empty list when the page structure is not recognized or errors occur.
    """
    archives: List[Dict] = []
    with sync_playwright() as p:
        print("Launching browser...")
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            print(f"Navigating to {SEARCH_URL}...")
            page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)
            # Accept cookies if present
            try:
                page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
                print(" ✓ Accepted cookies")
            except PlaywrightTimeout:
                pass  # No cookie banner
            # Click on "Navigierende Suche" tab (browse view; no filter = everything)
            try:
                page.get_by_text("Navigierende Suche").click(timeout=5000)
                time.sleep(3)  # Wait for ALL archives to load (no filter = all 523)
                print(" ✓ Switched to Navigierende Suche (all archives)")
            except PlaywrightTimeout:
                print(" ⚠ Could not find Navigierende Suche tab")
                return archives
            print("\n🔍 Extracting archive metadata...")
            print("=" * 70)
            # Keep only TOP-LEVEL archive buttons: label must contain an
            # institution keyword, and sub-collections (leading digit or '*',
            # or a " / " separator in the label) are skipped.
            institution_keywords = ('archiv', 'institut', 'zentrum', 'stiftung', 'bibliothek')
            archive_buttons_filtered = []
            for button in page.get_by_role("button").all():
                text = button.text_content()
                if not text:
                    continue
                lowered = text.lower()
                if not any(keyword in lowered for keyword in institution_keywords):
                    continue
                if text.startswith('*') or text[:1] in '0123456789' or ' / ' in text:
                    continue
                archive_buttons_filtered.append(button)
            total_archives = len(archive_buttons_filtered)
            print(f"Found {total_archives} top-level archive institutions\n")
            for idx, button in enumerate(archive_buttons_filtered, 1):
                try:
                    # text_content() may return None — guard before strip().
                    archive_name = (button.text_content() or "").strip()
                    # Click the archive button to reveal the detail panel.
                    button.click()
                    time.sleep(0.8)  # Wait for detail panel and persistent link to load
                    isil_code = _extract_isil_from_detail_panel(page)
                    city = extract_city_from_name(archive_name)
                    inst_type = infer_institution_type(archive_name)
                    record = {
                        "name": archive_name,
                        "city": city,
                        "country": "DE",
                        "region": "Nordrhein-Westfalen",
                        "institution_type": inst_type,
                        "isil_code": isil_code,
                        "url": SEARCH_URL,
                        "source": "archive.nrw.de",
                        "harvest_date": datetime.now(timezone.utc).isoformat()
                    }
                    archives.append(record)
                    # Progress indicator
                    isil_display = f"ISIL: {isil_code}" if isil_code else "ISIL: N/A"
                    city_display = f"({city})" if city else "(no city)"
                    print(f"[{idx}/{total_archives}] {archive_name} {city_display} - {isil_display}")
                except Exception as e:
                    print(f" ⚠ Error processing archive {idx}: {e}")
                    continue
        except Exception as e:
            print(f"❌ Error during harvest: {e}")
        finally:
            browser.close()
    return archives
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
    """Drop duplicate archive records, keyed on the normalized name.

    Keeps the first occurrence of each name (case-insensitive, whitespace
    trimmed) and preserves the original ordering.
    """
    unique_by_name: Dict[str, Dict] = {}
    for record in archives:
        normalized = record['name'].lower().strip()
        unique_by_name.setdefault(normalized, record)
    return list(unique_by_name.values())
def main():
    """Main harvest workflow.

    Runs the complete Playwright harvest, deduplicates results, prints
    summary statistics, and exports the records as timestamped JSON
    into OUTPUT_DIR.
    """
    print("=" * 70)
    print("NRW Archives COMPLETE Harvester")
    print("Extracting ALL 523+ archives with ISIL codes")
    print("=" * 70)
    print()
    start_time = time.time()
    # Harvest archives using Playwright
    archives = harvest_archives_complete()
    if not archives:
        print("❌ No archives found. The page structure may have changed.")
        return
    # Deduplicate
    archives = deduplicate_archives(archives)
    print()
    print("=" * 70)
    print(f"✅ Harvested {len(archives)} unique NRW archives")
    print("=" * 70)
    print()
    # Statistics — each aggregate is computed exactly once (the original
    # re-ran the city list comprehension inside the f-string below).
    total = len(archives)  # > 0 guaranteed by the early return above
    cities = set(a['city'] for a in archives if a['city'])
    with_city_count = sum(1 for a in archives if a['city'])
    isil_count = sum(1 for a in archives if a.get('isil_code'))
    types: Dict[str, int] = {}
    for archive in archives:
        inst_type = archive['institution_type']
        types[inst_type] = types.get(inst_type, 0) + 1
    print("📊 Statistics:")
    print(f" Total archives: {total}")
    print(f" Archives with ISIL codes: {isil_count} ({isil_count/total*100:.1f}%)")
    print(f" Cities covered: {len(cities)}")
    print(f" Archives with city data: {with_city_count} ({with_city_count/total*100:.1f}%)")
    print()
    print(" Institution types:")
    for inst_type, count in sorted(types.items(), key=lambda x: x[1], reverse=True):
        print(f" {inst_type}: {count}")
    # Export to JSON (UTC timestamp keeps successive runs distinct)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"nrw_archives_complete_{timestamp}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(archives, f, ensure_ascii=False, indent=2)
    print()
    print(f"📁 Output: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024:.1f} KB")
    print(f"⏱️ Time: {time.time() - start_time:.1f}s")
    print()
    # Show sample records with ISIL codes
    print("📋 Sample records (with ISIL codes):")
    samples = [a for a in archives if a.get('isil_code')][:5]
    for i, archive in enumerate(samples, 1):
        print(f"\n{i}. {archive['name']}")
        print(f" City: {archive['city'] or 'Unknown'}")
        print(f" Type: {archive['institution_type']}")
        print(f" ISIL: {archive['isil_code']}")
# Run the harvest only when executed as a script (not on import).
if __name__ == "__main__":
    main()