323 lines
12 KiB
Python
Executable file
323 lines
12 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
NRW Archives Complete Harvester
|
|
Extracts ALL 523+ archives from archive.nrw.de portal with complete metadata
|
|
|
|
This script harvests ALL archives (not just one category) and extracts:
|
|
- Archive names
|
|
- ISIL codes (from persistent links)
|
|
- City/location information
|
|
- Institution types
|
|
- Archive creation dates (when available)
|
|
|
|
Portal: https://www.archive.nrw.de/archivsuche
|
|
Operator: Landesarchiv Nordrhein-Westfalen
|
|
Data: 523+ archives across ALL archive types (Archivsparten)
|
|
|
|
Uses Playwright for JavaScript rendering.
|
|
|
|
Author: OpenCode + AI Agent
|
|
Date: 2025-11-19
|
|
Version: 2.0 (Complete Harvest)
|
|
"""
|
|
|
|
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Optional
|
|
import re
|
|
|
|
# Configuration
BASE_URL = "https://www.archive.nrw.de"
SEARCH_URL = f"{BASE_URL}/archivsuche"
# NOTE(review): absolute, user-specific path — assumes this exact machine; confirm before reuse.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Import-time side effect: ensures the output directory exists before main() runs.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Map German archive types to GLAM taxonomy
# Keys are German name fragments tested as substrings against each archive's
# display name (first match wins — see infer_institution_type below).
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",  # State archive
    "Stadtarchiv": "ARCHIVE",  # City archive
    "Gemeindearchiv": "ARCHIVE",  # Municipal archive
    "Kreisarchiv": "ARCHIVE",  # District archive
    "Stiftsarchiv": "ARCHIVE",  # Foundation/monastic archive
    "Kommunalarchiv": "ARCHIVE",  # Local archive
    "Stadt- und": "ARCHIVE",  # Combined city/district archive
    "Institut für": "RESEARCH_CENTER",  # Research institute
    "Archiv des": "ARCHIVE",  # Archive of (organization)
    "Archiv der": "ARCHIVE",  # Archive of (institution)
    "Historisches": "RESEARCH_CENTER",  # Historical center/archive
    "Universitätsarchiv": "EDUCATION_PROVIDER",  # University archive
    "Hochschularchiv": "EDUCATION_PROVIDER",  # University archive
    "Bistumsarchiv": "HOLY_SITES",  # Diocese archive
    "Erzbistumsarchiv": "HOLY_SITES",  # Archdiocese archive
    "Diözesanarchiv": "HOLY_SITES",  # Diocesan archive
    "Landeskirchliches": "HOLY_SITES",  # Regional church archive
    "Kirchenkreises": "HOLY_SITES",  # Church district archive
    "Unternehmensarchiv": "CORPORATION",  # Corporate archive
    "Konzernarchiv": "CORPORATION",  # Corporate group archive
    "Wirtschaftsarchiv": "CORPORATION",  # Business archive
}
|
|
|
|
|
|
def infer_institution_type(name: str) -> str:
    """Map a German archive name onto the GLAM institution taxonomy.

    The first ARCHIVE_TYPE_MAPPING keyword found as a substring of *name*
    decides the type; names with no matching keyword fall back to the
    generic "ARCHIVE".
    """
    return next(
        (inst_type
         for keyword, inst_type in ARCHIVE_TYPE_MAPPING.items()
         if keyword in name),
        "ARCHIVE",  # default for unrecognized names
    )
|
|
|
|
|
|
def extract_city_from_name(name: str) -> Optional[str]:
    """Extract the city name from a German archive name, or None.

    Recognized patterns (first match wins):
      - Stadtarchiv München → München
      - Gemeindearchiv Bedburg-Hau → Bedburg-Hau
      - Kreisarchiv Viersen → Viersen
      - Archiv der Stadt Gummersbach → Gummersbach

    Trailing parenthetical qualifiers such as "(Westf.)" or "(Ruhr)" are
    stripped from the extracted city.
    """
    city_patterns = (
        r'Stadtarchiv\s+(.+)',
        r'Gemeindearchiv\s+(.+)',
        r'Kreisarchiv\s+(.+)',
        r'Kommunalarchiv\s+(.+)',
        r'Stadt-\s+und\s+\w+\s+(.+)',  # Stadt- und Kreisarchiv X
        r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
        r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
        r'Historisches\s+Zentrum\s+(.+)',
        r'Stiftsarchiv\s+(.+)',
    )

    for pattern in city_patterns:
        hit = re.search(pattern, name)
        if hit is None:
            continue
        # Drop trailing qualifiers like "(Westf.)" before returning.
        return re.sub(r'\s+\(.*\)$', '', hit.group(1).strip())

    return None
|
|
|
|
|
|
def extract_isil_from_link(link_url: str) -> Optional[str]:
    """Pull the ISIL code out of a persistent archive link, or None.

    Example:
        https://www.archive.nrw.de/ms/search?link=ARCHIV-DE-Due75
        → DE-Due75
    """
    # ISIL shape: two-letter country prefix, hyphen, alphanumeric local part.
    found = re.search(r'link=ARCHIV-([A-Z]{2}-[A-Za-z0-9]+)', link_url)
    return found.group(1) if found is not None else None
|
|
|
|
|
|
def _lookup_isil(page) -> Optional[str]:
    """Find the persistent link on the currently open detail panel and
    return its ISIL code, or None when no link is available.

    Two strategies are tried in order: the labelled "Beständigen Verweis
    öffnen" link text, then any anchor whose href contains "ARCHIV-".
    """
    try:
        # Primary method: the labelled persistent-link element.
        link_element = page.get_by_text("Beständigen Verweis öffnen").first
        persistent_link = link_element.get_attribute('href', timeout=3000)
        if persistent_link:
            return extract_isil_from_link(persistent_link)
    except Exception:  # was a bare except: — narrowed so KeyboardInterrupt/SystemExit propagate
        try:
            # Alternative: look for any link containing "ARCHIV-"
            all_links = page.locator('a[href*="ARCHIV-"]').all()
            if all_links:
                persistent_link = all_links[0].get_attribute('href')
                if persistent_link:
                    return extract_isil_from_link(persistent_link)
        except Exception:  # was a bare except:
            pass  # No persistent link available
    return None


def harvest_archives_complete() -> List[Dict]:
    """
    Harvest ALL 523+ archives from archive.nrw.de using Playwright.

    Strategy:
    1. Navigate to search page
    2. Click on "Navigierende Suche" tab
    3. Do NOT select any category filter (get all archives)
    4. Extract all archive button names
    5. Click each archive to get persistent link
    6. Extract ISIL code from link

    Returns a list of record dicts (name, city, country, region,
    institution_type, isil_code, url, source, harvest_date). Returns an
    empty list if the expected tab cannot be found.
    """
    archives: List[Dict] = []

    with sync_playwright() as p:
        print("Launching browser...")
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        try:
            print(f"Navigating to {SEARCH_URL}...")
            page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)

            # Accept cookies if present (banner may not appear at all).
            try:
                page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
                print(" ✓ Accepted cookies")
            except PlaywrightTimeout:
                pass  # No cookie banner

            # Click on "Navigierende Suche" tab — without a category filter
            # this lists every archive in the portal.
            try:
                page.get_by_text("Navigierende Suche").click(timeout=5000)
                time.sleep(3)  # Wait for ALL archives to load (no filter = all 523)
                print(" ✓ Switched to Navigierende Suche (all archives)")
            except PlaywrightTimeout:
                print(" ⚠ Could not find Navigierende Suche tab")
                return archives

            print("\n🔍 Extracting archive metadata...")
            print("=" * 70)

            # Find all TOP-LEVEL archive buttons (not sub-collections).
            # Top-level archives are visible initially before any expansion.
            archive_buttons = page.get_by_role("button").all()

            # Keep only archive-level buttons (names contain "archiv"-family
            # keywords) and skip sub-collections, which start with digits or
            # asterisks or contain a " / " path separator.
            archive_buttons_filtered = []
            for button in archive_buttons:
                text = button.text_content()
                if text and any(keyword in text.lower() for keyword in ['archiv', 'institut', 'zentrum', 'stiftung', 'bibliothek']):
                    if not (text.startswith('*') or text.startswith(('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')) or ' / ' in text):
                        archive_buttons_filtered.append(button)

            total_archives = len(archive_buttons_filtered)
            print(f"Found {total_archives} top-level archive institutions\n")

            for idx, button in enumerate(archive_buttons_filtered, 1):
                try:
                    # text_content() may return None in edge cases; guard
                    # before stripping to avoid AttributeError.
                    archive_name = (button.text_content() or "").strip()

                    # Click the archive button to reveal detail panel
                    button.click()
                    time.sleep(0.8)  # Wait for detail panel and persistent link to load

                    # Extract persistent link (ISIL code source)
                    isil_code = _lookup_isil(page)

                    # Derive city and institution type from the name alone.
                    city = extract_city_from_name(archive_name)
                    inst_type = infer_institution_type(archive_name)

                    record = {
                        "name": archive_name,
                        "city": city,
                        "country": "DE",
                        "region": "Nordrhein-Westfalen",
                        "institution_type": inst_type,
                        "isil_code": isil_code,
                        "url": SEARCH_URL,
                        "source": "archive.nrw.de",
                        "harvest_date": datetime.now(timezone.utc).isoformat()
                    }

                    archives.append(record)

                    # Progress indicator
                    isil_display = f"ISIL: {isil_code}" if isil_code else "ISIL: N/A"
                    city_display = f"({city})" if city else "(no city)"
                    print(f"[{idx}/{total_archives}] {archive_name} {city_display} - {isil_display}")

                except Exception as e:
                    # Best-effort per-archive: log and move on so one bad
                    # entry does not abort the whole harvest.
                    print(f" ⚠ Error processing archive {idx}: {e}")
                    continue

        except Exception as e:
            # Top-level boundary: report and fall through to cleanup.
            print(f"❌ Error during harvest: {e}")

        finally:
            browser.close()

    return archives
|
|
|
|
|
|
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
    """Drop duplicate archive records, keeping the first occurrence.

    Records are considered duplicates when their names compare equal
    case-insensitively after stripping surrounding whitespace.
    """
    seen_names = set()
    deduped: List[Dict] = []

    for record in archives:
        normalized = record['name'].lower().strip()
        if normalized in seen_names:
            continue  # already have this archive under another spelling
        seen_names.add(normalized)
        deduped.append(record)

    return deduped
|
|
|
|
|
|
def _print_statistics(archives: List[Dict]) -> None:
    """Print ISIL coverage, city coverage, and institution-type counts."""
    cities = set(a['city'] for a in archives if a['city'])
    isil_count = sum(1 for a in archives if a.get('isil_code'))
    # Computed once (was built twice inside a single print call).
    city_count = sum(1 for a in archives if a['city'])

    type_counts: Dict[str, int] = {}
    for archive in archives:
        inst_type = archive['institution_type']
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1

    print("📊 Statistics:")
    print(f" Total archives: {len(archives)}")
    print(f" Archives with ISIL codes: {isil_count} ({isil_count/len(archives)*100:.1f}%)")
    print(f" Cities covered: {len(cities)}")
    print(f" Archives with city data: {city_count} ({city_count/len(archives)*100:.1f}%)")
    print()
    print(" Institution types:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
        print(f" {inst_type}: {count}")


def _export_json(archives: List[Dict]) -> Path:
    """Write the records to a timestamped JSON file and return its path."""
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"nrw_archives_complete_{timestamp}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(archives, f, ensure_ascii=False, indent=2)

    return output_file


def _show_samples(archives: List[Dict]) -> None:
    """Print up to five sample records that carry an ISIL code."""
    print("📋 Sample records (with ISIL codes):")
    samples = [a for a in archives if a.get('isil_code')][:5]
    for i, archive in enumerate(samples, 1):
        print(f"\n{i}. {archive['name']}")
        print(f" City: {archive['city'] or 'Unknown'}")
        print(f" Type: {archive['institution_type']}")
        print(f" ISIL: {archive['isil_code']}")


def main():
    """Main harvest workflow: harvest, deduplicate, report, export."""
    print("=" * 70)
    print("NRW Archives COMPLETE Harvester")
    print("Extracting ALL 523+ archives with ISIL codes")
    print("=" * 70)
    print()

    start_time = time.time()

    # Harvest archives using Playwright
    archives = harvest_archives_complete()

    if not archives:
        print("❌ No archives found. The page structure may have changed.")
        return

    # Deduplicate
    archives = deduplicate_archives(archives)

    print()
    print("=" * 70)
    print(f"✅ Harvested {len(archives)} unique NRW archives")
    print("=" * 70)
    print()

    _print_statistics(archives)

    output_file = _export_json(archives)

    print()
    print(f"📁 Output: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024:.1f} KB")
    print(f"⏱️ Time: {time.time() - start_time:.1f}s")
    print()

    _show_samples(archives)
|