glam/scripts/scrapers/harvest_nrw_archives.py
2025-11-19 23:25:22 +01:00

271 lines
9.2 KiB
Python

#!/usr/bin/env python3
"""
NRW Archives Harvester
Extracts archive institutions from archive.nrw.de portal
This script harvests all archives listed on the Nordrhein-Westfalen archive portal,
which uses a JavaScript-rendered hierarchical navigation interface (Drupal-based).
Portal: https://www.archive.nrw.de/archivsuche
Operator: Landesarchiv Nordrhein-Westfalen
Data: 523+ archives across 7 archive types (Archivsparten)
Uses Playwright for JavaScript rendering.
Author: OpenCode + AI Agent
Date: 2025-11-19
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive.nrw.de"  # portal root operated by Landesarchiv NRW
SEARCH_URL = f"{BASE_URL}/archivsuche"  # JS-rendered search page harvested below
# NOTE(review): hard-coded absolute user path — consider an env var or CLI flag
# so the script is runnable outside this one machine.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Created eagerly at import time (module-level side effect) so the JSON export
# in main() cannot fail on a missing directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Map German archive types to the GLAM taxonomy.  Keys are substrings tested
# against the institution name in declaration order; the first hit wins.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",  # State archive
    "Stadtarchiv": "ARCHIVE",                # City archive
    "Gemeindearchiv": "ARCHIVE",             # Municipal archive
    "Kreisarchiv": "ARCHIVE",                # District archive
    "Stiftsarchiv": "ARCHIVE",               # Foundation archive
    "Kommunalarchiv": "ARCHIVE",             # Local archive
    "Stadt- und": "ARCHIVE",                 # Combined city/district archive
    "Institut für": "RESEARCH_CENTER",       # Research institute
    "Archiv des": "ARCHIVE",                 # Archive of (organization)
    "Historisches": "RESEARCH_CENTER",       # Historical center/archive
}


def infer_institution_type(name: str) -> str:
    """Return the GLAM institution type inferred from a German archive name.

    The first mapping keyword found as a substring of *name* decides the
    type; names matching no keyword fall back to the generic "ARCHIVE".
    """
    hits = (kind for keyword, kind in ARCHIVE_TYPE_MAPPING.items() if keyword in name)
    return next(hits, "ARCHIVE")
# Ordered, precompiled patterns for pulling a city out of a German archive
# name; compiled once at import so the per-name loop does no regex parsing.
# Order matters: the first pattern that matches anywhere in the name wins.
_CITY_PATTERNS = [re.compile(p) for p in (
    r'Stadtarchiv\s+(.+)',
    r'Gemeindearchiv\s+(.+)',
    r'Kreisarchiv\s+(.+)',
    r'Kommunalarchiv\s+(.+)',
    r'Stadt-\s+und\s+\w+\s+(.+)',  # Stadt- und Kreisarchiv X
    r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
    r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
    r'Historisches\s+Zentrum\s+(.+)',
    r'Stiftsarchiv\s+(.+)',
)]

# Strips one trailing parenthesised qualifier, e.g. "(Westf.)" or "(Ruhr)".
_TRAILING_QUALIFIER = re.compile(r'\s+\(.*\)$')


def extract_city_from_name(name: str) -> Optional[str]:
    """
    Extract the city name from a German archive name, or None if no
    known naming pattern applies.

    Examples:
    - Stadtarchiv München → München
    - Gemeindearchiv Bedburg-Hau → Bedburg-Hau
    - Kreisarchiv Viersen → Viersen
    - Archiv der Stadt Gummersbach → Gummersbach
    """
    for pattern in _CITY_PATTERNS:
        hit = pattern.search(name)
        if hit is None:
            continue
        # Drop trailing qualifiers such as "(Westf.)" before returning.
        return _TRAILING_QUALIFIER.sub('', hit.group(1).strip())
    return None
def harvest_archives_with_playwright() -> List[Dict]:
    """
    Harvest archives using Playwright to render JavaScript.

    Strategy:
    1. Navigate to the search page
    2. Click on the "Navigierende Suche" tab
    3. Select "Kommunale Archive" (Municipal Archives) - largest category
    4. Extract all archive names from the rendered button list

    Returns:
        List of archive record dicts with keys: name, city, country, region,
        institution_type, url, source, harvest_date.

    Note:
        Each navigation step is best-effort: a missing UI element is logged
        and skipped rather than aborting the harvest, since the Drupal
        front-end layout changes occasionally.
    """
    archives: List[Dict] = []
    # O(1) duplicate detection; replaces the original O(n) scan over
    # `archives` for every button, which made the loop quadratic.
    seen_names: set = set()

    with sync_playwright() as p:
        print("Launching browser...")
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            print(f"Navigating to {SEARCH_URL}...")
            page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)

            # Accept cookies if present
            try:
                page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
                print(" ✓ Accepted cookies")
            except PlaywrightTimeout:
                pass  # No cookie banner

            # Click on "Navigierende Suche" tab
            try:
                page.get_by_text("Navigierende Suche").click(timeout=5000)
                time.sleep(2)  # Wait for content to load
                print(" ✓ Switched to Navigierende Suche")
            except PlaywrightTimeout:
                print(" ⚠ Could not find Navigierende Suche tab")

            # Click on Archivsparte dropdown
            try:
                page.get_by_text("Bitte Archivsparte auswählen").click(timeout=5000)
                time.sleep(1)
                print(" ✓ Opened Archivsparte dropdown")
            except PlaywrightTimeout:
                print(" ⚠ Could not open Archivsparte dropdown")

            # Select "Kommunale Archive" (largest category)
            try:
                page.get_by_text("Kommunale Archive", exact=False).click(timeout=5000)
                time.sleep(3)  # Wait for archive list to populate
                print(" ✓ Selected Kommunale Archive")
            except PlaywrightTimeout:
                print(" ⚠ Could not select Kommunale Archive")

            # Extract all archive buttons from the list.
            # Based on browser inspection, archives are rendered as
            # role="button" elements within role="listitem".
            print("\nExtracting archive names...")
            archive_buttons = page.get_by_role("button").all()
            print(f"Found {len(archive_buttons)} buttons to process...")

            for button in archive_buttons:
                try:
                    text = button.text_content()
                    if not text:
                        continue
                    text = text.strip()
                    # Skip buttons that are clearly not archive names
                    if not any(keyword in text for keyword in ['archiv', 'Archiv', 'Institut', 'Zentrum']):
                        continue
                    # Skip internal collection names (start with *)
                    if text.startswith('*'):
                        continue
                    # Skip if already collected
                    if text in seen_names:
                        continue
                    seen_names.add(text)

                    city = extract_city_from_name(text)
                    inst_type = infer_institution_type(text)
                    record = {
                        "name": text,
                        "city": city,
                        "country": "DE",
                        "region": "Nordrhein-Westfalen",
                        "institution_type": inst_type,
                        "url": SEARCH_URL,
                        "source": "archive.nrw.de",
                        "harvest_date": datetime.now(timezone.utc).isoformat()
                    }
                    archives.append(record)
                    print(f"{text} ({city or 'unknown city'})")
                except Exception:
                    continue  # Skip problematic buttons (e.g. detached nodes)
        except Exception as e:
            print(f"❌ Error during harvest: {e}")
        finally:
            # Always release the browser, even on hard failure.
            browser.close()
    return archives
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
    """Drop later entries whose name (trimmed, case-insensitive) repeats.

    The first occurrence of each name is kept; input order is preserved.
    """
    unique: List[Dict] = []
    seen_keys: set = set()
    for record in archives:
        normalized = record['name'].strip().lower()
        if normalized in seen_keys:
            continue
        seen_keys.add(normalized)
        unique.append(record)
    return unique
def main():
    """Main harvest workflow: scrape, dedupe, report stats, export JSON."""
    banner = "=" * 70
    print(banner)
    print("NRW Archives Harvester (Playwright Edition)")
    print(banner)
    print()

    start_time = time.time()

    # Harvest archives using Playwright
    archives = harvest_archives_with_playwright()
    if not archives:
        print("❌ No archives found. The page structure may have changed.")
        return

    # Deduplicate
    archives = deduplicate_archives(archives)
    print()
    print(f"✅ Harvested {len(archives)} unique NRW archives")
    print()

    # Aggregate simple statistics
    cities = {record['city'] for record in archives if record['city']}
    type_counts: Dict[str, int] = {}
    for record in archives:
        kind = record['institution_type']
        type_counts[kind] = type_counts.get(kind, 0) + 1

    print("Statistics:")
    print(f" Cities covered: {len(cities)}")
    print(f" Institution types:")
    # Most frequent type first; ties keep insertion order (sort is stable).
    for inst_type, count in sorted(type_counts.items(), key=lambda item: item[1], reverse=True):
        print(f" {inst_type}: {count}")

    # Export to JSON, timestamped so repeated runs never clobber each other
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"nrw_archives_{stamp}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(archives, f, ensure_ascii=False, indent=2)

    print()
    print(f"📁 Output: {output_file}")
    print(f"⏱️ Time: {time.time() - start_time:.1f}s")
    print()

    # Show sample records
    print("Sample records:")
    for i, archive in enumerate(archives[:5], 1):
        print(f"\n{i}. {archive['name']}")
        print(f" City: {archive['city'] or 'Unknown'}")
        print(f" Type: {archive['institution_type']}")


if __name__ == "__main__":
    main()