glam/scripts/scrapers/harvest_nrw_archives_complete.py
2025-11-19 23:25:22 +01:00

323 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
NRW Archives Complete Harvester
Extracts ALL 523+ archives from archive.nrw.de portal with complete metadata
This script harvests ALL archives (not just one category) and extracts:
- Archive names
- ISIL codes (from persistent links)
- City/location information
- Institution types
- Archive creation dates (when available)
Portal: https://www.archive.nrw.de/archivsuche
Operator: Landesarchiv Nordrhein-Westfalen
Data: 523+ archives across ALL archive types (Archivsparten)
Uses Playwright for JavaScript rendering.
Author: OpenCode + AI Agent
Date: 2025-11-19
Version: 2.0 (Complete Harvest)
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive.nrw.de"  # Portal root for the NRW archive portal
SEARCH_URL = f"{BASE_URL}/archivsuche"  # "Archivsuche" search page (harvest entry point)
# NOTE(review): machine-specific absolute path — consider an env-var override before reuse.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Import-time side effect: ensure the output directory exists before any harvest runs.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Map German archive-name keywords to the GLAM institution taxonomy.
# Insertion order matters: the first keyword found in a name wins.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",  # State archive
    "Stadtarchiv": "ARCHIVE",  # City archive
    "Gemeindearchiv": "ARCHIVE",  # Municipal archive
    "Kreisarchiv": "ARCHIVE",  # District archive
    "Stiftsarchiv": "ARCHIVE",  # Foundation/monastic archive
    "Kommunalarchiv": "ARCHIVE",  # Local archive
    "Stadt- und": "ARCHIVE",  # Combined city/district archive
    "Institut für": "RESEARCH_CENTER",  # Research institute
    "Archiv des": "ARCHIVE",  # Archive of (organization)
    "Archiv der": "ARCHIVE",  # Archive of (institution)
    "Historisches": "RESEARCH_CENTER",  # Historical center/archive
    "Universitätsarchiv": "EDUCATION_PROVIDER",  # University archive
    "Hochschularchiv": "EDUCATION_PROVIDER",  # University archive
    "Bistumsarchiv": "HOLY_SITES",  # Diocese archive
    "Erzbistumsarchiv": "HOLY_SITES",  # Archdiocese archive
    "Diözesanarchiv": "HOLY_SITES",  # Diocesan archive
    "Landeskirchliches": "HOLY_SITES",  # Regional church archive
    "Kirchenkreises": "HOLY_SITES",  # Church district archive
    "Unternehmensarchiv": "CORPORATION",  # Corporate archive
    "Konzernarchiv": "CORPORATION",  # Corporate group archive
    "Wirtschaftsarchiv": "CORPORATION",  # Business archive
}


def infer_institution_type(name: str) -> str:
    """Map a German archive name onto the GLAM taxonomy via keyword lookup.

    The first mapping key found as a substring of *name* wins (dict
    insertion order); names with no known keyword fall back to "ARCHIVE".
    """
    return next(
        (category for keyword, category in ARCHIVE_TYPE_MAPPING.items() if keyword in name),
        "ARCHIVE",
    )
def extract_city_from_name(name: str) -> Optional[str]:
    """Pull the city name out of a German archive name, if recognizable.

    Recognized patterns (first match wins):
    - Stadtarchiv München → München
    - Gemeindearchiv Bedburg-Hau → Bedburg-Hau
    - Kreisarchiv Viersen → Viersen
    - Archiv der Stadt Gummersbach → Gummersbach

    Trailing parenthesized qualifiers such as "(Westf.)" or "(Ruhr)" are
    stripped from the result. Returns None when no pattern matches.
    """
    city_patterns = (
        r'Stadtarchiv\s+(.+)',
        r'Gemeindearchiv\s+(.+)',
        r'Kreisarchiv\s+(.+)',
        r'Kommunalarchiv\s+(.+)',
        r'Stadt-\s+und\s+\w+\s+(.+)',  # Stadt- und Kreisarchiv X
        r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
        r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
        r'Historisches\s+Zentrum\s+(.+)',
        r'Stiftsarchiv\s+(.+)',
    )
    for pattern in city_patterns:
        found = re.search(pattern, name)
        if found is None:
            continue
        # Drop trailing qualifiers like (Westf.) or (Ruhr) before returning.
        return re.sub(r'\s+\(.*\)$', '', found.group(1).strip())
    return None
def extract_isil_from_link(link_url: str) -> Optional[str]:
    """Extract the ISIL code embedded in a persistent-link URL.

    Example: https://www.archive.nrw.de/ms/search?link=ARCHIV-DE-Due75
    → Extracts: DE-Due75

    Returns None when the URL carries no ARCHIV- link parameter.
    """
    isil_match = re.search(r'link=ARCHIV-([A-Z]{2}-[A-Za-z0-9]+)', link_url)
    return isil_match.group(1) if isil_match else None
def _extract_isil_from_detail_panel(page) -> Optional[str]:
    """Best-effort: pull the ISIL code from the currently open archive detail panel.

    Tries the "Beständigen Verweis öffnen" link first, then falls back to any
    anchor whose href contains "ARCHIV-". Returns None when neither yields a code.
    """
    # Primary method: the labelled persistent-link element.
    try:
        link_element = page.get_by_text("Beständigen Verweis öffnen").first
        persistent_link = link_element.get_attribute('href', timeout=3000)
        if persistent_link:
            isil = extract_isil_from_link(persistent_link)
            if isil:
                return isil
    except Exception:
        # Was a bare `except:` — narrowed so Ctrl-C / SystemExit still propagate.
        pass
    # Fallback: any link containing "ARCHIV-" in its href.
    try:
        all_links = page.locator('a[href*="ARCHIV-"]').all()
        if all_links:
            persistent_link = all_links[0].get_attribute('href')
            if persistent_link:
                return extract_isil_from_link(persistent_link)
    except Exception:
        pass  # No persistent link available
    return None


def harvest_archives_complete() -> List[Dict]:
    """Harvest ALL 523+ archives from archive.nrw.de using Playwright.

    Strategy:
    1. Navigate to search page
    2. Click on "Navigierende Suche" tab
    3. Do NOT select any category filter (get all archives)
    4. Extract all archive button names
    5. Click each archive to get persistent link
    6. Extract ISIL code from link

    Returns:
        List of metadata records (name, city, type, ISIL code, provenance).
        Empty list when the page structure is not recognized or errors occur.
    """
    archives: List[Dict] = []
    with sync_playwright() as p:
        print("Launching browser...")
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            print(f"Navigating to {SEARCH_URL}...")
            page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)
            # Accept cookies if present
            try:
                page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
                print(" ✓ Accepted cookies")
            except PlaywrightTimeout:
                pass  # No cookie banner
            # Click on "Navigierende Suche" tab (browse view; no filter = everything)
            try:
                page.get_by_text("Navigierende Suche").click(timeout=5000)
                time.sleep(3)  # Wait for ALL archives to load (no filter = all 523)
                print(" ✓ Switched to Navigierende Suche (all archives)")
            except PlaywrightTimeout:
                print(" ⚠ Could not find Navigierende Suche tab")
                return archives
            print("\n🔍 Extracting archive metadata...")
            print("=" * 70)
            # Keep only TOP-LEVEL archive buttons: label must contain an
            # institution keyword, and sub-collections (leading digit or '*',
            # or a " / " separator in the label) are skipped.
            institution_keywords = ('archiv', 'institut', 'zentrum', 'stiftung', 'bibliothek')
            archive_buttons_filtered = []
            for button in page.get_by_role("button").all():
                text = button.text_content()
                if not text:
                    continue
                lowered = text.lower()
                if not any(keyword in lowered for keyword in institution_keywords):
                    continue
                if text.startswith('*') or text[:1] in '0123456789' or ' / ' in text:
                    continue
                archive_buttons_filtered.append(button)
            total_archives = len(archive_buttons_filtered)
            print(f"Found {total_archives} top-level archive institutions\n")
            for idx, button in enumerate(archive_buttons_filtered, 1):
                try:
                    # text_content() may return None — guard before strip().
                    archive_name = (button.text_content() or "").strip()
                    # Click the archive button to reveal the detail panel.
                    button.click()
                    time.sleep(0.8)  # Wait for detail panel and persistent link to load
                    isil_code = _extract_isil_from_detail_panel(page)
                    city = extract_city_from_name(archive_name)
                    inst_type = infer_institution_type(archive_name)
                    record = {
                        "name": archive_name,
                        "city": city,
                        "country": "DE",
                        "region": "Nordrhein-Westfalen",
                        "institution_type": inst_type,
                        "isil_code": isil_code,
                        "url": SEARCH_URL,
                        "source": "archive.nrw.de",
                        "harvest_date": datetime.now(timezone.utc).isoformat()
                    }
                    archives.append(record)
                    # Progress indicator
                    isil_display = f"ISIL: {isil_code}" if isil_code else "ISIL: N/A"
                    city_display = f"({city})" if city else "(no city)"
                    print(f"[{idx}/{total_archives}] {archive_name} {city_display} - {isil_display}")
                except Exception as e:
                    print(f" ⚠ Error processing archive {idx}: {e}")
                    continue
        except Exception as e:
            print(f"❌ Error during harvest: {e}")
        finally:
            browser.close()
    return archives
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
    """Drop duplicate archive records, keyed on the normalized name.

    Keeps the first occurrence of each name (case-insensitive, whitespace
    trimmed) and preserves the original ordering.
    """
    unique_by_name: Dict[str, Dict] = {}
    for record in archives:
        normalized = record['name'].lower().strip()
        unique_by_name.setdefault(normalized, record)
    return list(unique_by_name.values())
def main():
    """Main harvest workflow.

    Runs the complete Playwright harvest, deduplicates results, prints
    summary statistics, and exports the records as timestamped JSON
    into OUTPUT_DIR.
    """
    print("=" * 70)
    print("NRW Archives COMPLETE Harvester")
    print("Extracting ALL 523+ archives with ISIL codes")
    print("=" * 70)
    print()
    start_time = time.time()
    # Harvest archives using Playwright
    archives = harvest_archives_complete()
    if not archives:
        print("❌ No archives found. The page structure may have changed.")
        return
    # Deduplicate
    archives = deduplicate_archives(archives)
    print()
    print("=" * 70)
    print(f"✅ Harvested {len(archives)} unique NRW archives")
    print("=" * 70)
    print()
    # Statistics — each aggregate is computed exactly once (the original
    # re-ran the city list comprehension inside the f-string below).
    total = len(archives)  # > 0 guaranteed by the early return above
    cities = set(a['city'] for a in archives if a['city'])
    with_city_count = sum(1 for a in archives if a['city'])
    isil_count = sum(1 for a in archives if a.get('isil_code'))
    types: Dict[str, int] = {}
    for archive in archives:
        inst_type = archive['institution_type']
        types[inst_type] = types.get(inst_type, 0) + 1
    print("📊 Statistics:")
    print(f" Total archives: {total}")
    print(f" Archives with ISIL codes: {isil_count} ({isil_count/total*100:.1f}%)")
    print(f" Cities covered: {len(cities)}")
    print(f" Archives with city data: {with_city_count} ({with_city_count/total*100:.1f}%)")
    print()
    print(" Institution types:")
    for inst_type, count in sorted(types.items(), key=lambda x: x[1], reverse=True):
        print(f" {inst_type}: {count}")
    # Export to JSON (UTC timestamp keeps successive runs distinct)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"nrw_archives_complete_{timestamp}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(archives, f, ensure_ascii=False, indent=2)
    print()
    print(f"📁 Output: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024:.1f} KB")
    print(f"⏱️ Time: {time.time() - start_time:.1f}s")
    print()
    # Show sample records with ISIL codes
    print("📋 Sample records (with ISIL codes):")
    samples = [a for a in archives if a.get('isil_code')][:5]
    for i, archive in enumerate(samples, 1):
        print(f"\n{i}. {archive['name']}")
        print(f" City: {archive['city'] or 'Unknown'}")
        print(f" Type: {archive['institution_type']}")
        print(f" ISIL: {archive['isil_code']}")
# Run the harvest only when executed as a script (not on import).
if __name__ == "__main__":
    main()