glam/scripts/scrapers/harvest_nrw_archives_fast.py
2025-11-19 23:25:22 +01:00

283 lines
9.5 KiB
Python
Executable file
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
NRW Archives Fast Harvester (No Clicking Strategy)
Extracts ALL 523+ archives from archive.nrw.de by parsing the rendered page
This version extracts archive names from the list WITHOUT clicking each one,
which is much faster. ISIL codes will be enriched later via detail page scraping.
Portal: https://www.archive.nrw.de/archivsuche
Strategy: Parse rendered HTML after JavaScript execution
Speed: ~10 seconds (vs 10+ minutes for clicking approach)
Author: OpenCode + AI Agent
Date: 2025-11-19
Version: 3.0 (Fast Harvest - Name Only)
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive.nrw.de"
SEARCH_URL = f"{BASE_URL}/archivsuche"
# NOTE(review): hard-coded absolute user path — will only work on this one
# machine; consider an env var or CLI argument. TODO confirm intended.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Side effect at import time: the output directory is created immediately.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Map German archive types to GLAM taxonomy
# Insertion order matters: infer_institution_type() returns the FIRST key
# found as a substring, so more specific names must precede generic ones.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",
    "Stadtarchiv": "ARCHIVE",
    "Gemeindearchiv": "ARCHIVE",
    "Kreisarchiv": "ARCHIVE",
    "Stiftsarchiv": "ARCHIVE",
    "Kommunalarchiv": "ARCHIVE",
    "Stadt- und": "ARCHIVE",
    "Institut für": "RESEARCH_CENTER",
    "Archiv des": "ARCHIVE",
    "Archiv der": "ARCHIVE",
    "Historisches": "RESEARCH_CENTER",
    "Universitätsarchiv": "EDUCATION_PROVIDER",
    "Hochschularchiv": "EDUCATION_PROVIDER",
    "Bistumsarchiv": "HOLY_SITES",
    "Erzbistumsarchiv": "HOLY_SITES",
    "Diözesanarchiv": "HOLY_SITES",
    "Landeskirchliches": "HOLY_SITES",
    "Kirchenkreises": "HOLY_SITES",
    "Unternehmensarchiv": "CORPORATION",
    "Konzernarchiv": "CORPORATION",
    "Wirtschaftsarchiv": "CORPORATION",
}


def infer_institution_type(name: str) -> str:
    """Map a German archive name onto the GLAM institution taxonomy.

    The first ARCHIVE_TYPE_MAPPING keyword that occurs as a substring of
    *name* decides the type; names matching no keyword fall back to
    "ARCHIVE".
    """
    return next(
        (inst_type
         for keyword, inst_type in ARCHIVE_TYPE_MAPPING.items()
         if keyword in name),
        "ARCHIVE",
    )
# Patterns are tried in order and the first match wins, so keep more
# specific prefixes ahead of generic ones. Compiled once at import time
# instead of being rebuilt/recompiled on every call (loop-invariant hoist).
_CITY_PATTERNS = [re.compile(p) for p in (
    r'Stadtarchiv\s+(.+)',
    r'Gemeindearchiv\s+(.+)',
    r'Kreisarchiv\s+(.+)',
    r'Kommunalarchiv\s+(.+)',
    r'Stadt-\s+und\s+\w+\s+(.+)',
    r'Archiv\s+der\s+(?:Stadt|Kreis-\s+und\s+Hochschulstadt)\s+(.+)',
    r'Institut\s+für\s+Stadtgeschichte[/\s]+(.+)',
    r'Historisches\s+Zentrum\s+(.+)',
    r'Stiftsarchiv\s+(.+)',
)]
# Trailing parenthetical qualifier, e.g. "Münster (Westf.)" -> "Münster".
_TRAILING_PAREN = re.compile(r'\s+\(.*\)$')


def extract_city_from_name(name: str) -> Optional[str]:
    """Extract the city from a German archive name.

    Args:
        name: Full institution name, e.g. "Stadtarchiv Köln".

    Returns:
        The city with any trailing parenthetical qualifier (e.g.
        "(Westf.)", "(Ruhr)") removed, or None if no known naming
        pattern matches.
    """
    for pattern in _CITY_PATTERNS:
        match = pattern.search(name)
        if match:
            city = match.group(1).strip()
            return _TRAILING_PAREN.sub('', city)
    return None
def harvest_archives_fast() -> List[Dict]:
    """Harvest NRW archive metadata by parsing the rendered search page.

    Strategy (no per-archive clicking, hence "fast"):
      1. Load the search page and switch to the "Navigierende Suche" tab,
         which renders the full archive list at once.
      2. Read the text of every button element in a single pass.
      3. Keep only top-level archive institutions (drop sub-collections).
      4. Build one metadata record per archive.

    ISIL codes are NOT collected here; they require visiting each detail
    page and are enriched in a separate second pass.

    Returns:
        List of record dicts (possibly empty if the tab is missing or an
        error occurs); each record carries name/city/country/region/
        institution_type/isil_code/url/source/harvest_date/notes keys.
    """
    archives: List[Dict] = []
    with sync_playwright() as p:
        print("Launching browser...")
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            print(f"Navigating to {SEARCH_URL}...")
            page.goto(SEARCH_URL, wait_until="networkidle", timeout=30000)
            # Dismiss the cookie banner if present; its absence is fine.
            try:
                page.get_by_role("button", name="Okay, ich bin einverstanden").click(timeout=5000)
                print(" ✓ Accepted cookies")
            except PlaywrightTimeout:
                pass
            # The "Navigierende Suche" tab lists every archive at once.
            try:
                page.get_by_text("Navigierende Suche").click(timeout=5000)
                time.sleep(3)  # Give the tab time to render all ~523 entries.
                print(" ✓ Switched to Navigierende Suche (all archives)\n")
            except PlaywrightTimeout:
                print(" ⚠ Could not find Navigierende Suche tab")
                return archives
            print("🔍 Extracting archive names from page...")
            print("=" * 70)
            # Grab every button element in one pass — much faster than
            # clicking each archive to open its detail view.
            archive_buttons = page.get_by_role("button").all()
            print(f"Found {len(archive_buttons)} buttons on page")
            # Extract text from all buttons.
            archive_names = []
            for button in archive_buttons:
                try:
                    text = button.text_content()
                    if text:
                        archive_names.append(text.strip())
                # Narrowed from a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit. A stale/detached element
                # is not fatal; just skip it.
                except Exception:
                    continue
            print(f"Extracted {len(archive_names)} button texts")
            print()
            # Filter to top-level archive institutions:
            # must contain archive keywords AND not be a sub-collection.
            print("Filtering to top-level archive institutions...")
            for name in archive_names:
                # Must look like an archive-related institution.
                if not any(keyword in name.lower() for keyword in ['archiv', 'institut', 'zentrum', 'stiftung', 'bibliothek']):
                    continue
                # Skip sub-collections (marked with '*' or ' / ').
                if name.startswith('*') or ' / ' in name:
                    continue
                # Numeric prefixes also indicate sub-collections.
                if re.match(r'^[0-9]+', name):
                    continue
                # Extract metadata for the surviving top-level archive.
                city = extract_city_from_name(name)
                inst_type = infer_institution_type(name)
                record = {
                    "name": name,
                    "city": city,
                    "country": "DE",
                    "region": "Nordrhein-Westfalen",
                    "institution_type": inst_type,
                    "isil_code": None,  # To be enriched in second pass
                    "url": SEARCH_URL,
                    "source": "archive.nrw.de",
                    "harvest_date": datetime.now(timezone.utc).isoformat(),
                    "notes": "Fast harvest - ISIL codes require detail page scraping"
                }
                archives.append(record)
                city_display = f"({city})" if city else "(no city)"
                print(f"{name} {city_display}")
        except Exception as e:
            # Best-effort harvester: report the failure and return what we have.
            print(f"❌ Error during harvest: {e}")
            import traceback
            traceback.print_exc()
        finally:
            browser.close()
    return archives
def deduplicate_archives(archives: List[Dict]) -> List[Dict]:
    """Drop records whose normalized name was already seen.

    Names are compared case-insensitively with surrounding whitespace
    ignored; order is preserved and the first occurrence of each name wins.
    """
    seen_names = set()
    unique: List[Dict] = []
    for record in archives:
        normalized = record['name'].strip().lower()
        if normalized in seen_names:
            continue
        seen_names.add(normalized)
        unique.append(record)
    return unique
def _print_statistics(archives: List[Dict]) -> None:
    """Print city coverage and institution-type counts for *archives*.

    Requires a non-empty list (the caller guards this); the percentage
    line divides by len(archives).
    """
    cities = set(a['city'] for a in archives if a['city'])
    type_counts: Dict[str, int] = {}
    for archive in archives:
        inst_type = archive['institution_type']
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
    # Computed once instead of building the same filtered list twice.
    with_city = len([a for a in archives if a['city']])
    print("📊 Statistics:")
    print(f" Total archives: {len(archives)}")
    print(f" Cities covered: {len(cities)}")
    print(f" Archives with city data: {with_city} ({with_city/len(archives)*100:.1f}%)")
    print()
    print(" Institution types:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
        print(f" {inst_type}: {count}")


def _export_json(archives: List[Dict]) -> Path:
    """Write *archives* to a timestamped JSON file and return its path."""
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"nrw_archives_fast_{timestamp}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(archives, f, ensure_ascii=False, indent=2)
    return output_file


def main():
    """Main harvest workflow: harvest, deduplicate, report, export."""
    print("=" * 70)
    print("NRW Archives FAST Harvester")
    print("Extracting ALL 523+ archive names (ISIL codes in second pass)")
    print("=" * 70)
    print()
    start_time = time.time()
    # Harvest archives using the fast (no-clicking) method.
    archives = harvest_archives_fast()
    if not archives:
        print("❌ No archives found. The page structure may have changed.")
        return
    archives = deduplicate_archives(archives)
    print()
    print("=" * 70)
    print(f"✅ Harvested {len(archives)} unique NRW archives")
    print("=" * 70)
    print()
    _print_statistics(archives)
    output_file = _export_json(archives)
    print()
    print(f"📁 Output: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024:.1f} KB")
    print(f"⏱️ Time: {time.time() - start_time:.1f}s")
    print()
    # Show a handful of sample records for a quick sanity check.
    print("📋 Sample records:")
    for i, archive in enumerate(archives[:5], 1):
        print(f"\n{i}. {archive['name']}")
        print(f" City: {archive['city'] or 'Unknown'}")
        print(f" Type: {archive['institution_type']}")
    print("\n" + "=" * 70)
    print(" NOTE: ISIL codes not included in fast harvest.")
    print(" Run detail page scraper to enrich with ISIL codes:")
    print(" python scripts/scrapers/enrich_nrw_with_isil.py")
    print("=" * 70)
# Script entry point: run the harvest only when executed directly.
if __name__ == "__main__":
    main()