# Source listing metadata (repository-viewer residue, kept as a comment so the
# file remains valid Python):
#   glam/scripts/scrapers/scrape_danish_archives_playwright.py
#   2025-11-19 23:25:22 +01:00
#   219 lines, 8.5 KiB, Python, executable file

#!/usr/bin/env python3
"""
Danish Archives Web Scraper (Arkiv.dk) - Playwright Version
Uses browser automation with JavaScript evaluation (no clicking needed).
Author: GLAM Data Extraction Project
Date: 2025-11-19
License: MIT
"""
import asyncio
import csv
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict
import logging
# Module-wide logging: timestamped INFO-level messages to stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# All exports land under <repo-root>/data/isil/denmark; create it up front.
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "denmark"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def _make_record(municipality: str, archive_name: str) -> Dict:
    """Build one flat archive record for the output datasets."""
    return {
        'municipality': municipality,
        'archive_name': archive_name,
        'source': 'Arkiv.dk',
        'country': 'DK',
        'url': 'https://arkiv.dk/arkiver',
    }


async def scrape_danish_archives() -> List[Dict]:
    """
    Scrape the Arkiv.dk archive directory using Playwright + JavaScript evaluation.

    Loads https://arkiv.dk/arkiver once and extracts every municipality/panel
    pair in a single `page.evaluate` pass — much faster than clicking each
    accordion entry.

    Returns:
        A list of flat records with keys 'municipality', 'archive_name',
        'source', 'country', 'url'. Returns an empty list on any error.
    """
    from playwright.async_api import async_playwright

    logger.info("Starting Playwright browser automation...")
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            logger.info("Navigating to https://arkiv.dk/arkiver")
            await page.goto('https://arkiv.dk/arkiver', wait_until='networkidle')
            # Wait for the accordion headers, plus a grace period for JS rendering.
            await page.wait_for_selector('h4.panel-title', timeout=15000)
            await page.wait_for_timeout(2000)
            logger.info("Extracting archive data via JavaScript...")
            # Pull every (municipality header, collapsed panel text) pair out of
            # the DOM in one round trip.
            raw_data = await page.evaluate('''() => {
                const results = [];
                const links = document.querySelectorAll('h4.panel-title a[data-toggle="collapse"]');
                links.forEach(link => {
                    const municipality = link.textContent.trim();
                    const href = link.getAttribute('href');
                    if (href && href.startsWith('#')) {
                        const panelId = href.substring(1);
                        const panel = document.getElementById(panelId);
                        if (panel) {
                            results.push({
                                municipality: municipality,
                                archiveText: panel.textContent.trim()
                            });
                        }
                    }
                });
                return results;
            }''')

            # Normalise the raw panel text into flat records.
            archives: List[Dict] = []
            for idx, item in enumerate(raw_data, 1):
                municipality = item['municipality']
                archive_text = item['archiveText']
                # "Ingen" (Danish: "none") marks a municipality with no contribution.
                if not archive_text or archive_text == 'Ingen':
                    logger.info(f"[{idx}] {municipality} → (No contribution)")
                    continue
                if 'Specialsamlinger' in municipality:
                    # Special collections: one archive per non-empty line.
                    names = [line.strip() for line in archive_text.split('\n') if line.strip()]
                    logger.info(f"[{idx}] Specialsamlinger → {len(names)} archives")
                    archives.extend(_make_record('Specialsamlinger', name) for name in names)
                elif archive_text.count('\n') > 1:
                    # Municipality with several local archives, newline-separated.
                    # NOTE(review): a two-line panel (count == 1) still falls through
                    # to the single-archive branch — preserved from the original.
                    names = [line.strip() for line in archive_text.split('\n') if line.strip()]
                    logger.info(f"[{idx}] {municipality} → {len(names)} local archives")
                    archives.extend(_make_record(municipality, name) for name in names)
                else:
                    # Single archive for this municipality.
                    archives.append(_make_record(municipality, archive_text))
                    logger.info(f"[{idx}] {municipality} → {archive_text}")

            logger.info(
                f"Successfully scraped {len(archives)} archive records "
                f"from {len(raw_data)} municipalities"
            )
            return archives
        except Exception as e:
            logger.error(f"Error during scraping: {e}", exc_info=True)
            return []
        finally:
            # Close exactly once, on both success and failure paths (the
            # original closed in-line and again in `except`, risking a
            # double close when an error occurred after the first close).
            await browser.close()
def export_to_csv(archives: List[Dict], filename: str):
    """Export scraped archive records to a CSV file inside OUTPUT_DIR.

    Args:
        archives: Flat records as produced by scrape_danish_archives().
        filename: Name of the CSV file to create under OUTPUT_DIR.

    Does nothing (beyond a warning) when *archives* is empty.
    """
    output_file = OUTPUT_DIR / filename
    if not archives:
        # Fix: the previous message was an f-string with no placeholder
        # ("... to (unknown)"); report the actual target path instead.
        logger.warning(f"No archives to export to {output_file}")
        return
    fieldnames = ['municipality', 'archive_name', 'country', 'source', 'url']
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(archives)
    logger.info(f"Exported {len(archives)} records to {output_file}")
def export_to_json(archives: List[Dict], filename: str):
    """Write the scraped records, wrapped in provenance metadata, as JSON.

    Args:
        archives: Flat records as produced by scrape_danish_archives().
        filename: Name of the JSON file to create under OUTPUT_DIR.
    """
    target = OUTPUT_DIR / filename
    payload = {
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'data_source': 'Arkiv.dk Municipal Archive Directory',
        'source_url': 'https://arkiv.dk/arkiver',
        'scraper_version': '2.0.0-playwright-js-eval',
        'record_count': len(archives),
        'notes': 'Danish local archives do NOT have ISIL codes - ISIL in Denmark is only for libraries',
        'archives': archives,
    }
    # Keep Danish characters readable in the output (no \\uXXXX escapes).
    with open(target, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
    logger.info(f"Exported {len(archives)} records to {target}")
async def main():
    """Run the scraper and export CSV/JSON results with a summary report.

    Fix: dropped the stray `f` prefix from format-free f-strings (F541);
    the logged text is unchanged.
    """
    logger.info("=== Danish Archive Directory Scraper (Arkiv.dk - Playwright) ===")
    logger.info("Using browser automation with JavaScript evaluation")
    logger.info("NOTE: Danish archives do NOT have ISIL codes")
    logger.info("      ISIL codes in Denmark (DK-*) are ONLY for libraries\n")
    archives = await scrape_danish_archives()
    if not archives:
        logger.error("No archives were scraped. Check logs for errors.")
        return
    # Export results in both formats.
    export_to_csv(archives, "danish_archives_arkivdk.csv")
    export_to_json(archives, "danish_archives_arkivdk.json")
    logger.info("\n=== Scraping Complete ===")
    logger.info(f"Total archives extracted: {len(archives)}")
    logger.info(f"Output directory: {OUTPUT_DIR}")
    # Breakdown: municipal archives vs. special collections.
    municipal = [a for a in archives if a['municipality'] != 'Specialsamlinger']
    special = [a for a in archives if a['municipality'] == 'Specialsamlinger']
    logger.info("\nBreakdown:")
    logger.info(f"  Municipal archives: {len(municipal)}")
    logger.info(f"  Special collections: {len(special)}")
    # Show a sample of each category.
    logger.info("\nSample municipal archives (first 10):")
    for archive in municipal[:10]:
        logger.info(f"  {archive['municipality']} → {archive['archive_name']}")
    logger.info("\nSample special collections (first 10):")
    for archive in special[:10]:
        logger.info(f"  {archive['archive_name']}")
if __name__ == "__main__":
    # Script entry point: drive the async scraper pipeline to completion.
    asyncio.run(main())