glam/scripts/scrapers/scrape_danish_archives_arkivdk.py
2025-11-19 23:25:22 +01:00

226 lines
8.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Danish Archives Web Scraper (Arkiv.dk)
Extracts Danish local archive data from Arkiv.dk municipal archive directory.
Author: GLAM Data Extraction Project
Date: 2025-11-19
License: MIT
"""
import requests
from bs4 import BeautifulSoup
import time
import csv
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional
import logging
# Configure logging
# Module-level logger configuration: timestamped INFO-level output to stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Output directory
# Resolved relative to this script: <repo>/data/isil/denmark.
# NOTE(review): the directory is created at import time as a side effect —
# importing this module touches the filesystem.
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "denmark"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
class DanishArchiveScraper:
    """Scrapes Danish municipal archive data from Arkiv.dk.

    Fetches the municipal archive directory page once, then parses the
    tab-button / tabpanel structure to pair each municipality with its
    local archive name. Also probes a few candidate JSON API endpoints
    before falling back to HTML scraping.
    """

    def __init__(self):
        # Persistent session so headers and cookies are reused across requests.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'da,en-US;q=0.9,en;q=0.8',
        })
        self.rate_limit_delay = 2  # Respectful delay (seconds) between HTTP requests
        self.base_url = "https://arkiv.dk"

    def scrape_archive_list(self) -> List[Dict]:
        """
        Scrape the Arkiv.dk municipal archive directory.

        Performs a single HTTP GET of the directory page, then parses all
        municipality tabs from the returned HTML (no further requests).

        Returns:
            List of archive dictionaries; empty list on request failure.
        """
        logger.info("Starting Arkiv.dk archive directory scrape...")
        archives_url = f"{self.base_url}/arkiver"
        try:
            # BUG FIX: added an explicit timeout so a stalled server cannot
            # hang the scraper forever (the API probe below already had one).
            response = self.session.get(archives_url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            archives = []
            # Find all tab buttons (one per municipality).
            # Pattern: <button>Municipality Name</button> followed by a
            # tabpanel <div> containing the archive name.
            tab_buttons = soup.find_all(
                'button',
                attrs={'aria-expanded': lambda x: x in ['true', 'false']}
            )
            logger.info(f"Found {len(tab_buttons)} municipal archive tabs")
            for idx, button in enumerate(tab_buttons, 1):
                # Municipality name is the button's visible text.
                municipality = button.get_text(strip=True)
                # The tabpanel holding the archive name is a following
                # sibling of the button's parent element.
                parent = button.find_parent()
                if not parent:
                    continue
                tabpanel = parent.find_next_sibling('div', attrs={'role': 'tabpanel'})
                if not tabpanel:
                    continue
                archive_name = tabpanel.get_text(strip=True)
                # Skip entries with no usable archive name.
                if not archive_name or archive_name == municipality:
                    logger.warning(
                        f"[{idx}/{len(tab_buttons)}] No archive name found for {municipality}"
                    )
                    continue
                archives.append({
                    'municipality': municipality,
                    'archive_name': archive_name,
                    'source': 'Arkiv.dk',
                    'country': 'DK',
                    'url': archives_url,
                })
                logger.info(
                    f"[{idx}/{len(tab_buttons)}] Extracted: {municipality} -> {archive_name}"
                )
                # BUG FIX: removed the per-iteration time.sleep() that used
                # to sit here. This loop only parses HTML that was already
                # downloaded (zero network traffic per iteration), so the
                # 2-second sleep added minutes of dead time per run without
                # being "respectful" to the server in any way. Rate limiting
                # now lives in scrape_all_municipalities_via_api(), where
                # repeated requests actually happen.
            logger.info(f"Successfully scraped {len(archives)} archive records from Arkiv.dk")
            return archives
        except requests.RequestException as e:
            logger.error(f"Error fetching Arkiv.dk archive directory: {e}")
            return []

    def scrape_all_municipalities_via_api(self) -> List[Dict]:
        """
        Alternative method: probe candidate JSON API endpoints on Arkiv.dk.

        Returns:
            Archive dictionaries parsed from the first endpoint that
            answers 200 with valid JSON; empty list otherwise (caller
            falls back to HTML scraping).
        """
        logger.info("Attempting API-based scrape (if available)...")
        # Candidate endpoints — Arkiv.dk may or may not expose any of these.
        api_candidates = [
            f"{self.base_url}/api/archives",
            f"{self.base_url}/arkiver.json",
            f"{self.base_url}/api/arkiver",
        ]
        for i, api_url in enumerate(api_candidates):
            # Rate-limit between successive probes (skip before the first):
            # unlike the HTML parse loop, this loop really does hit the
            # server once per iteration.
            if i:
                time.sleep(self.rate_limit_delay)
            try:
                response = self.session.get(api_url, timeout=10)
                if response.status_code == 200:
                    logger.info(f"Found API endpoint: {api_url}")
                    data = response.json()
                    # Process JSON data (structure unknown, adapt as needed).
                    return self._process_api_data(data)
            except (requests.RequestException, json.JSONDecodeError):
                # Endpoint absent or returned non-JSON; try the next one.
                continue
        logger.info("No JSON API found, falling back to HTML scraping")
        return []

    def _process_api_data(self, data: Dict) -> List[Dict]:
        """Translate a JSON API payload into archive dicts.

        Placeholder — the real API response structure is unknown, so this
        currently logs a warning and returns no records.
        """
        logger.warning("API data processing not yet implemented")
        return []

    def export_to_csv(self, archives: List[Dict], filename: str):
        """Write archive records to OUTPUT_DIR/<filename> as UTF-8 CSV.

        Args:
            archives: Records produced by the scrape methods.
            filename: Bare file name; placed inside OUTPUT_DIR.
        """
        output_file = OUTPUT_DIR / filename
        if not archives:
            # BUG FIX: the original message contained a mangled "(unknown)"
            # placeholder instead of the target filename.
            logger.warning(f"No archives to export to {filename}")
            return
        fieldnames = ['municipality', 'archive_name', 'country', 'source', 'url']
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(archives)
        logger.info(f"Exported {len(archives)} records to {output_file}")

    def export_to_json(self, archives: List[Dict], filename: str):
        """Write archive records plus extraction metadata to
        OUTPUT_DIR/<filename> as pretty-printed UTF-8 JSON.

        Args:
            archives: Records produced by the scrape methods.
            filename: Bare file name; placed inside OUTPUT_DIR.
        """
        output_file = OUTPUT_DIR / filename
        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'Arkiv.dk Municipal Archive Directory',
            'source_url': 'https://arkiv.dk/arkiver',
            'scraper_version': '1.0.0',
            'record_count': len(archives),
            'notes': 'Danish local archives do NOT have ISIL codes - ISIL in Denmark is only for libraries',
            'archives': archives,
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Danish characters (æ, ø, å) readable.
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        logger.info(f"Exported {len(archives)} records to {output_file}")
def main():
    """Main execution function.

    Tries the (probably nonexistent) JSON API first, falls back to HTML
    scraping of the archive directory, then exports results to CSV and
    JSON under OUTPUT_DIR. Logs recommendations for alternative data
    sources if nothing was scraped.
    """
    logger.info("=== Danish Archive Directory Scraper (Arkiv.dk) ===")
    logger.info("Respectful web scraping with 2-second rate limiting")
    logger.info("NOTE: Danish archives do NOT have ISIL codes")
    logger.info("      ISIL codes in Denmark (DK-*) are ONLY for libraries\n")
    scraper = DanishArchiveScraper()
    # Try API first (likely to fail, but worth checking).
    logger.info("--- Phase 1: Checking for JSON API ---")
    archives = scraper.scrape_all_municipalities_via_api()
    # Fall back to HTML scraping when the API probe found nothing.
    if not archives:
        logger.info("\n--- Phase 2: HTML Scraping of Archive Directory ---")
        archives = scraper.scrape_archive_list()
    if archives:
        # Export results in both formats.
        scraper.export_to_csv(archives, "danish_archives_arkivdk.csv")
        scraper.export_to_json(archives, "danish_archives_arkivdk.json")
        logger.info(f"\n=== Scraping Complete ===")
        logger.info(f"Total archives extracted: {len(archives)}")
        logger.info(f"Output directory: {OUTPUT_DIR}")
        # Show a small sample so the operator can sanity-check the run.
        logger.info(f"\nSample records (first 5):")
        for archive in archives[:5]:
            # BUG FIX: the original f-string concatenated municipality and
            # archive name with no separator (mangled format string).
            logger.info(f"  {archive['municipality']} -> {archive['archive_name']}")
    else:
        logger.error("No archives were scraped. Check logs for errors.")
        logger.info("\n=== Alternative Data Source Recommendations ===")
        logger.info("1. Arkivvejviser.dk - Geographic archive directory with map")
        logger.info("2. Contact Sammenslutningen af Lokalarkiver (SLA) for member list")
        logger.info("3. Rigsarkivet - National archives may have comprehensive registry")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()