#!/usr/bin/env python3
"""
Danish Archives Web Scraper (Arkiv.dk) - Playwright Version

Uses browser automation with JavaScript evaluation (no clicking needed).

Author: GLAM Data Extraction Project
Date: 2025-11-19
License: MIT
"""
import asyncio
import csv
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Output directory: <repo-root>/data/isil/denmark.
# Created lazily by the export functions so that merely importing this module
# has no filesystem side effects.
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "denmark"

# Canonical source page; also stamped onto every exported record.
SOURCE_URL = 'https://arkiv.dk/arkiver'

# JavaScript evaluated once in the page: harvests every municipality accordion
# panel in a single DOM pass instead of clicking each panel open.
_EXTRACT_JS = '''() => {
    const results = [];
    const links = document.querySelectorAll('h4.panel-title a[data-toggle="collapse"]');
    links.forEach(link => {
        const municipality = link.textContent.trim();
        const href = link.getAttribute('href');
        if (href && href.startsWith('#')) {
            const panelId = href.substring(1);
            const panel = document.getElementById(panelId);
            if (panel) {
                const archiveText = panel.textContent.trim();
                results.push({
                    municipality: municipality,
                    archiveText: archiveText
                });
            }
        }
    });
    return results;
}'''


def _make_record(municipality: str, archive_name: str) -> Dict:
    """Build one flat archive record in the export schema."""
    return {
        'municipality': municipality,
        'archive_name': archive_name,
        'source': 'Arkiv.dk',
        'country': 'DK',
        'url': SOURCE_URL,
    }


def parse_archives(raw_data: List[Dict]) -> List[Dict]:
    """
    Flatten raw {municipality, archiveText} pairs into archive records.

    Args:
        raw_data: Items extracted from the page; each has 'municipality' and
            'archiveText' keys. 'archiveText' may contain newlines when one
            panel lists several archives.

    Returns:
        One dict per archive with keys: municipality, archive_name, source,
        country, url. Panels reading 'Ingen' (Danish "none") are skipped.
    """
    archives: List[Dict] = []
    for idx, item in enumerate(raw_data, 1):
        municipality = item['municipality']
        archive_text = item['archiveText']

        # Skip "Ingen" (no contribution from this municipality)
        if archive_text == 'Ingen' or not archive_text:
            logger.info(f"[{idx}] {municipality} → (No contribution)")
            continue

        # Special-collections panel lists many archives, one per line
        if 'Specialsamlinger' in municipality:
            special_archives = [line.strip() for line in archive_text.split('\n') if line.strip()]
            logger.info(f"[{idx}] Specialsamlinger → {len(special_archives)} archives")
            archives.extend(_make_record('Specialsamlinger', name) for name in special_archives)

        # Municipalities with multiple local archives (newline-separated).
        # NOTE(review): the `> 1` threshold means a panel with exactly two
        # lines (one newline) falls through to the single-archive branch and
        # is stored as one record with an embedded newline — confirm intended.
        elif '\n' in archive_text and archive_text.count('\n') > 1:
            local_archives = [line.strip() for line in archive_text.split('\n') if line.strip()]
            logger.info(f"[{idx}] {municipality} → {len(local_archives)} local archives")
            archives.extend(_make_record(municipality, name) for name in local_archives)

        # Single archive for the municipality
        else:
            archives.append(_make_record(municipality, archive_text))
            logger.info(f"[{idx}] {municipality} → {archive_text}")

    return archives


async def scrape_danish_archives() -> List[Dict]:
    """
    Scrape the Arkiv.dk archive directory using Playwright + JS evaluation.

    Much faster than clicking — extracts all panel data at once from the DOM.

    Returns:
        A list of archive record dicts (see parse_archives), or [] if the
        scrape fails for any reason (the error is logged, not raised).
    """
    from playwright.async_api import async_playwright

    logger.info("Starting Playwright browser automation...")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        try:
            logger.info("Navigating to https://arkiv.dk/arkiver")
            await page.goto(SOURCE_URL, wait_until='networkidle')

            # Wait for the accordion headers, then a grace period for JS rendering
            await page.wait_for_selector('h4.panel-title', timeout=15000)
            await page.wait_for_timeout(2000)

            logger.info("Extracting archive data via JavaScript...")
            raw_data = await page.evaluate(_EXTRACT_JS)
        except Exception as e:
            # Top-level scraper boundary: log and degrade to an empty result
            logger.error(f"Error during scraping: {e}", exc_info=True)
            return []
        finally:
            # Single close path (the original closed twice on the error path)
            await browser.close()

    archives = parse_archives(raw_data)
    logger.info(f"Successfully scraped {len(archives)} archive records from {len(raw_data)} municipalities")
    return archives


def export_to_csv(archives: List[Dict], filename: str) -> None:
    """Export scraped data to CSV under OUTPUT_DIR (created if missing)."""
    if not archives:
        # Bug fix: the warning previously lacked the filename interpolation
        logger.warning(f"No archives to export to {filename}")
        return

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_file = OUTPUT_DIR / filename
    fieldnames = ['municipality', 'archive_name', 'country', 'source', 'url']
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(archives)

    logger.info(f"Exported {len(archives)} records to {output_file}")


def export_to_json(archives: List[Dict], filename: str) -> None:
    """Export scraped data to JSON (wrapped with extraction metadata)."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_file = OUTPUT_DIR / filename

    metadata = {
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'data_source': 'Arkiv.dk Municipal Archive Directory',
        'source_url': 'https://arkiv.dk/arkiver',
        'scraper_version': '2.0.0-playwright-js-eval',
        'record_count': len(archives),
        'notes': 'Danish local archives do NOT have ISIL codes - ISIL in Denmark is only for libraries',
        'archives': archives
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    logger.info(f"Exported {len(archives)} records to {output_file}")


async def main():
    """Main execution function: scrape, export, and summarize."""
    logger.info("=== Danish Archive Directory Scraper (Arkiv.dk - Playwright) ===")
    logger.info("Using browser automation with JavaScript evaluation")
    logger.info("NOTE: Danish archives do NOT have ISIL codes")
    logger.info("      ISIL codes in Denmark (DK-*) are ONLY for libraries\n")

    archives = await scrape_danish_archives()

    if not archives:
        logger.error("No archives were scraped. Check logs for errors.")
        return

    # Export results
    export_to_csv(archives, "danish_archives_arkivdk.csv")
    export_to_json(archives, "danish_archives_arkivdk.json")

    logger.info(f"\n=== Scraping Complete ===")
    logger.info(f"Total archives extracted: {len(archives)}")
    logger.info(f"Output directory: {OUTPUT_DIR}")

    # Count by category
    municipal = [a for a in archives if a['municipality'] != 'Specialsamlinger']
    special = [a for a in archives if a['municipality'] == 'Specialsamlinger']
    logger.info(f"\nBreakdown:")
    logger.info(f"  Municipal archives: {len(municipal)}")
    logger.info(f"  Special collections: {len(special)}")

    # Show samples
    logger.info(f"\nSample municipal archives (first 10):")
    for archive in municipal[:10]:
        logger.info(f"  {archive['municipality']} → {archive['archive_name']}")

    logger.info(f"\nSample special collections (first 10):")
    for archive in special[:10]:
        logger.info(f"  {archive['archive_name']}")


if __name__ == "__main__":
    asyncio.run(main())