# Source listing metadata (repository-viewer residue, kept as a comment so the
# file remains valid Python):
#   glam/scripts/scrapers/scrape_danish_archives_playwright.py
#   2025-11-19 23:25:22 +01:00
#   219 lines, 8.5 KiB, Python, executable file

#!/usr/bin/env python3
"""
Danish Archives Web Scraper (Arkiv.dk) - Playwright Version
Uses browser automation with JavaScript evaluation (no clicking needed).
Author: GLAM Data Extraction Project
Date: 2025-11-19
License: MIT
"""
import asyncio
import csv
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict
import logging
# Module-wide logging: timestamped INFO-level messages to stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# All exports land under <repo-root>/data/isil/denmark; create it up front.
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "denmark"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def _make_record(municipality: str, archive_name: str) -> Dict:
    """Build one flat archive record for the output datasets."""
    return {
        'municipality': municipality,
        'archive_name': archive_name,
        'source': 'Arkiv.dk',
        'country': 'DK',
        'url': 'https://arkiv.dk/arkiver',
    }


async def scrape_danish_archives() -> List[Dict]:
    """
    Scrape the Arkiv.dk archive directory using Playwright + JavaScript evaluation.

    Loads https://arkiv.dk/arkiver once and extracts every municipality/panel
    pair in a single `page.evaluate` pass — much faster than clicking each
    accordion entry.

    Returns:
        A list of flat records with keys 'municipality', 'archive_name',
        'source', 'country', 'url'. Returns an empty list on any error.
    """
    from playwright.async_api import async_playwright

    logger.info("Starting Playwright browser automation...")
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            logger.info("Navigating to https://arkiv.dk/arkiver")
            await page.goto('https://arkiv.dk/arkiver', wait_until='networkidle')
            # Wait for the accordion headers, plus a grace period for JS rendering.
            await page.wait_for_selector('h4.panel-title', timeout=15000)
            await page.wait_for_timeout(2000)
            logger.info("Extracting archive data via JavaScript...")
            # Pull every (municipality header, collapsed panel text) pair out of
            # the DOM in one round trip.
            raw_data = await page.evaluate('''() => {
                const results = [];
                const links = document.querySelectorAll('h4.panel-title a[data-toggle="collapse"]');
                links.forEach(link => {
                    const municipality = link.textContent.trim();
                    const href = link.getAttribute('href');
                    if (href && href.startsWith('#')) {
                        const panelId = href.substring(1);
                        const panel = document.getElementById(panelId);
                        if (panel) {
                            results.push({
                                municipality: municipality,
                                archiveText: panel.textContent.trim()
                            });
                        }
                    }
                });
                return results;
            }''')

            # Normalise the raw panel text into flat records.
            archives: List[Dict] = []
            for idx, item in enumerate(raw_data, 1):
                municipality = item['municipality']
                archive_text = item['archiveText']
                # "Ingen" (Danish: "none") marks a municipality with no contribution.
                if not archive_text or archive_text == 'Ingen':
                    logger.info(f"[{idx}] {municipality} → (No contribution)")
                    continue
                if 'Specialsamlinger' in municipality:
                    # Special collections: one archive per non-empty line.
                    names = [line.strip() for line in archive_text.split('\n') if line.strip()]
                    logger.info(f"[{idx}] Specialsamlinger → {len(names)} archives")
                    archives.extend(_make_record('Specialsamlinger', name) for name in names)
                elif archive_text.count('\n') > 1:
                    # Municipality with several local archives, newline-separated.
                    # NOTE(review): a two-line panel (count == 1) still falls through
                    # to the single-archive branch — preserved from the original.
                    names = [line.strip() for line in archive_text.split('\n') if line.strip()]
                    logger.info(f"[{idx}] {municipality} → {len(names)} local archives")
                    archives.extend(_make_record(municipality, name) for name in names)
                else:
                    # Single archive for this municipality.
                    archives.append(_make_record(municipality, archive_text))
                    logger.info(f"[{idx}] {municipality} → {archive_text}")

            logger.info(
                f"Successfully scraped {len(archives)} archive records "
                f"from {len(raw_data)} municipalities"
            )
            return archives
        except Exception as e:
            logger.error(f"Error during scraping: {e}", exc_info=True)
            return []
        finally:
            # Close exactly once, on both success and failure paths (the
            # original closed in-line and again in `except`, risking a
            # double close when an error occurred after the first close).
            await browser.close()
def export_to_csv(archives: List[Dict], filename: str):
    """Export scraped archive records to a CSV file inside OUTPUT_DIR.

    Args:
        archives: Flat records as produced by scrape_danish_archives().
        filename: Name of the CSV file to create under OUTPUT_DIR.

    Does nothing (beyond a warning) when *archives* is empty.
    """
    output_file = OUTPUT_DIR / filename
    if not archives:
        # Fix: the previous message was an f-string with no placeholder
        # ("... to (unknown)"); report the actual target path instead.
        logger.warning(f"No archives to export to {output_file}")
        return
    fieldnames = ['municipality', 'archive_name', 'country', 'source', 'url']
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(archives)
    logger.info(f"Exported {len(archives)} records to {output_file}")
def export_to_json(archives: List[Dict], filename: str):
    """Write the scraped records, wrapped in provenance metadata, as JSON.

    Args:
        archives: Flat records as produced by scrape_danish_archives().
        filename: Name of the JSON file to create under OUTPUT_DIR.
    """
    target = OUTPUT_DIR / filename
    payload = {
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'data_source': 'Arkiv.dk Municipal Archive Directory',
        'source_url': 'https://arkiv.dk/arkiver',
        'scraper_version': '2.0.0-playwright-js-eval',
        'record_count': len(archives),
        'notes': 'Danish local archives do NOT have ISIL codes - ISIL in Denmark is only for libraries',
        'archives': archives,
    }
    # Keep Danish characters readable in the output (no \\uXXXX escapes).
    with open(target, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
    logger.info(f"Exported {len(archives)} records to {target}")
async def main():
    """Run the scraper and export CSV/JSON results with a summary report.

    Fix: dropped the stray `f` prefix from format-free f-strings (F541);
    the logged text is unchanged.
    """
    logger.info("=== Danish Archive Directory Scraper (Arkiv.dk - Playwright) ===")
    logger.info("Using browser automation with JavaScript evaluation")
    logger.info("NOTE: Danish archives do NOT have ISIL codes")
    logger.info("      ISIL codes in Denmark (DK-*) are ONLY for libraries\n")
    archives = await scrape_danish_archives()
    if not archives:
        logger.error("No archives were scraped. Check logs for errors.")
        return
    # Export results in both formats.
    export_to_csv(archives, "danish_archives_arkivdk.csv")
    export_to_json(archives, "danish_archives_arkivdk.json")
    logger.info("\n=== Scraping Complete ===")
    logger.info(f"Total archives extracted: {len(archives)}")
    logger.info(f"Output directory: {OUTPUT_DIR}")
    # Breakdown: municipal archives vs. special collections.
    municipal = [a for a in archives if a['municipality'] != 'Specialsamlinger']
    special = [a for a in archives if a['municipality'] == 'Specialsamlinger']
    logger.info("\nBreakdown:")
    logger.info(f"  Municipal archives: {len(municipal)}")
    logger.info(f"  Special collections: {len(special)}")
    # Show a sample of each category.
    logger.info("\nSample municipal archives (first 10):")
    for archive in municipal[:10]:
        logger.info(f"  {archive['municipality']} → {archive['archive_name']}")
    logger.info("\nSample special collections (first 10):")
    for archive in special[:10]:
        logger.info(f"  {archive['archive_name']}")
if __name__ == "__main__":
    # Script entry point: drive the async scraper pipeline to completion.
    asyncio.run(main())