#!/usr/bin/env python3
"""
Danish Archives Web Scraper (Arkiv.dk) - Playwright Version
Uses browser automation with JavaScript evaluation (no clicking needed).

Author: GLAM Data Extraction Project
Date: 2025-11-19
License: MIT
"""
|
|
|
|
import asyncio
|
|
import csv
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import List, Dict
|
|
import logging
|
|
|
|
# Configure logging: timestamped INFO-level messages to the root handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Output directory: <repo root>/data/isil/denmark, resolved relative to this
# script's location (three levels up from the file). Created eagerly at
# import time so the export functions can assume it exists.
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "denmark"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def _archive_record(municipality: str, archive_name: str) -> Dict:
    """Build one flat output record for a single archive."""
    return {
        'municipality': municipality,
        'archive_name': archive_name,
        'source': 'Arkiv.dk',
        'country': 'DK',
        'url': 'https://arkiv.dk/arkiver'
    }


def _parse_raw_archives(raw_data: List[Dict]) -> List[Dict]:
    """Flatten raw {municipality, archiveText} pairs into archive records.

    Rules (mirroring the page layout):
      * 'Ingen' or empty text means the municipality contributes nothing.
      * 'Specialsamlinger' panels list several archives, one per line.
      * Panels with more than one newline are multiple local archives.
      * Anything else is a single archive for that municipality.
    """
    archives = []
    for idx, item in enumerate(raw_data, 1):
        municipality = item['municipality']
        archive_text = item['archiveText']

        # Skip "Ingen" (no contribution)
        if archive_text == 'Ingen' or not archive_text:
            logger.info(f"[{idx}] {municipality} → (No contribution)")
            continue

        # Handle Specialsamlinger (multiple archives)
        if 'Specialsamlinger' in municipality:
            names = [line.strip() for line in archive_text.split('\n') if line.strip()]
            logger.info(f"[{idx}] Specialsamlinger → {len(names)} archives")
            archives.extend(_archive_record('Specialsamlinger', name) for name in names)

        # Handle municipalities with multiple local archives (newline-separated).
        # count('\n') > 1 already implies '\n' is present, so no extra check.
        elif archive_text.count('\n') > 1:
            names = [line.strip() for line in archive_text.split('\n') if line.strip()]
            logger.info(f"[{idx}] {municipality} → {len(names)} local archives")
            archives.extend(_archive_record(municipality, name) for name in names)

        # Single archive for municipality
        else:
            archives.append(_archive_record(municipality, archive_text))
            logger.info(f"[{idx}] {municipality} → {archive_text}")

    return archives


async def scrape_danish_archives():
    """
    Scrape the Arkiv.dk archive directory using Playwright + JS evaluation.

    All panel data is extracted from the DOM in one page.evaluate() call —
    much faster than clicking each collapse panel open.

    Returns:
        List[Dict]: one record per archive with keys 'municipality',
        'archive_name', 'source', 'country', 'url'. Returns an empty list
        if any step of the scrape fails (the error is logged).
    """
    from playwright.async_api import async_playwright

    logger.info("Starting Playwright browser automation...")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Navigate to archives page
            logger.info("Navigating to https://arkiv.dk/arkiver")
            await page.goto('https://arkiv.dk/arkiver', wait_until='networkidle')

            # Wait for content to load
            await page.wait_for_selector('h4.panel-title', timeout=15000)
            await page.wait_for_timeout(2000)  # Extra wait for JS rendering

            logger.info("Extracting archive data via JavaScript...")

            # Execute JavaScript to extract all data at once
            raw_data = await page.evaluate('''() => {
                const results = [];
                const links = document.querySelectorAll('h4.panel-title a[data-toggle="collapse"]');

                links.forEach(link => {
                    const municipality = link.textContent.trim();
                    const href = link.getAttribute('href');

                    if (href && href.startsWith('#')) {
                        const panelId = href.substring(1);
                        const panel = document.getElementById(panelId);

                        if (panel) {
                            const archiveText = panel.textContent.trim();
                            results.push({
                                municipality: municipality,
                                archiveText: archiveText
                            });
                        }
                    }
                });

                return results;
            }''')

            archives = _parse_raw_archives(raw_data)
            logger.info(f"Successfully scraped {len(archives)} archive records from {len(raw_data)} municipalities")
            return archives

        except Exception as e:
            logger.error(f"Error during scraping: {e}", exc_info=True)
            return []
        finally:
            # Single close point: the original closed the browser on the
            # success path and again in the except handler, which could
            # double-close if parsing raised after the first close.
            await browser.close()
|
|
|
|
|
|
def export_to_csv(archives: List[Dict], filename: str):
    """Export scraped archive records to a CSV file inside OUTPUT_DIR.

    Args:
        archives: records with keys 'municipality', 'archive_name',
            'country', 'source', 'url'.
        filename: name of the CSV file to create inside OUTPUT_DIR.

    Writes nothing (and logs a warning) when *archives* is empty.
    """
    output_file = OUTPUT_DIR / filename

    if not archives:
        # Fixed: the original message had a corrupted placeholder
        # ("(unknown)") instead of naming the destination file.
        logger.warning(f"No archives to export to {output_file}")
        return

    fieldnames = ['municipality', 'archive_name', 'country', 'source', 'url']

    # newline='' lets the csv module control line endings (per csv docs).
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(archives)

    logger.info(f"Exported {len(archives)} records to {output_file}")
|
|
|
|
|
|
def export_to_json(archives: List[Dict], filename: str):
    """Write the scraped records to a JSON file, wrapped in metadata.

    The payload carries the UTC extraction timestamp, data provenance,
    scraper version, record count, and a note about ISIL applicability,
    with the records themselves under the 'archives' key.
    """
    output_file = OUTPUT_DIR / filename

    payload = dict(
        extraction_date=datetime.now(timezone.utc).isoformat(),
        data_source='Arkiv.dk Municipal Archive Directory',
        source_url='https://arkiv.dk/arkiver',
        scraper_version='2.0.0-playwright-js-eval',
        record_count=len(archives),
        notes='Danish local archives do NOT have ISIL codes - ISIL in Denmark is only for libraries',
        archives=archives,
    )

    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)

    logger.info(f"Exported {len(archives)} records to {output_file}")
|
|
|
|
|
|
async def main():
    """Run the scraper, export results, and log a summary breakdown."""
    logger.info("=== Danish Archive Directory Scraper (Arkiv.dk - Playwright) ===")
    logger.info("Using browser automation with JavaScript evaluation")
    logger.info("NOTE: Danish archives do NOT have ISIL codes")
    logger.info("      ISIL codes in Denmark (DK-*) are ONLY for libraries\n")

    archives = await scrape_danish_archives()

    # Guard clause: nothing scraped means nothing to export or summarize.
    if not archives:
        logger.error("No archives were scraped. Check logs for errors.")
        return

    # Export results
    export_to_csv(archives, "danish_archives_arkivdk.csv")
    export_to_json(archives, "danish_archives_arkivdk.json")

    logger.info("\n=== Scraping Complete ===")
    logger.info(f"Total archives extracted: {len(archives)}")
    logger.info(f"Output directory: {OUTPUT_DIR}")

    # Partition into municipal archives vs. special collections in one pass.
    municipal, special = [], []
    for record in archives:
        bucket = special if record['municipality'] == 'Specialsamlinger' else municipal
        bucket.append(record)

    logger.info("\nBreakdown:")
    logger.info(f"  Municipal archives: {len(municipal)}")
    logger.info(f"  Special collections: {len(special)}")

    # Show a short sample of each category.
    logger.info("\nSample municipal archives (first 10):")
    for record in municipal[:10]:
        logger.info(f"  {record['municipality']} → {record['archive_name']}")

    logger.info("\nSample special collections (first 10):")
    for record in special[:10]:
        logger.info(f"  {record['archive_name']}")
|
|
|
|
|
|
# Script entry point: drive the async pipeline with asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())
|