glam/scripts/scrapers/scrape_danish_archives_arkivdk.py
2025-11-19 23:25:22 +01:00

226 lines
8.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Danish Archives Web Scraper (Arkiv.dk)
Extracts Danish local archive data from Arkiv.dk municipal archive directory.
Author: GLAM Data Extraction Project
Date: 2025-11-19
License: MIT
"""
import requests
from bs4 import BeautifulSoup
import time
import csv
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional
import logging
# Configure logging
# Module-level logger configuration: timestamped INFO-level output to stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Output directory
# Resolved relative to this script: <repo>/data/isil/denmark.
# NOTE(review): the directory is created at import time as a side effect —
# importing this module touches the filesystem.
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "denmark"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
class DanishArchiveScraper:
    """Scrapes Danish municipal archive data from Arkiv.dk.

    Fetches the municipal archive directory page once, then parses the
    tab-button / tabpanel structure to pair each municipality with its
    local archive name. Also probes a few candidate JSON API endpoints
    before falling back to HTML scraping.
    """

    def __init__(self):
        # Persistent session so headers and cookies are reused across requests.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'da,en-US;q=0.9,en;q=0.8',
        })
        self.rate_limit_delay = 2  # Respectful delay (seconds) between HTTP requests
        self.base_url = "https://arkiv.dk"

    def scrape_archive_list(self) -> List[Dict]:
        """
        Scrape the Arkiv.dk municipal archive directory.

        Performs a single HTTP GET of the directory page, then parses all
        municipality tabs from the returned HTML (no further requests).

        Returns:
            List of archive dictionaries; empty list on request failure.
        """
        logger.info("Starting Arkiv.dk archive directory scrape...")
        archives_url = f"{self.base_url}/arkiver"
        try:
            # BUG FIX: added an explicit timeout so a stalled server cannot
            # hang the scraper forever (the API probe below already had one).
            response = self.session.get(archives_url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            archives = []
            # Find all tab buttons (one per municipality).
            # Pattern: <button>Municipality Name</button> followed by a
            # tabpanel <div> containing the archive name.
            tab_buttons = soup.find_all(
                'button',
                attrs={'aria-expanded': lambda x: x in ['true', 'false']}
            )
            logger.info(f"Found {len(tab_buttons)} municipal archive tabs")
            for idx, button in enumerate(tab_buttons, 1):
                # Municipality name is the button's visible text.
                municipality = button.get_text(strip=True)
                # The tabpanel holding the archive name is a following
                # sibling of the button's parent element.
                parent = button.find_parent()
                if not parent:
                    continue
                tabpanel = parent.find_next_sibling('div', attrs={'role': 'tabpanel'})
                if not tabpanel:
                    continue
                archive_name = tabpanel.get_text(strip=True)
                # Skip entries with no usable archive name.
                if not archive_name or archive_name == municipality:
                    logger.warning(
                        f"[{idx}/{len(tab_buttons)}] No archive name found for {municipality}"
                    )
                    continue
                archives.append({
                    'municipality': municipality,
                    'archive_name': archive_name,
                    'source': 'Arkiv.dk',
                    'country': 'DK',
                    'url': archives_url,
                })
                logger.info(
                    f"[{idx}/{len(tab_buttons)}] Extracted: {municipality} -> {archive_name}"
                )
                # BUG FIX: removed the per-iteration time.sleep() that used
                # to sit here. This loop only parses HTML that was already
                # downloaded (zero network traffic per iteration), so the
                # 2-second sleep added minutes of dead time per run without
                # being "respectful" to the server in any way. Rate limiting
                # now lives in scrape_all_municipalities_via_api(), where
                # repeated requests actually happen.
            logger.info(f"Successfully scraped {len(archives)} archive records from Arkiv.dk")
            return archives
        except requests.RequestException as e:
            logger.error(f"Error fetching Arkiv.dk archive directory: {e}")
            return []

    def scrape_all_municipalities_via_api(self) -> List[Dict]:
        """
        Alternative method: probe candidate JSON API endpoints on Arkiv.dk.

        Returns:
            Archive dictionaries parsed from the first endpoint that
            answers 200 with valid JSON; empty list otherwise (caller
            falls back to HTML scraping).
        """
        logger.info("Attempting API-based scrape (if available)...")
        # Candidate endpoints — Arkiv.dk may or may not expose any of these.
        api_candidates = [
            f"{self.base_url}/api/archives",
            f"{self.base_url}/arkiver.json",
            f"{self.base_url}/api/arkiver",
        ]
        for i, api_url in enumerate(api_candidates):
            # Rate-limit between successive probes (skip before the first):
            # unlike the HTML parse loop, this loop really does hit the
            # server once per iteration.
            if i:
                time.sleep(self.rate_limit_delay)
            try:
                response = self.session.get(api_url, timeout=10)
                if response.status_code == 200:
                    logger.info(f"Found API endpoint: {api_url}")
                    data = response.json()
                    # Process JSON data (structure unknown, adapt as needed).
                    return self._process_api_data(data)
            except (requests.RequestException, json.JSONDecodeError):
                # Endpoint absent or returned non-JSON; try the next one.
                continue
        logger.info("No JSON API found, falling back to HTML scraping")
        return []

    def _process_api_data(self, data: Dict) -> List[Dict]:
        """Translate a JSON API payload into archive dicts.

        Placeholder — the real API response structure is unknown, so this
        currently logs a warning and returns no records.
        """
        logger.warning("API data processing not yet implemented")
        return []

    def export_to_csv(self, archives: List[Dict], filename: str):
        """Write archive records to OUTPUT_DIR/<filename> as UTF-8 CSV.

        Args:
            archives: Records produced by the scrape methods.
            filename: Bare file name; placed inside OUTPUT_DIR.
        """
        output_file = OUTPUT_DIR / filename
        if not archives:
            # BUG FIX: the original message contained a mangled "(unknown)"
            # placeholder instead of the target filename.
            logger.warning(f"No archives to export to {filename}")
            return
        fieldnames = ['municipality', 'archive_name', 'country', 'source', 'url']
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(archives)
        logger.info(f"Exported {len(archives)} records to {output_file}")

    def export_to_json(self, archives: List[Dict], filename: str):
        """Write archive records plus extraction metadata to
        OUTPUT_DIR/<filename> as pretty-printed UTF-8 JSON.

        Args:
            archives: Records produced by the scrape methods.
            filename: Bare file name; placed inside OUTPUT_DIR.
        """
        output_file = OUTPUT_DIR / filename
        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'Arkiv.dk Municipal Archive Directory',
            'source_url': 'https://arkiv.dk/arkiver',
            'scraper_version': '1.0.0',
            'record_count': len(archives),
            'notes': 'Danish local archives do NOT have ISIL codes - ISIL in Denmark is only for libraries',
            'archives': archives,
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Danish characters (æ, ø, å) readable.
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        logger.info(f"Exported {len(archives)} records to {output_file}")
def main():
    """Main execution function.

    Tries the (probably nonexistent) JSON API first, falls back to HTML
    scraping of the archive directory, then exports results to CSV and
    JSON under OUTPUT_DIR. Logs recommendations for alternative data
    sources if nothing was scraped.
    """
    logger.info("=== Danish Archive Directory Scraper (Arkiv.dk) ===")
    logger.info("Respectful web scraping with 2-second rate limiting")
    logger.info("NOTE: Danish archives do NOT have ISIL codes")
    logger.info("      ISIL codes in Denmark (DK-*) are ONLY for libraries\n")
    scraper = DanishArchiveScraper()
    # Try API first (likely to fail, but worth checking).
    logger.info("--- Phase 1: Checking for JSON API ---")
    archives = scraper.scrape_all_municipalities_via_api()
    # Fall back to HTML scraping when the API probe found nothing.
    if not archives:
        logger.info("\n--- Phase 2: HTML Scraping of Archive Directory ---")
        archives = scraper.scrape_archive_list()
    if archives:
        # Export results in both formats.
        scraper.export_to_csv(archives, "danish_archives_arkivdk.csv")
        scraper.export_to_json(archives, "danish_archives_arkivdk.json")
        logger.info(f"\n=== Scraping Complete ===")
        logger.info(f"Total archives extracted: {len(archives)}")
        logger.info(f"Output directory: {OUTPUT_DIR}")
        # Show a small sample so the operator can sanity-check the run.
        logger.info(f"\nSample records (first 5):")
        for archive in archives[:5]:
            # BUG FIX: the original f-string concatenated municipality and
            # archive name with no separator (mangled format string).
            logger.info(f"  {archive['municipality']} -> {archive['archive_name']}")
    else:
        logger.error("No archives were scraped. Check logs for errors.")
        logger.info("\n=== Alternative Data Source Recommendations ===")
        logger.info("1. Arkivvejviser.dk - Geographic archive directory with map")
        logger.info("2. Contact Sammenslutningen af Lokalarkiver (SLA) for member list")
        logger.info("3. Rigsarkivet - National archives may have comprehensive registry")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()