#!/usr/bin/env python3
"""
Danish Archives Web Scraper (Arkiv.dk)

Extracts Danish local archive data from Arkiv.dk municipal archive directory.

Author: GLAM Data Extraction Project
Date: 2025-11-19
License: MIT
"""

import csv
import json
import logging
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Output directory (created eagerly at import time so exports never fail on
# a missing parent directory)
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "denmark"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


class DanishArchiveScraper:
    """Scrapes Danish municipal archive data from Arkiv.dk."""

    def __init__(self):
        # Honest bot User-Agent plus Danish-preferred content negotiation.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'da,en-US;q=0.9,en;q=0.8',
        })
        self.rate_limit_delay = 2  # Respectful 2-second delay between HTTP requests
        self.base_url = "https://arkiv.dk"

    def scrape_archive_list(self) -> List[Dict]:
        """
        Scrape Arkiv.dk municipal archive directory.

        Issues a single GET for the directory page, then parses one
        municipality tab button per archive.

        Returns:
            List of archive dicts with keys: municipality, archive_name,
            source, country, url. Empty list on any request failure.
        """
        logger.info("Starting Arkiv.dk archive directory scrape...")
        archives_url = f"{self.base_url}/arkiver"

        try:
            response = self.session.get(archives_url)
            response.raise_for_status()
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')
            archives = []

            # Find all tab buttons (one per municipality).
            # Pattern: button followed by a tabpanel holding the archive name.
            tab_buttons = soup.find_all(
                'button',
                attrs={'aria-expanded': lambda x: x in ['true', 'false']}
            )
            logger.info(f"Found {len(tab_buttons)} municipal archive tabs")

            # BUGFIX: the previous version slept rate_limit_delay seconds on
            # every loop iteration, but the whole directory arrives in the
            # single request above — the loop only parses already-downloaded
            # HTML, so the per-item sleep added minutes of dead time without
            # rate-limiting anything. No network I/O happens below.
            for idx, button in enumerate(tab_buttons, 1):
                # Municipality name comes from the button text.
                municipality = button.get_text(strip=True)

                # The associated tabpanel (sibling of the button's parent)
                # contains the archive name; skip silently when the expected
                # structure is missing, matching the original behavior.
                parent = button.find_parent()
                if not parent:
                    continue
                tabpanel = parent.find_next_sibling('div', attrs={'role': 'tabpanel'})
                if not tabpanel:
                    continue

                archive_name = tabpanel.get_text(strip=True)

                # Skip if no archive name found (or it merely repeats the
                # municipality, which indicates an empty panel).
                if not archive_name or archive_name == municipality:
                    logger.warning(
                        f"[{idx}/{len(tab_buttons)}] No archive name found for {municipality}"
                    )
                    continue

                archives.append({
                    'municipality': municipality,
                    'archive_name': archive_name,
                    'source': 'Arkiv.dk',
                    'country': 'DK',
                    'url': archives_url
                })
                logger.info(
                    f"[{idx}/{len(tab_buttons)}] Extracted: {municipality} → {archive_name}"
                )

            logger.info(f"Successfully scraped {len(archives)} archive records from Arkiv.dk")
            return archives

        except requests.RequestException as e:
            logger.error(f"Error fetching Arkiv.dk archive directory: {e}")
            return []

    def scrape_all_municipalities_via_api(self) -> List[Dict]:
        """
        Alternative method: Try to find a JSON API endpoint.

        Arkiv.dk may have an API for archive data.

        Returns:
            List of archive dictionaries (empty if no endpoint responds).
        """
        logger.info("Attempting API-based scrape (if available)...")

        # Check if Arkiv.dk has a JSON endpoint
        api_candidates = [
            f"{self.base_url}/api/archives",
            f"{self.base_url}/arkiver.json",
            f"{self.base_url}/api/arkiver",
        ]

        for api_url in api_candidates:
            try:
                response = self.session.get(api_url, timeout=10)
                if response.status_code == 200:
                    logger.info(f"Found API endpoint: {api_url}")
                    data = response.json()
                    # Process JSON data (structure unknown, adapt as needed)
                    return self._process_api_data(data)
            except (requests.RequestException, json.JSONDecodeError):
                continue
            finally:
                # Rate limiting belongs here, between real HTTP probes —
                # the previous version issued these back-to-back with no
                # delay while sleeping in the (network-free) parse loop.
                time.sleep(self.rate_limit_delay)

        logger.info("No JSON API found, falling back to HTML scraping")
        return []

    def _process_api_data(self, data: Dict) -> List[Dict]:
        """Process JSON API response."""
        # Placeholder - adapt based on actual API structure
        logger.warning("API data processing not yet implemented")
        return []

    def export_to_csv(self, archives: List[Dict], filename: str):
        """Export scraped data to CSV under OUTPUT_DIR/filename."""
        output_file = OUTPUT_DIR / filename

        if not archives:
            # BUGFIX: the old message contained a literal "(unknown)"
            # placeholder instead of the target path; name the file so the
            # warning is actionable.
            logger.warning(f"No archives to export to {output_file}")
            return

        fieldnames = ['municipality', 'archive_name', 'country', 'source', 'url']
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(archives)

        logger.info(f"Exported {len(archives)} records to {output_file}")

    def export_to_json(self, archives: List[Dict], filename: str):
        """Export scraped data to JSON with extraction metadata envelope."""
        output_file = OUTPUT_DIR / filename

        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'Arkiv.dk Municipal Archive Directory',
            'source_url': 'https://arkiv.dk/arkiver',
            'scraper_version': '1.0.0',
            'record_count': len(archives),
            'notes': 'Danish local archives do NOT have ISIL codes - ISIL in Denmark is only for libraries',
            'archives': archives
        }

        # ensure_ascii=False keeps Danish characters (æ, ø, å) readable.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        logger.info(f"Exported {len(archives)} records to {output_file}")


def main():
    """Main execution function."""
    logger.info("=== Danish Archive Directory Scraper (Arkiv.dk) ===")
    logger.info("Respectful web scraping with 2-second rate limiting")
    logger.info("NOTE: Danish archives do NOT have ISIL codes")
    logger.info("      ISIL codes in Denmark (DK-*) are ONLY for libraries\n")

    scraper = DanishArchiveScraper()

    # Try API first (likely to fail, but worth checking)
    logger.info("--- Phase 1: Checking for JSON API ---")
    archives = scraper.scrape_all_municipalities_via_api()

    # Fall back to HTML scraping
    if not archives:
        logger.info("\n--- Phase 2: HTML Scraping of Archive Directory ---")
        archives = scraper.scrape_archive_list()

    if archives:
        # Export results
        scraper.export_to_csv(archives, "danish_archives_arkivdk.csv")
        scraper.export_to_json(archives, "danish_archives_arkivdk.json")

        logger.info(f"\n=== Scraping Complete ===")
        logger.info(f"Total archives extracted: {len(archives)}")
        logger.info(f"Output directory: {OUTPUT_DIR}")

        # Show sample
        logger.info(f"\nSample records (first 5):")
        for archive in archives[:5]:
            logger.info(f"  {archive['municipality']} → {archive['archive_name']}")
    else:
        logger.error("No archives were scraped. Check logs for errors.")
        logger.info("\n=== Alternative Data Source Recommendations ===")
        logger.info("1. Arkivvejviser.dk - Geographic archive directory with map")
        logger.info("2. Contact Sammenslutningen af Lokalarkiver (SLA) for member list")
        logger.info("3. Rigsarkivet - National archives may have comprehensive registry")


if __name__ == "__main__":
    main()