#!/usr/bin/env python3
|
|
"""
|
|
Danish Archives Web Scraper (Arkiv.dk)
|
|
Extracts Danish local archive data from Arkiv.dk municipal archive directory.
|
|
|
|
Author: GLAM Data Extraction Project
|
|
Date: 2025-11-19
|
|
License: MIT
|
|
"""
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import time
|
|
import csv
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional
|
|
import logging
|
|
|
|
# Configure logging: timestamped INFO-level messages to stderr for the
# whole script (also applies to any library that logs via the root handler).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Output directory, resolved relative to this file's location:
# <repo-root>/data/isil/denmark (three levels up from this script).
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "denmark"
# NOTE: import-time side effect — creates the directory tree on module load
# so the export methods can assume it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
class DanishArchiveScraper:
    """Scrapes Danish municipal archive data from Arkiv.dk.

    Fetches the /arkiver directory page once, pairs each municipality tab
    button with the archive name in its associated tabpanel, and exports
    the resulting flat records to CSV and JSON under OUTPUT_DIR.
    """

    def __init__(self):
        # One shared session: connection pooling plus these headers on
        # every request made by this scraper.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Data Extraction Bot; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'da,en-US;q=0.9,en;q=0.8',
        })
        self.rate_limit_delay = 2  # Respectful 2-second delay between requests
        self.base_url = "https://arkiv.dk"
        # Fail fast instead of hanging forever on an unresponsive server.
        self.request_timeout = 30

    def scrape_archive_list(self) -> List[Dict]:
        """
        Scrape Arkiv.dk municipal archive directory.

        Returns:
            List of archive dictionaries with keys
            'municipality', 'archive_name', 'source', 'country', 'url';
            empty list if the directory page cannot be fetched.
        """
        logger.info("Starting Arkiv.dk archive directory scrape...")

        archives_url = f"{self.base_url}/arkiver"

        try:
            # BUGFIX: the original call had no timeout and could block
            # indefinitely; the API probe method already used one.
            response = self.session.get(archives_url, timeout=self.request_timeout)
            response.raise_for_status()
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')

            archives = []

            # Find all tab buttons (one per municipality).
            # Pattern: <button>Municipality Name</button> followed by a
            # tabpanel <div> that contains the archive name.
            tab_buttons = soup.find_all(
                'button',
                attrs={'aria-expanded': lambda x: x in ['true', 'false']},
            )

            logger.info(f"Found {len(tab_buttons)} municipal archive tabs")

            for idx, button in enumerate(tab_buttons, 1):
                # Extract municipality name from button text
                municipality = button.get_text(strip=True)

                # The tabpanel holding the archive name is the next sibling
                # <div role="tabpanel"> of the button's parent element.
                parent = button.find_parent()
                if not parent:
                    continue
                tabpanel = parent.find_next_sibling('div', attrs={'role': 'tabpanel'})
                if not tabpanel:
                    continue

                archive_name = tabpanel.get_text(strip=True)

                # Skip if no archive name found
                if not archive_name or archive_name == municipality:
                    logger.warning(f"[{idx}/{len(tab_buttons)}] No archive name found for {municipality}")
                    continue

                archives.append({
                    'municipality': municipality,
                    'archive_name': archive_name,
                    'source': 'Arkiv.dk',
                    'country': 'DK',
                    'url': archives_url,
                })
                logger.info(f"[{idx}/{len(tab_buttons)}] Extracted: {municipality} → {archive_name}")

                # BUGFIX: removed the per-iteration time.sleep() that was
                # here. Only ONE HTTP request is made (above); this loop
                # parses already-downloaded HTML, so sleeping 2 s per tab
                # added minutes of dead time without reducing server load.
                # self.rate_limit_delay is kept for request-level throttling.

            logger.info(f"Successfully scraped {len(archives)} archive records from Arkiv.dk")
            return archives

        except requests.RequestException as e:
            logger.error(f"Error fetching Arkiv.dk archive directory: {e}")
            return []

    def scrape_all_municipalities_via_api(self) -> List[Dict]:
        """
        Alternative method: Try to find a JSON API endpoint.
        Arkiv.dk may have an API for archive data.

        Probes a few candidate URLs; any that answers 200 with parseable
        JSON is handed to _process_api_data.

        Returns:
            List of archive dictionaries (currently always empty — see
            _process_api_data placeholder).
        """
        logger.info("Attempting API-based scrape (if available)...")

        # Check if Arkiv.dk has a JSON endpoint
        api_candidates = [
            f"{self.base_url}/api/archives",
            f"{self.base_url}/arkiver.json",
            f"{self.base_url}/api/arkiver",
        ]

        for api_url in api_candidates:
            try:
                response = self.session.get(api_url, timeout=10)
                if response.status_code == 200:
                    logger.info(f"Found API endpoint: {api_url}")
                    data = response.json()
                    # Process JSON data (structure unknown, adapt as needed)
                    return self._process_api_data(data)
            except (requests.RequestException, json.JSONDecodeError):
                # Candidate does not exist or is not JSON — try the next one.
                continue

        logger.info("No JSON API found, falling back to HTML scraping")
        return []

    def _process_api_data(self, data: Dict) -> List[Dict]:
        """Process JSON API response.

        Placeholder — the API's schema is unknown, so this deliberately
        returns [] to force the HTML fallback; adapt once a real endpoint
        and its structure are confirmed.
        """
        logger.warning("API data processing not yet implemented")
        return []

    def export_to_csv(self, archives: List[Dict], filename: str):
        """Export scraped data to CSV.

        Args:
            archives: Records as produced by scrape_archive_list().
            filename: Bare file name; written inside OUTPUT_DIR.
        """
        output_file = OUTPUT_DIR / filename

        if not archives:
            # BUGFIX: the message previously contained the literal text
            # "(unknown)" instead of the actual destination path.
            logger.warning(f"No archives to export to {output_file}")
            return

        fieldnames = ['municipality', 'archive_name', 'country', 'source', 'url']

        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(archives)

        logger.info(f"Exported {len(archives)} records to {output_file}")

    def export_to_json(self, archives: List[Dict], filename: str):
        """Export scraped data to JSON with metadata.

        Wraps the records in an envelope carrying extraction date (UTC),
        source attribution, and record count.

        Args:
            archives: Records as produced by scrape_archive_list().
            filename: Bare file name; written inside OUTPUT_DIR.
        """
        output_file = OUTPUT_DIR / filename

        metadata = {
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'data_source': 'Arkiv.dk Municipal Archive Directory',
            'source_url': 'https://arkiv.dk/arkiver',
            'scraper_version': '1.0.0',
            'record_count': len(archives),
            'notes': 'Danish local archives do NOT have ISIL codes - ISIL in Denmark is only for libraries',
            'archives': archives
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Danish characters (æ, ø, å) readable.
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        logger.info(f"Exported {len(archives)} records to {output_file}")
|
|
|
|
|
|
def main():
    """Entry point: scrape the Arkiv.dk directory and export the results."""
    logger.info("=== Danish Archive Directory Scraper (Arkiv.dk) ===")
    logger.info("Respectful web scraping with 2-second rate limiting")
    logger.info("NOTE: Danish archives do NOT have ISIL codes")
    logger.info("      ISIL codes in Denmark (DK-*) are ONLY for libraries\n")

    scraper = DanishArchiveScraper()

    # Phase 1: probe for a JSON API; Phase 2: HTML scrape as fallback.
    logger.info("--- Phase 1: Checking for JSON API ---")
    records = scraper.scrape_all_municipalities_via_api()
    if not records:
        logger.info("\n--- Phase 2: HTML Scraping of Archive Directory ---")
        records = scraper.scrape_archive_list()

    # Guard clause: nothing scraped — report and suggest alternatives.
    if not records:
        logger.error("No archives were scraped. Check logs for errors.")
        logger.info("\n=== Alternative Data Source Recommendations ===")
        logger.info("1. Arkivvejviser.dk - Geographic archive directory with map")
        logger.info("2. Contact Sammenslutningen af Lokalarkiver (SLA) for member list")
        logger.info("3. Rigsarkivet - National archives may have comprehensive registry")
        return

    # Persist to both formats under OUTPUT_DIR.
    scraper.export_to_csv(records, "danish_archives_arkivdk.csv")
    scraper.export_to_json(records, "danish_archives_arkivdk.json")

    logger.info("\n=== Scraping Complete ===")
    logger.info(f"Total archives extracted: {len(records)}")
    logger.info(f"Output directory: {OUTPUT_DIR}")

    # Show a small preview of what was extracted.
    logger.info("\nSample records (first 5):")
    for record in records[:5]:
        logger.info(f"  {record['municipality']} → {record['archive_name']}")


if __name__ == "__main__":
    main()
|