glam/scripts/scrapers/harvest_archivportal_d.py
2025-11-19 23:25:22 +01:00

477 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Archivportal-D Archive Harvester
This script harvests archive listings from Archivportal-D, the national German
archive portal operated by Deutsche Digitale Bibliothek.
Portal: https://www.archivportal-d.de/
Coverage: All archives across Germany (state, municipal, church, business, etc.)
Method: Web scraping (fallback if API unavailable)
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup
# Configuration
ARCHIVPORTAL_BASE_URL = "https://www.archivportal-d.de"
# Listing page for the portal's archive hierarchy ("Struktur").
ARCHIVE_LIST_URL = f"{ARCHIVPORTAL_BASE_URL}/struktur"
# NOTE(review): hard-coded absolute path — only valid on the author's machine;
# consider making this configurable.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
REQUEST_DELAY = 1.5 # Seconds between requests (be respectful)
# Maximum fetch attempts per URL before giving up.
MAX_RETRIES = 3
# Identifies this harvester to the portal (scraping etiquette).
USER_AGENT = "GlamDataHarvester/1.0 (https://github.com/yourusername/glam; contact@email.com)"
# Create output directory (import-time side effect)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def fetch_page(url: str, params: Optional[Dict] = None) -> Optional[BeautifulSoup]:
    """
    Retrieve a URL and parse the response into BeautifulSoup.

    Retries up to MAX_RETRIES times, sleeping a linearly growing
    back-off (REQUEST_DELAY * attempt number) between failures.

    Args:
        url: URL to fetch
        params: Optional query parameters

    Returns:
        Parsed BeautifulSoup document, or None once every attempt failed.
    """
    request_headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'de,en;q=0.9'
    }
    attempt = 0
    while attempt < MAX_RETRIES:
        print(f"Fetching: {url}", end=' ')
        if params:
            print(f"(params: {params})", end=' ')
        try:
            response = requests.get(url, params=params, headers=request_headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            attempt += 1
            if attempt >= MAX_RETRIES:
                return None
            # Back off a little longer after each failed attempt.
            time.sleep(REQUEST_DELAY * attempt)
        else:
            print("OK")
            return BeautifulSoup(response.content, 'html.parser')
    return None
def extract_archive_from_listing(item_elem) -> Optional[Dict]:
    """
    Pull archive metadata out of a single listing element.

    Args:
        item_elem: BeautifulSoup element for one archive listing item

    Returns:
        Dict with archive fields, or None when no name was found or
        parsing raised.
    """
    try:
        record: Dict = dict.fromkeys(
            ['name', 'location', 'federal_state', 'archive_type',
             'archive_id', 'profile_url', 'description', 'isil'], '')
        # Name: first heading carrying one of the expected classes.
        heading = item_elem.find(['h2', 'h3', 'h4'], class_=['title', 'heading', 'name'])
        if heading:
            record['name'] = heading.get_text(strip=True)
        # Profile link, plus the portal-internal ID embedded in its URL.
        anchor = item_elem.find('a', href=True)
        if anchor:
            record['profile_url'] = urljoin(ARCHIVPORTAL_BASE_URL, anchor['href'])
            id_match = re.search(r'/struktur/([A-Za-z0-9_-]+)', anchor['href'])
            if id_match:
                record['archive_id'] = id_match.group(1)
        # The remaining simple text fields share one find/strip pattern.
        field_classes = {
            'location': ['location', 'place', 'city'],
            'federal_state': ['state', 'federal-state', 'bundesland'],
            'archive_type': ['type', 'sector', 'category'],
            'description': ['description', 'abstract', 'summary'],
        }
        for field, classes in field_classes.items():
            elem = item_elem.find(class_=classes)
            if elem:
                record[field] = elem.get_text(strip=True)
        # An ISIL code (DE-...) may appear anywhere in the listing text.
        isil_match = re.search(r'\b(DE-[A-Za-z0-9]+)\b', item_elem.get_text())
        if isil_match:
            record['isil'] = isil_match.group(1)
        if not record['name']:
            return None
        return record
    except Exception as e:
        print(f"Error parsing archive item: {e}")
        return None
def fetch_archive_profile(profile_url: str) -> Dict:
    """
    Fetch detailed information from an archive's profile page.

    Args:
        profile_url: URL to archive profile

    Returns:
        Dictionary with enriched archive info; empty dict when the
        page could not be fetched.
    """
    soup = fetch_page(profile_url)
    if not soup:
        return {}
    enriched = {
        'contact': {},
        'finding_aids': None,
        'digital_copies': None,
        'collections': [],  # reserved; not populated by this scraper yet
        'coordinates': {}
    }
    try:
        # Contact details: mailto:/tel:/http link schemes distinguish
        # email, phone and website inside the contact section.
        contact_section = soup.find(['section', 'div'], class_=['contact', 'kontakt'])
        if contact_section:
            email_elem = contact_section.find('a', href=re.compile(r'^mailto:'))
            if email_elem:
                enriched['contact']['email'] = email_elem['href'].replace('mailto:', '')
            phone_elem = contact_section.find('a', href=re.compile(r'^tel:'))
            if phone_elem:
                enriched['contact']['phone'] = phone_elem['href'].replace('tel:', '')
            website_elem = contact_section.find('a', href=re.compile(r'^https?://'))
            if website_elem:
                enriched['contact']['website'] = website_elem['href']
        # Finding-aids count: first number in the matching text node.
        # Use `string=` — the `text=` keyword is deprecated in bs4 (4.4+).
        finding_aids_elem = soup.find(string=re.compile(r'Findbücher|Finding aids'))
        if finding_aids_elem:
            match = re.search(r'(\d+)', finding_aids_elem)
            if match:
                enriched['finding_aids'] = int(match.group(1))
        # Digitised-holdings count, same pattern.
        digital_elem = soup.find(string=re.compile(r'digitalisierte|digital'))
        if digital_elem:
            match = re.search(r'(\d+)', digital_elem)
            if match:
                enriched['digital_copies'] = int(match.group(1))
        # Coordinates: scraped out of embedded map markup, if present.
        map_elem = soup.find(['div', 'section'], class_=['map', 'karte'])
        if map_elem:
            # Serialize once instead of once per regex.
            markup = str(map_elem)
            lat_match = re.search(r'latitude["\s:]+([0-9.]+)', markup)
            lon_match = re.search(r'longitude["\s:]+([0-9.]+)', markup)
            if lat_match and lon_match:
                enriched['coordinates'] = {
                    'latitude': lat_match.group(1),
                    'longitude': lon_match.group(1)
                }
    except Exception as e:
        # Best-effort enrichment: return whatever was gathered so far.
        print(f"Error enriching profile: {e}")
    return enriched
def harvest_archive_list(max_pages: Optional[int] = None, enrich_profiles: bool = False) -> List[Dict]:
    """
    Harvest archive listings from Archivportal-D.

    Walks the paginated listing until no next-page link exists (or
    max_pages is reached), optionally enriching each record from its
    profile page.

    Args:
        max_pages: Maximum pages to fetch (None = all)
        enrich_profiles: Whether to fetch detailed profile pages

    Returns:
        List of archive records
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"Harvesting Archivportal-D Archive Listings")
    print(f"Portal: {ARCHIVPORTAL_BASE_URL}")
    print(f"{banner}\n")
    collected: List[Dict] = []
    page = 0
    while True:
        if max_pages and page >= max_pages:
            break
        # Page 0 is the bare listing URL; later pages use ?page=N.
        soup = fetch_page(ARCHIVE_LIST_URL, {'page': page} if page > 0 else None)
        if soup is None:
            print(f"Warning: Failed to fetch page {page}. Stopping.")
            break
        # Primary selectors; may need adjustment if the portal's markup changes.
        listings = soup.find_all(['article', 'div', 'li'], class_=['archive', 'item', 'result'])
        if not listings:
            # Fall back to raw /struktur/<id> links.
            listings = soup.find_all('a', href=re.compile(r'/struktur/[A-Za-z0-9_-]+'))
        if not listings:
            print(f"No archives found on page {page}. Stopping.")
            break
        print(f"\nPage {page}: Found {len(listings)} listings")
        for entry in listings:
            record = extract_archive_from_listing(entry)
            if not record:
                continue
            if enrich_profiles and record['profile_url']:
                print(f" Enriching: {record['name'][:50]}...", end=' ')
                record.update(fetch_archive_profile(record['profile_url']))
                print("OK")
                time.sleep(REQUEST_DELAY)
            collected.append(record)
        print(f"Progress: {len(collected)} archives collected")
        # Stop when the pagination offers no usable next link.
        next_link = soup.find('a', class_=['next', 'pagination-next'])
        if next_link is None or not next_link.get('href'):
            print("No more pages. Stopping.")
            break
        page += 1
        time.sleep(REQUEST_DELAY)
    print(f"\n{banner}")
    print(f"Harvest complete: {len(collected)} archives")
    print(f"{banner}\n")
    return collected
def harvest_by_federal_state() -> Dict[str, List[Dict]]:
    """
    Harvest archives grouped by federal state.

    Returns:
        Dictionary mapping federal state name to its list of archives
        (states whose listing page failed to load are omitted).
    """
    federal_states = (
        "Baden-Württemberg", "Bayern", "Berlin", "Brandenburg",
        "Bremen", "Hamburg", "Hessen", "Mecklenburg-Vorpommern",
        "Niedersachsen", "Nordrhein-Westfalen", "Rheinland-Pfalz",
        "Saarland", "Sachsen", "Sachsen-Anhalt",
        "Schleswig-Holstein", "Thüringen",
    )
    divider = '=' * 70
    print(f"\n{divider}")
    print(f"Harvesting Archives by Federal State")
    print(f"{divider}\n")
    archives_by_state: Dict[str, List[Dict]] = {}
    for state in federal_states:
        print(f"\nFetching archives for: {state}")
        # Filter parameter; may need adjustment for the live URL scheme.
        soup = fetch_page(ARCHIVE_LIST_URL, {'federalState': state})
        if soup is None:
            print(f" Failed to fetch {state}. Skipping.")
            continue
        entries = soup.find_all(['article', 'div'], class_=['archive', 'item'])
        parsed = (extract_archive_from_listing(entry) for entry in entries)
        state_archives = []
        for record in parsed:
            if record:
                # The filter query fixes the state; stamp it onto the record.
                record['federal_state'] = state
                state_archives.append(record)
        archives_by_state[state] = state_archives
        print(f" Found {len(state_archives)} archives")
        time.sleep(REQUEST_DELAY)
    return archives_by_state
def save_archives(archives: List[Dict], filename_suffix: str = ""):
    """
    Save harvested archives to a timestamped JSON file in OUTPUT_DIR.

    Args:
        archives: List of archive records to persist.
        filename_suffix: Optional tag inserted into the file name.

    Returns:
        Path of the written JSON file.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"archivportal_d_archives{filename_suffix}_{timestamp}.json"
    # datetime.utcnow() is deprecated (Python 3.12+) and naive; use an
    # aware UTC timestamp, keeping the old output's trailing-'Z' format.
    harvest_date = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    output = {
        'metadata': {
            'source': 'Archivportal-D',
            'source_url': ARCHIVPORTAL_BASE_URL,
            'operator': 'Deutsche Digitale Bibliothek',
            'harvest_date': harvest_date,
            'total_archives': len(archives),
            'method': 'Web scraping',
            'license': 'CC0 1.0 Universal (Public Domain)',
            'coverage': 'All German archives (state, municipal, church, business, etc.)'
        },
        'archives': archives
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved to: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")
    return output_file
def generate_statistics(archives: List[Dict]):
    """
    Compute and print summary statistics for harvested archives.

    Counts archives per federal state and per type, and tallies data
    completeness (ISIL, profile URL, email, phone, coordinates).
    Handles an empty input list without raising ZeroDivisionError
    (the original divided by stats['total'] unconditionally).

    Args:
        archives: List of archive records.

    Returns:
        The statistics dictionary.
    """
    stats = {
        'total': len(archives),
        'by_state': {},
        'by_type': {},
        'with_isil': 0,
        'with_profile_url': 0,
        'with_email': 0,
        'with_phone': 0,
        'with_coordinates': 0
    }
    for archive in archives:
        # Count by state
        state = archive.get('federal_state', 'Unknown')
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1
        # Count by type
        arch_type = archive.get('archive_type', 'Unknown')
        stats['by_type'][arch_type] = stats['by_type'].get(arch_type, 0) + 1
        # Count completeness (nested dicts may be missing entirely)
        if archive.get('isil'):
            stats['with_isil'] += 1
        if archive.get('profile_url'):
            stats['with_profile_url'] += 1
        if archive.get('contact', {}).get('email'):
            stats['with_email'] += 1
        if archive.get('contact', {}).get('phone'):
            stats['with_phone'] += 1
        if archive.get('coordinates', {}).get('latitude'):
            stats['with_coordinates'] += 1

    def _pct(count: int) -> str:
        # Guard against an empty harvest: report 0.0% instead of crashing.
        if stats['total'] == 0:
            return f"{count} (0.0%)"
        return f"{count} ({count / stats['total'] * 100:.1f}%)"

    print(f"\n{'='*70}")
    print("Statistics:")
    print(f"{'='*70}")
    print(f"Total archives: {stats['total']}")
    print(f"\nData completeness:")
    print(f" - With ISIL code: {_pct(stats['with_isil'])}")
    print(f" - With profile URL: {_pct(stats['with_profile_url'])}")
    print(f" - With email: {_pct(stats['with_email'])}")
    print(f" - With phone: {_pct(stats['with_phone'])}")
    print(f" - With coordinates: {_pct(stats['with_coordinates'])}")
    print(f"\nArchives by federal state:")
    for state, count in sorted(stats['by_state'].items(), key=lambda x: x[1], reverse=True):
        print(f" - {state}: {count}")
    print(f"\nTop 10 archive types:")
    for arch_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f" - {arch_type}: {count}")
    print(f"{'='*70}\n")
    return stats
def main():
    """Entry point: run a limited test harvest, then save data and stats."""
    header = '#' * 70
    print(f"\n{header}")
    print(f"# Archivportal-D Archive Harvester")
    print(f"# Deutsche Digitale Bibliothek")
    print(f"{header}\n")
    print("NOTE: This harvester uses web scraping as a fallback method.")
    print("The HTML structure may change over time and require updates to selectors.")
    print("For production use, consider registering for DDB API access.\n")
    # Limited to 10 pages while testing; drop max_pages for a full run.
    archives = harvest_archive_list(max_pages=10, enrich_profiles=False)
    if not archives:
        print("No archives harvested. Exiting.")
        return
    save_archives(archives)
    stats = generate_statistics(archives)
    # Persist the statistics alongside the data dump.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    stats_file = OUTPUT_DIR / f"archivportal_d_stats_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Statistics saved to: {stats_file}\n")
    print("✓ Harvest complete!\n")
    print("Next steps:")
    print(" 1. Review the harvested data for accuracy")
    print(" 2. Adjust HTML selectors if needed")
    print(" 3. Run full harvest (remove max_pages limit)")
    print(" 4. Cross-reference with ISIL dataset\n")


if __name__ == "__main__":
    main()