glam/scripts/scrapers/harvest_sachsen_anhalt_archives.py
2025-11-21 22:12:33 +01:00

165 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Harvest Sachsen-Anhalt GLAM Institutions
Extracts heritage institutions from Sachsen-Anhalt (Saxony-Anhalt), Germany.
Sources:
1. Landesarchiv Sachsen-Anhalt (state archive branches)
2. Archivportal-D (regional archives)
3. DDB SPARQL (museums, libraries, archives)
4. ULB Sachsen-Anhalt (digital collections)
Output: LinkML-compliant JSON with comprehensive metadata
Author: OpenCode AI Agent
Date: 2025-11-20
"""
import json
import os
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup
def _extract_contact_info(soup) -> Dict[str, Optional[str]]:
    """Pull email, phone, and address details from one branch page.

    Scans every ``<p>`` element and keeps the *first* match for each
    field, so a later unrelated paragraph cannot overwrite an earlier
    hit. (The original inline code only guarded the email field this
    way; phone and address were silently last-match-wins.)

    Args:
        soup: Parsed BeautifulSoup document for a branch page.

    Returns:
        Dict with 'email', 'phone', and 'address' keys; each value is
        the extracted string or None when nothing matched.
    """
    contact_info: Dict[str, Optional[str]] = {
        'email': None,
        'phone': None,
        'address': None
    }
    for p in soup.find_all('p'):
        text = p.get_text()
        # Email: first address-shaped token wins.
        if '@' in text and not contact_info['email']:
            email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
            if email_match:
                contact_info['email'] = email_match.group()
        # Phone: German pages label numbers with "Telefon" / "Tel.".
        if ('Telefon' in text or 'Tel.' in text) and not contact_info['phone']:
            phone_match = re.search(r'\+?\d[\d\s\-\(\)]+', text)
            if phone_match:
                contact_info['phone'] = phone_match.group().strip()
        # Address: explicit label or a German street-name suffix.
        if (('Adresse' in text or 'straße' in text.lower() or 'str.' in text.lower())
                and not contact_info['address']):
            contact_info['address'] = p.get_text(strip=True)
    return contact_info


def extract_landesarchiv_branches() -> List[Dict]:
    """
    Extract the 4 branches of Landesarchiv Sachsen-Anhalt.

    Fetches each branch page (Magdeburg, Wernigerode, Merseburg,
    Dessau), scrapes the page title and contact details, and wraps them
    in a record carrying provenance metadata. A network or HTTP error
    on one branch is reported and skipped so the remaining branches are
    still harvested.

    Returns:
        List of archive record dicts, one per successfully fetched branch.
    """
    base_url = 'https://landesarchiv.sachsen-anhalt.de'
    locations = ['magdeburg', 'wernigerode', 'merseburg', 'dessau']
    archives = []
    print("📚 Extracting Landesarchiv Sachsen-Anhalt branches...")
    print("=" * 70)
    for location in locations:
        url = f'{base_url}/landesarchiv/standorte/{location}'
        print(f" Fetching: {location.upper()}")
        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # The page <h1> is the branch name; fall back to a
                # synthetic title if the page has none.
                title = soup.find('h1')
                title_text = title.get_text(strip=True) if title else f'Landesarchiv Sachsen-Anhalt - Abteilung {location.title()}'
                contact_info = _extract_contact_info(soup)
                archive_record = {
                    'id': f'sachsen-anhalt-la-{location}',
                    'name': title_text,
                    'institution_type': 'ARCHIVE',
                    'city': location.title(),
                    'region': 'Sachsen-Anhalt',
                    'country': 'DE',
                    'url': url,
                    'email': contact_info['email'],
                    'phone': contact_info['phone'],
                    'address_text': contact_info['address'],
                    'source_portal': 'landesarchiv.sachsen-anhalt.de',
                    'provenance': {
                        'data_source': 'WEB_SCRAPING',
                        'data_tier': 'TIER_2_VERIFIED',
                        'extraction_date': datetime.now(timezone.utc).isoformat(),
                        'extraction_method': 'Landesarchiv Sachsen-Anhalt portal harvest',
                        'source_url': url,
                        'confidence_score': 0.95
                    }
                }
                archives.append(archive_record)
                print(f" ✅ Extracted: {title_text}")
            else:
                print(f" ❌ HTTP {response.status_code}")
        except Exception as e:
            # Best-effort harvest: report and continue with the next branch.
            print(f" ❌ ERROR: {e}")
        print()
    print(f"✅ Extracted {len(archives)} Landesarchiv branches")
    print()
    return archives
def save_archives(archives: List[Dict], filename: str):
    """Save harvested archive records to a JSON file.

    Writes a wrapper object with a 'metadata' section (source, harvest
    timestamp, record count, region) plus the 'archives' list itself.
    Parent directories are created as needed, since main() passes a
    nested path under data/isil/germany/.

    Args:
        archives: List of archive record dicts to persist.
        filename: Output path; missing parent directories are created.
    """
    output_data = {
        'metadata': {
            'source': 'Sachsen-Anhalt GLAM institutions',
            'harvest_date': datetime.now(timezone.utc).isoformat(),
            'total_archives': len(archives),
            'region': 'Sachsen-Anhalt',
            'country': 'DE',
            'harvester_version': '1.0'
        },
        'archives': archives
    }
    # Robustness fix: open(..., 'w') cannot create intermediate
    # directories, so ensure the target directory exists first.
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    # Bug fix: the original printed the literal placeholder "(unknown)"
    # instead of interpolating the output path.
    print(f"💾 Saved to: {filename}")
    # NOTE: size is computed from a compact re-serialization, so it can
    # differ slightly from the indent=2 file actually written to disk.
    print(f" File size: {len(json.dumps(output_data)) / 1024:.1f} KB")
def main():
    """Run the Sachsen-Anhalt harvest end-to-end and save the results."""
    banner = "=" * 70
    print(banner)
    print("SACHSEN-ANHALT GLAM INSTITUTIONS HARVEST")
    print(banner)
    print()
    # Step 1: scrape the four Landesarchiv branch pages.
    harvested = extract_landesarchiv_branches()
    # Step 2: write a timestamped JSON snapshot next to earlier harvests.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    save_archives(harvested, f'data/isil/germany/sachsen_anhalt_archives_{stamp}.json')
    print()
    print(banner)
    print("HARVEST COMPLETE")
    print(banner)
    print(f"Total archives harvested: {len(harvested)}")
    print()
    print("Next steps:")
    print(" 1. Harvest Archivportal-D for Sachsen-Anhalt archives")
    print(" 2. Query DDB SPARQL for museums/libraries")
    print(" 3. Merge all sources into unified dataset")
# Allow the module to be imported (e.g. for reuse of the harvest
# functions) without triggering a live network harvest.
if __name__ == '__main__':
    main()