glam/scripts/scrapers/harvest_thueringen_archives.py
2025-11-19 23:25:22 +01:00

352 lines
13 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Thüringen Archives Harvester
Extracts 149 archives from archive-in-thueringen.de
Portal: https://www.archive-in-thueringen.de/de/archiv/list
Strategy: All archives visible on single page - direct extraction
Speed: ~10 seconds
Author: OpenCode + AI Agent
Date: 2025-11-20
Version: 1.0
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
import re
# Configuration
BASE_URL = "https://www.archive-in-thueringen.de"
# Single list page that renders all archives at once (no pagination).
ARCHIVE_LIST_URL = f"{BASE_URL}/de/archiv/list"
# NOTE(review): machine-specific absolute path — consider making this
# configurable (env var / CLI flag) before running on another machine.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Import-time side effect: ensure the output directory exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Map German archive-name keywords to the GLAM taxonomy.
# Insertion order is significant: the first keyword found in a name wins.
ARCHIVE_TYPE_MAPPING = {
    "Landesarchiv": "OFFICIAL_INSTITUTION",
    "Staatsarchiv": "OFFICIAL_INSTITUTION",
    "Hauptstaatsarchiv": "OFFICIAL_INSTITUTION",
    "Stadtarchiv": "ARCHIVE",
    "Gemeindearchiv": "ARCHIVE",
    "Kreisarchiv": "ARCHIVE",
    "Stadt- und Kreisarchiv": "ARCHIVE",
    "Bistumsarchiv": "HOLY_SITES",
    "Kirchenkreisarchiv": "HOLY_SITES",
    "Landeskirchenarchiv": "HOLY_SITES",
    "Archiv des Ev.": "HOLY_SITES",
    "Archiv des Bischöflichen": "HOLY_SITES",
    "Pfarrhausarchiv": "HOLY_SITES",
    "Universitätsarchiv": "EDUCATION_PROVIDER",
    "Hochschularchiv": "EDUCATION_PROVIDER",
    "Hochschule": "EDUCATION_PROVIDER",
    "Universität": "EDUCATION_PROVIDER",
    "Fachhochschule": "EDUCATION_PROVIDER",
    "Fachschule": "EDUCATION_PROVIDER",
    "Carl Zeiss": "CORPORATION",
    "SCHOTT": "CORPORATION",
    "Wirtschaftsarchiv": "CORPORATION",
    "Handwerkskammer": "CORPORATION",
    "Handelskammer": "CORPORATION",
    "Industrie- und Handelskammer": "CORPORATION",
    "Lederfabrik": "CORPORATION",
    "Verlagsgesellschaft": "CORPORATION",
    "Bundesarchiv": "OFFICIAL_INSTITUTION",
    "Stasi-Unterlagen": "OFFICIAL_INSTITUTION",
    "Thüringer Landtages": "OFFICIAL_INSTITUTION",
    "Gedenkstätte": "MUSEUM",
    "Museum": "MUSEUM",
    "Goethe- und Schiller": "RESEARCH_CENTER",
    "Akademie": "RESEARCH_CENTER",
    "Thüringer Archiv für Zeitgeschichte": "RESEARCH_CENTER",
    "Thüringer Industriearchiv": "RESEARCH_CENTER",
    "Thüringer Bauteil-Archiv": "RESEARCH_CENTER",
    "Thüringer Talsperren": "RESEARCH_CENTER",
    "Landesamt": "OFFICIAL_INSTITUTION",
    "Archiv des Vogtländischen": "COLLECTING_SOCIETY",
    "Archiv des Arbeitskreises": "NGO",
    "Grenzlandmuseum": "MUSEUM",
    "Archiv der VG": "ARCHIVE",  # Verwaltungsgemeinschaft = administrative community
    "Archiv der Verwaltungsgemeinschaft": "ARCHIVE",
    "Archiv der Landgemeinde": "ARCHIVE",
    "Archiv der Sammlung": "RESEARCH_CENTER",
    "Musikarchiv": "RESEARCH_CENTER",
}


def infer_institution_type(name: str) -> str:
    """Classify a German archive by substring-matching its name.

    Returns the GLAM type of the first ARCHIVE_TYPE_MAPPING keyword that
    occurs in *name*; falls back to the generic "ARCHIVE" type when no
    keyword matches.
    """
    return next(
        (kind for keyword, kind in ARCHIVE_TYPE_MAPPING.items() if keyword in name),
        "ARCHIVE",
    )
# Archive-name patterns whose capture group is the city, tried in priority
# order.  The bool flag marks the one pattern ("Stadtarchiv") whose captured
# city may carry a parenthetical abbreviation like "(StadtA NDH)" to strip.
_CITY_NAME_PATTERNS = [
    (re.compile(r'Stadtarchiv\s+(.+?)(?:\s*$)'), True),
    (re.compile(r'Hauptstaatsarchiv\s+(.+?)$'), False),
    (re.compile(r'Staatsarchiv\s+(.+?)$'), False),
    (re.compile(r'Gemeindearchiv\s+(.+?)$'), False),
    (re.compile(r'Stadt-\s+und\s+Kreisarchiv\s+(.+?)$'), False),
    (re.compile(r'Universitätsarchiv\s+(.+?)$'), False),
]

# Substrings that identify the first dash-separated part as an organization
# name rather than a city.
_NON_CITY_PREFIXES = (
    "Landesarchiv",
    "Bundesarchiv",
    "EKM",
    "Landkreis",
    "Kreisarchiv Ilm-Kreis",  # special case
)


def extract_city_from_fulltext(fulltext: str) -> Optional[str]:
    """Extract the city from the portal's "City - Archive Name" display string.

    Examples:
        - "Altenburg - Stadtarchiv Altenburg" -> "Altenburg"
        - "Erfurt - Stadtarchiv Erfurt" -> "Erfurt"
        - "Landesarchiv Thüringen - Staatsarchiv Altenburg" -> "Altenburg" (from name)
        - "Arnstadt - Kreisarchiv Ilm-Kreis - Altkreis Ilmenau" -> "Arnstadt"

    Returns:
        The city name, or None when no pattern applies.
    """
    # PRIORITY 1: derive the city from specific archive-name patterns.
    # Must run BEFORE the split-by-dash logic so that strings like
    # "Landesarchiv ... - Staatsarchiv City" resolve to the trailing city.
    for pattern, strip_parens in _CITY_NAME_PATTERNS:
        match = pattern.search(fulltext)
        if match:
            city = match.group(1).strip()
            if strip_parens:
                # Remove trailing "(StadtA NDH)" style abbreviations.
                city = re.sub(r'\s*\(.*\)$', '', city)
            return city

    # PRIORITY 2: split "City - Archive Name" and take the first part,
    # unless that part is recognizably an organization name.
    if " - " in fulltext:
        potential_city = fulltext.split(" - ")[0].strip()
        if not any(prefix in potential_city for prefix in _NON_CITY_PREFIXES):
            return potential_city

    return None
def harvest_thueringen_archives() -> List[Dict]:
    """Harvest all archives listed on the Thüringen archive portal.

    The portal renders every archive on one list page, so a single page
    load plus an in-page JavaScript extraction is sufficient.

    Returns:
        A list of archive records (id, name, inferred institution type,
        city, provenance metadata, ...).  Empty on failure — errors are
        printed, not raised.
    """
    print(f"🚀 Thüringen Archives Harvester v1.0")
    print(f"📍 Portal: {ARCHIVE_LIST_URL}")
    print(f"⏱️ Starting harvest at {datetime.now(timezone.utc).isoformat()}")
    print()
    archives = []
    with sync_playwright() as p:
        print("🌐 Launching browser...")
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        )
        page = context.new_page()
        try:
            print(f"📄 Loading archive list page...")
            page.goto(ARCHIVE_LIST_URL, wait_until='networkidle', timeout=30000)
            # Best-effort cookie-banner dismissal; a missing banner is fine.
            try:
                cookie_button = page.locator('button:has-text("Akzeptieren"), button:has-text("Accept")')
                if cookie_button.is_visible(timeout=2000):
                    cookie_button.click()
                    print("✅ Accepted cookies")
                    time.sleep(1)
            except Exception:
                # FIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; keep the best-effort
                # semantics but only for ordinary exceptions.
                pass
            print("📋 Extracting archives from page...")
            # Extract all archive links in-page, deduplicating by the
            # numeric id embedded in each /de/archiv/view/id/<n> URL.
            result = page.evaluate("""
                () => {
                    const archiveLinks = document.querySelectorAll('ul li a[href*="/de/archiv/view/id/"]');
                    const uniqueArchives = new Map();
                    archiveLinks.forEach(link => {
                        const fullText = link.textContent.trim();
                        const url = link.href;
                        // Extract ID from URL
                        const idMatch = url.match(/\\/id\\/(\\d+)/);
                        if (!idMatch) return;
                        const archiveId = idMatch[1];
                        if (uniqueArchives.has(archiveId)) return;
                        // Parse "City - Archive Name" format
                        let city = '';
                        let archiveName = '';
                        if (fullText.includes(' - ')) {
                            const parts = fullText.split(' - ');
                            city = parts[0].trim();
                            archiveName = parts.slice(1).join(' - ').trim();
                        } else {
                            archiveName = fullText;
                        }
                        uniqueArchives.set(archiveId, {
                            id: archiveId,
                            city: city,
                            name: archiveName,
                            url: url,
                            fullText: fullText
                        });
                    });
                    return Array.from(uniqueArchives.values());
                }
            """)
            print(f"✅ Extracted {len(result)} unique archives")
            # Normalize each raw record into the GLAM schema.
            for raw_archive in result:
                # Re-derive the city in Python: the naive in-page split can
                # mis-handle names like "Landesarchiv ... - Staatsarchiv X".
                city = extract_city_from_fulltext(raw_archive['fullText'])
                inst_type = infer_institution_type(raw_archive['name'])
                archive_data = {
                    "id": f"thueringen-{raw_archive['id']}",
                    "name": raw_archive['name'],
                    "institution_type": inst_type,
                    "city": city,
                    "region": "Thüringen",
                    "country": "DE",
                    "url": raw_archive['url'],
                    "source_portal": "archive-in-thueringen.de",
                    "fulltext_display": raw_archive['fullText'],
                    "provenance": {
                        "data_source": "WEB_SCRAPING",
                        "data_tier": "TIER_2_VERIFIED",
                        "extraction_date": datetime.now(timezone.utc).isoformat(),
                        "extraction_method": "Playwright direct page extraction",
                        "source_url": ARCHIVE_LIST_URL,
                        "confidence_score": 0.95
                    }
                }
                archives.append(archive_data)
            print(f"\n📊 Harvest Statistics:")
            print(f"   Total archives: {len(archives)}")
            # Tally archives per institution type for the summary output.
            type_counts = {}
            for archive in archives:
                inst_type = archive['institution_type']
                type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
            print(f"   By type:")
            for inst_type, count in sorted(type_counts.items(), key=lambda x: -x[1]):
                print(f"      - {inst_type}: {count}")
            with_city = sum(1 for a in archives if a.get('city'))
            # FIX: guard the percentage against ZeroDivisionError when the
            # extraction yielded no archives at all.
            if archives:
                print(f"   With city names: {with_city}/{len(archives)} ({with_city/len(archives)*100:.1f}%)")
        except Exception as e:
            print(f"❌ Error during harvest: {e}")
            import traceback
            traceback.print_exc()
        finally:
            browser.close()
    return archives
def save_results(archives: List[Dict]) -> Path:
    """Persist *archives* as a timestamped JSON file under OUTPUT_DIR.

    The records are wrapped in a small metadata envelope (source portal,
    harvest date, counts) and written UTF-8 with human-readable indentation.

    Returns:
        Path of the file that was written.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = OUTPUT_DIR / f"thueringen_archives_{stamp}.json"
    payload = {
        "metadata": {
            "source": "archive-in-thueringen.de",
            "harvest_date": datetime.now(timezone.utc).isoformat(),
            "total_archives": len(archives),
            "region": "Thüringen",
            "country": "DE",
            "harvester_version": "1.0"
        },
        "archives": archives
    }
    with output_file.open('w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    print(f"\n💾 Results saved to: {output_file}")
    print(f"   File size: {output_file.stat().st_size / 1024:.1f} KB")
    return output_file
def main():
    """Run the harvest, save results, and report timing statistics.

    Returns:
        Process exit code: 0 on success, 1 when nothing was harvested.
    """
    started = time.time()
    archives = harvest_thueringen_archives()
    # Guard clause: nothing harvested means nothing to save.
    if not archives:
        print("\n❌ No archives harvested!")
        return 1
    output_file = save_results(archives)
    duration = time.time() - started
    print(f"\n✅ Harvest completed in {duration:.1f} seconds")
    print(f"📈 Speed: {len(archives)/duration:.1f} archives/second")
    print(f"\n🎯 Next Steps:")
    print(f"   1. Run geocoding: python scripts/enrich_geocoding.py {output_file}")
    print(f"   2. Merge with German dataset: python scripts/scrapers/merge_thueringen_to_german_dataset.py")
    print(f"   3. Expected new additions: ~120 archives (after deduplication)")
    return 0


if __name__ == "__main__":
    exit(main())