glam/scripts/scrapers/scrape_czech_archives_aron.py
2025-11-19 23:25:22 +01:00

365 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Scrape Czech Archive Institutions from ARON Portal API
The ARON (ARchiv ONline) portal has an undocumented REST API that provides
access to archive institutions and collections.
API Discovery:
- Found by reverse-engineering the portal.nacr.cz/aron/institution page
- Uses POST requests with JSON body containing filters
- Key discovery: type filter with value "INSTITUTION" returns only institutions
API Endpoints:
- List: POST https://portal.nacr.cz/aron/api/aron/apu/listview?listType=EVIDENCE-LIST
- Detail: GET https://portal.nacr.cz/aron/api/aron/apu/{uuid}
Filter Structure:
{
"filters": [{"field": "type", "operation": "EQ", "value": "INSTITUTION"}],
"offset": 0,
"size": 100
}
This script:
1. Fetches institutions using API type filter (~560 total)
2. Extracts metadata from each institution detail page
3. Outputs LinkML-compliant YAML for Czech archives
Estimated runtime: 10-15 minutes (560 institutions × 0.5s rate limit)
"""
import json
import time
import requests
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
import sys
# Constants
API_BASE = "https://portal.nacr.cz/aron/api/aron"
# List endpoint takes a POST with a JSON filter body (see fetch_list_page).
API_LIST = f"{API_BASE}/apu/listview?listType=EVIDENCE-LIST"
# Detail endpoint is a plain GET: {API_DETAIL}/{uuid}.
API_DETAIL = f"{API_BASE}/apu"
BATCH_SIZE = 100  # Records per API request
RATE_LIMIT_DELAY = 0.5  # Seconds between requests (2 req/sec max)
OUTPUT_DIR = Path("data/instances")
OUTPUT_FILE = OUTPUT_DIR / "czech_archives_aron.yaml"
# Type mapping from ARON metadata types to GLAM taxonomy.
# Keys are lowercase Czech keywords matched as substrings of
# "<name> <description>" in classify_institution_type(); the first
# matching entry wins, so dict insertion order matters (more specific
# archive variants are listed before the generic museum/gallery terms).
TYPE_MAPPING = {
    "archiv": "ARCHIVE",
    "státní archiv": "ARCHIVE",
    "oblastní archiv": "ARCHIVE",
    "okresní archiv": "ARCHIVE",
    "městský archiv": "ARCHIVE",
    "archiv města": "ARCHIVE",
    "muzeum": "MUSEUM",
    "galerie": "GALLERY",
    "knihovna": "LIBRARY",
    "univerzita": "EDUCATION_PROVIDER",
    "vysoká škola": "EDUCATION_PROVIDER",
    "památník": "MUSEUM",
}
def fetch_list_page(offset: int = 0) -> Dict[str, Any]:
    """
    Request one page of institution records from the ARON list endpoint.

    The POST body carries a type == INSTITUTION filter, so the response
    contains only institutions (not fonds or collections).

    Args:
        offset: Record offset for pagination

    Returns:
        Parsed JSON response with items and pagination info

    Raises:
        requests.RequestException: re-raised after logging, so the caller
        can decide whether to abort the whole run.
    """
    body = {
        "filters": [
            {"field": "type", "operation": "EQ", "value": "INSTITUTION"}
        ],
        "sort": [
            {"field": "name", "type": "SCORE", "order": "DESC", "sortMode": "MIN"}
        ],
        "offset": offset,
        "flipDirection": False,
        "size": BATCH_SIZE,
    }
    try:
        resp = requests.post(
            API_LIST,
            json=body,
            headers={"Content-Type": "application/json"},
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as e:
        # Log and propagate: a failed list page means pagination state is
        # unreliable, so the caller should stop rather than skip.
        print(f"Error fetching list page: {e}")
        raise
def fetch_institution_detail(uuid: str) -> Dict[str, Any]:
    """
    Fetch detailed metadata for a single institution.

    Unlike fetch_list_page, failures here are non-fatal: one broken
    record should not abort the whole scrape, so an empty dict is
    returned instead of raising.

    Args:
        uuid: Institution UUID

    Returns:
        Institution detail object, or {} on any request error
    """
    url = f"{API_DETAIL}/{uuid}"
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching detail for {uuid}: {e}")
        return {}
    return resp.json()
def classify_institution_type(name: str, description: str = "") -> str:
    """
    Classify an institution into the GLAM taxonomy.

    Performs a case-insensitive substring search of the known Czech
    keywords in TYPE_MAPPING against "<name> <description>"; the first
    matching keyword (in mapping insertion order) decides the type.

    Args:
        name: Institution name
        description: Institution description

    Returns:
        Institution type from GLAM taxonomy; defaults to "ARCHIVE"
        because the source portal is an archive registry.
    """
    haystack = f"{name} {description}".lower()
    return next(
        (kind for keyword, kind in TYPE_MAPPING.items() if keyword in haystack),
        "ARCHIVE",
    )
def extract_metadata(detail: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten an ARON institution detail response into a metadata dict.

    Top-level name/description/id are copied unconditionally (None when
    absent); known "INST~*" typed items found under parts[].items[] are
    mapped onto flat keys, unknown item types are ignored.

    Args:
        detail: Institution detail from API

    Returns:
        Extracted metadata dict
    """
    # Dispatch table: ARON item type -> output key.
    field_for_type = {
        "INST~CODE": "institution_code",
        "INST~SHORT~NAME": "short_name",
        "INST~ADDRESS": "address",
        "INST~PHONE": "phone",
        "INST~EMAIL": "email",
        "INST~URL": "website",
    }
    metadata: Dict[str, Any] = {
        "name": detail.get("name"),
        "description": detail.get("description"),
        "uuid": detail.get("id"),
    }
    for part in detail.get("parts", []):
        for entry in part.get("items", []):
            key = field_for_type.get(entry.get("type"))
            if key is not None:
                metadata[key] = entry.get("value")
    return metadata
def scrape_all_institutions():
    """
    Scrape all institutions from ARON portal using type filter.

    Two phases:
      1. Page through the filtered list endpoint collecting
         uuid/name/description stubs.
      2. Fetch each institution's detail record, flatten its metadata,
         and classify its GLAM type.

    Whatever was collected (including partial results after Ctrl-C or an
    error) is written to OUTPUT_FILE via save_institutions().
    """
    print("=" * 70)
    print("ARON Archive Institution Scraper")
    print("=" * 70)
    institutions = []
    offset = 0
    total_count = 0
    # Fetch institution list with type filter (no need for name filtering!)
    print("\nFetching institutions with API type filter...")
    print("(This API filter returns only institutions, not fonds or collections)")
    while True:
        try:
            # Fetch page
            response = fetch_list_page(offset)
            items = response.get("items", [])
            count = response.get("count", 0)
            # Store total count from first response
            if offset == 0:
                total_count = count
                print(f"\nTotal institutions: {total_count}")
            # Empty page means the server has no more records, regardless
            # of what total_count claimed.
            if not items:
                break
            # All items are institutions (API filtered)
            for item in items:
                institutions.append({
                    "uuid": item["id"],
                    "name": item["name"],
                    "description": item.get("description")
                })
            # Progress (overwritten in place via carriage return)
            fetched = offset + len(items)
            progress = (fetched / total_count * 100) if total_count > 0 else 0
            print(f"Progress: {fetched}/{total_count} institutions ({progress:.1f}%)", end="\r")
            # Check if more pages
            offset += len(items)
            if fetched >= total_count:
                break
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)
        except KeyboardInterrupt:
            # NOTE(review): Ctrl-C only ends phase 1; phase 2 still runs on
            # whatever was collected so partial results get detailed — a
            # second Ctrl-C is needed to skip phase 2 as well.
            print("\n\nInterrupted by user. Saving partial results...")
            break
        except Exception as e:
            # Any other failure aborts pagination but keeps what we have.
            print(f"\nError during scraping: {e}")
            break
    print(f"\n\nPhase 1 complete: {len(institutions)} institutions fetched")
    # Second pass: Fetch details for each institution
    print(f"\nPhase 2: Fetching detailed metadata for {len(institutions)} institutions...")
    detailed_institutions = []
    for idx, inst in enumerate(institutions, 1):
        try:
            # Fetch detail (returns {} on request error, which is skipped)
            detail = fetch_institution_detail(inst["uuid"])
            if detail:
                metadata = extract_metadata(detail)
                metadata["institution_type"] = classify_institution_type(
                    metadata.get("name", ""),
                    metadata.get("description", "")
                )
                detailed_institutions.append(metadata)
            # Progress
            print(f"Progress: {idx}/{len(institutions)} institutions processed", end="\r")
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)
        except KeyboardInterrupt:
            print("\n\nInterrupted by user. Saving partial results...")
            break
        except Exception as e:
            # Per-record failures are non-fatal: log and move on.
            print(f"\nError fetching detail for {inst['uuid']}: {e}")
            continue
    print(f"\n\nPhase 2 complete: {len(detailed_institutions)} institutions with details")
    # Save results
    print(f"\nSaving to {OUTPUT_FILE}...")
    save_institutions(detailed_institutions)
    print("\nScraping complete!")
    print(f"Total institutions: {len(detailed_institutions)}")
def save_institutions(institutions: List[Dict[str, Any]]):
    """
    Serialize institution metadata to OUTPUT_FILE as LinkML-style YAML.

    Each input dict becomes one record with a w3id identifier URI, typed
    identifier entries (ARON UUID, institution code, website — when
    present), and a provenance block.

    Args:
        institutions: List of institution metadata dicts
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    records = []
    for inst in institutions:
        # Collect all identifier entries first; the key is only set on the
        # record when at least one identifier exists.
        identifiers = []
        if inst.get("uuid"):
            identifiers.append({
                "identifier_scheme": "ARON_UUID",
                "identifier_value": inst["uuid"],
                "identifier_url": f"https://portal.nacr.cz/aron/apu/{inst['uuid']}",
            })
        if inst.get("institution_code"):
            identifiers.append({
                "identifier_scheme": "INSTITUTION_CODE",
                "identifier_value": inst["institution_code"],
            })
        if inst.get("website"):
            identifiers.append({
                "identifier_scheme": "Website",
                "identifier_value": inst["website"],
                "identifier_url": inst["website"],
            })

        record = {
            "id": f"https://w3id.org/heritage/custodian/cz/{inst['uuid']}",
            "name": inst.get("name"),
            "institution_type": inst.get("institution_type", "ARCHIVE"),
            "description": inst.get("description"),
        }
        if identifiers:
            record["identifiers"] = identifiers
        # Provenance
        record["provenance"] = {
            "data_source": "CONVERSATION_NLP",  # Will update to WEB_SCRAPING
            "data_tier": "TIER_1_AUTHORITATIVE",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "ARON API scraping via undocumented REST endpoint",
            "confidence_score": 0.85,
            "source_url": "https://portal.nacr.cz/aron/institution",
        }
        records.append(record)

    # Write YAML (sort_keys=False keeps the record key order defined above)
    import yaml
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(records, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(f"Saved {len(records)} institutions to {OUTPUT_FILE}")
# Script entry point: run the full two-phase scrape when executed directly.
if __name__ == "__main__":
    scrape_all_institutions()