#!/usr/bin/env python3
"""
Scrape Czech Archive Institutions from ARON Portal API

The ARON (ARchiv ONline) portal has an undocumented REST API that provides
access to archive institutions and collections.

API Discovery:
- Found by reverse-engineering the portal.nacr.cz/aron/institution page
- Uses POST requests with JSON body containing filters
- Key discovery: type filter with value "INSTITUTION" returns only institutions

API Endpoints:
- List:   POST https://portal.nacr.cz/aron/api/aron/apu/listview?listType=EVIDENCE-LIST
- Detail: GET  https://portal.nacr.cz/aron/api/aron/apu/{uuid}

Filter Structure:
    {
        "filters": [{"field": "type", "operation": "EQ", "value": "INSTITUTION"}],
        "offset": 0,
        "size": 100
    }

This script:
1. Fetches institutions using API type filter (~560 total)
2. Extracts metadata from each institution detail page
3. Outputs LinkML-compliant YAML for Czech archives

Estimated runtime: 10-15 minutes (560 institutions x 0.5s rate limit)
"""

import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
# Hoisted from save_institutions(): a missing PyYAML install now fails at
# startup instead of after a 15-minute scrape.
import yaml

# Constants
API_BASE = "https://portal.nacr.cz/aron/api/aron"
API_LIST = f"{API_BASE}/apu/listview?listType=EVIDENCE-LIST"
API_DETAIL = f"{API_BASE}/apu"
BATCH_SIZE = 100  # Records per API request
RATE_LIMIT_DELAY = 0.5  # Seconds between requests (2 req/sec max)
OUTPUT_DIR = Path("data/instances")
OUTPUT_FILE = OUTPUT_DIR / "czech_archives_aron.yaml"

# Type mapping from ARON metadata types to GLAM taxonomy
TYPE_MAPPING = {
    "archiv": "ARCHIVE",
    "státní archiv": "ARCHIVE",
    "oblastní archiv": "ARCHIVE",
    "okresní archiv": "ARCHIVE",
    "městský archiv": "ARCHIVE",
    "archiv města": "ARCHIVE",
    "muzeum": "MUSEUM",
    "galerie": "GALLERY",
    "knihovna": "LIBRARY",
    "univerzita": "EDUCATION_PROVIDER",
    "vysoká škola": "EDUCATION_PROVIDER",
    "památník": "MUSEUM",
}

# Longest patterns first so a specific label ("vysoká škola") beats a generic
# substring ("archiv") that merely co-occurs in the text.  Stable sort keeps
# the original TYPE_MAPPING precedence for equal-length patterns.
_TYPE_PATTERNS = sorted(TYPE_MAPPING.items(), key=lambda kv: len(kv[0]), reverse=True)

# ARON detail-part item types mapped to our metadata keys.
_ITEM_TYPE_TO_KEY = {
    "INST~CODE": "institution_code",
    "INST~SHORT~NAME": "short_name",
    "INST~ADDRESS": "address",
    "INST~PHONE": "phone",
    "INST~EMAIL": "email",
    "INST~URL": "website",
}


def fetch_list_page(offset: int = 0) -> Dict[str, Any]:
    """
    Fetch a page of institution records from ARON API.

    Uses the institution type filter to get only institutions (not fonds, etc.)

    Args:
        offset: Record offset for pagination

    Returns:
        API response with items and pagination info

    Raises:
        requests.RequestException: On network/HTTP failure (after logging).
    """
    payload = {
        "filters": [
            {
                "field": "type",
                "operation": "EQ",
                "value": "INSTITUTION",
            }
        ],
        "sort": [
            {
                "field": "name",
                "type": "SCORE",
                "order": "DESC",
                "sortMode": "MIN",
            }
        ],
        "offset": offset,
        "flipDirection": False,
        "size": BATCH_SIZE,
    }
    try:
        response = requests.post(
            API_LIST,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=30,
        )
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        print(f"Error fetching list page: {e}")
        raise


def fetch_institution_detail(uuid: str) -> Dict[str, Any]:
    """
    Fetch detailed metadata for a single institution.

    Args:
        uuid: Institution UUID

    Returns:
        Institution detail object, or {} on request failure (best-effort:
        a single failed detail must not abort the whole scrape).
    """
    try:
        response = requests.get(f"{API_DETAIL}/{uuid}", timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        print(f"Error fetching detail for {uuid}: {e}")
        return {}


def classify_institution_type(name: str, description: str = "") -> str:
    """
    Classify institution type from name and description.

    Patterns are tried longest-first so specific labels win over generic
    substrings (e.g. "univerzita" beats a co-occurring "archiv").

    Args:
        name: Institution name
        description: Institution description

    Returns:
        Institution type from GLAM taxonomy
    """
    text = f"{name} {description}".lower()
    for pattern, inst_type in _TYPE_PATTERNS:
        if pattern in text:
            return inst_type
    # Default to ARCHIVE (since we're scraping archive portal)
    return "ARCHIVE"


def extract_metadata(detail: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract metadata from institution detail response.

    Args:
        detail: Institution detail from API

    Returns:
        Extracted metadata dict
    """
    metadata = {
        "name": detail.get("name"),
        "description": detail.get("description"),
        "uuid": detail.get("id"),
    }
    # Flatten the nested parts/items structure into flat metadata keys.
    for part in detail.get("parts", []):
        for item in part.get("items", []):
            key = _ITEM_TYPE_TO_KEY.get(item.get("type"))
            if key is not None:
                metadata[key] = item.get("value")
    return metadata


def _fetch_institution_list() -> List[Dict[str, Any]]:
    """Phase 1: page through the filtered list endpoint; returns stubs with
    uuid/name/description.  Partial results are returned on interrupt/error."""
    institutions: List[Dict[str, Any]] = []
    offset = 0
    total_count = 0

    print("\nFetching institutions with API type filter...")
    print("(This API filter returns only institutions, not fonds or collections)")

    while True:
        try:
            response = fetch_list_page(offset)
            items = response.get("items", [])
            count = response.get("count", 0)

            # Store total count from first response
            if offset == 0:
                total_count = count
                print(f"\nTotal institutions: {total_count}")

            if not items:
                break

            # All items are institutions (API filtered)
            for item in items:
                institutions.append({
                    "uuid": item["id"],
                    "name": item["name"],
                    "description": item.get("description"),
                })

            # Progress
            fetched = offset + len(items)
            progress = (fetched / total_count * 100) if total_count > 0 else 0
            print(f"Progress: {fetched}/{total_count} institutions ({progress:.1f}%)", end="\r")

            # Check if more pages
            offset += len(items)
            if fetched >= total_count:
                break

            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)

        except KeyboardInterrupt:
            print("\n\nInterrupted by user. Saving partial results...")
            break
        except Exception as e:
            print(f"\nError during scraping: {e}")
            break

    return institutions


def _fetch_details(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Phase 2: fetch each institution's detail page, extract and classify
    its metadata.  Failures on individual records are skipped."""
    detailed: List[Dict[str, Any]] = []

    for idx, inst in enumerate(institutions, 1):
        try:
            detail = fetch_institution_detail(inst["uuid"])
            if detail:
                metadata = extract_metadata(detail)
                metadata["institution_type"] = classify_institution_type(
                    metadata.get("name", ""),
                    metadata.get("description", ""),
                )
                detailed.append(metadata)

            # Progress
            print(f"Progress: {idx}/{len(institutions)} institutions processed", end="\r")

            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)

        except KeyboardInterrupt:
            print("\n\nInterrupted by user. Saving partial results...")
            break
        except Exception as e:
            print(f"\nError fetching detail for {inst['uuid']}: {e}")
            continue

    return detailed


def scrape_all_institutions() -> None:
    """
    Scrape all institutions from ARON portal using type filter.

    Runs two phases (list, then per-record detail) and writes the combined
    result to OUTPUT_FILE.  Either phase may be interrupted with Ctrl-C;
    whatever was fetched so far is still saved.
    """
    print("=" * 70)
    print("ARON Archive Institution Scraper")
    print("=" * 70)

    institutions = _fetch_institution_list()
    print(f"\n\nPhase 1 complete: {len(institutions)} institutions fetched")

    print(f"\nPhase 2: Fetching detailed metadata for {len(institutions)} institutions...")
    detailed_institutions = _fetch_details(institutions)
    print(f"\n\nPhase 2 complete: {len(detailed_institutions)} institutions with details")

    # Save results
    print(f"\nSaving to {OUTPUT_FILE}...")
    save_institutions(detailed_institutions)

    print("\nScraping complete!")
    print(f"Total institutions: {len(detailed_institutions)}")


def save_institutions(institutions: List[Dict[str, Any]]) -> None:
    """
    Save institutions to YAML file.

    Args:
        institutions: List of institution metadata dicts
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Create LinkML-compliant records
    records = []
    for inst in institutions:
        record = {
            "id": f"https://w3id.org/heritage/custodian/cz/{inst['uuid']}",
            "name": inst.get("name"),
            "institution_type": inst.get("institution_type", "ARCHIVE"),
            "description": inst.get("description"),
        }

        # Identifiers
        identifiers = []
        if inst.get("uuid"):
            identifiers.append({
                "identifier_scheme": "ARON_UUID",
                "identifier_value": inst["uuid"],
                "identifier_url": f"https://portal.nacr.cz/aron/apu/{inst['uuid']}",
            })
        if inst.get("institution_code"):
            identifiers.append({
                "identifier_scheme": "INSTITUTION_CODE",
                "identifier_value": inst["institution_code"],
            })
        if identifiers:
            record["identifiers"] = identifiers

        # Website
        if inst.get("website"):
            if "identifiers" not in record:
                record["identifiers"] = []
            record["identifiers"].append({
                "identifier_scheme": "Website",
                "identifier_value": inst["website"],
                "identifier_url": inst["website"],
            })

        # Provenance
        record["provenance"] = {
            "data_source": "CONVERSATION_NLP",  # Will update to WEB_SCRAPING
            "data_tier": "TIER_1_AUTHORITATIVE",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "ARON API scraping via undocumented REST endpoint",
            "confidence_score": 0.85,
            "source_url": "https://portal.nacr.cz/aron/institution",
        }

        records.append(record)

    # Write YAML
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(records, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(f"Saved {len(records)} institutions to {OUTPUT_FILE}")


if __name__ == "__main__":
    scrape_all_institutions()