glam/scripts/scrapers/scrape_czech_archives_aron.py
2025-11-19 23:25:22 +01:00

365 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Scrape Czech Archive Institutions from ARON Portal API
The ARON (ARchiv ONline) portal has an undocumented REST API that provides
access to archive institutions and collections.
API Discovery:
- Found by reverse-engineering the portal.nacr.cz/aron/institution page
- Uses POST requests with JSON body containing filters
- Key discovery: type filter with value "INSTITUTION" returns only institutions
API Endpoints:
- List: POST https://portal.nacr.cz/aron/api/aron/apu/listview?listType=EVIDENCE-LIST
- Detail: GET https://portal.nacr.cz/aron/api/aron/apu/{uuid}
Filter Structure:
{
"filters": [{"field": "type", "operation": "EQ", "value": "INSTITUTION"}],
"offset": 0,
"size": 100
}
This script:
1. Fetches institutions using API type filter (~560 total)
2. Extracts metadata from each institution detail page
3. Outputs LinkML-compliant YAML for Czech archives
Estimated runtime: 10-15 minutes (560 institutions × 0.5s rate limit)
"""
import json
import time
import requests
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
import sys
# Constants
API_BASE = "https://portal.nacr.cz/aron/api/aron"
# List endpoint takes a POST with a JSON filter body (see fetch_list_page).
API_LIST = f"{API_BASE}/apu/listview?listType=EVIDENCE-LIST"
# Detail endpoint is a plain GET: {API_DETAIL}/{uuid}.
API_DETAIL = f"{API_BASE}/apu"
BATCH_SIZE = 100  # Records per API request
RATE_LIMIT_DELAY = 0.5  # Seconds between requests (2 req/sec max)
OUTPUT_DIR = Path("data/instances")
OUTPUT_FILE = OUTPUT_DIR / "czech_archives_aron.yaml"
# Type mapping from ARON metadata types to GLAM taxonomy.
# Keys are lowercase Czech keywords matched as substrings of
# "<name> <description>" in classify_institution_type(); the first
# matching entry wins, so dict insertion order matters (more specific
# archive variants are listed before the generic museum/gallery terms).
TYPE_MAPPING = {
    "archiv": "ARCHIVE",
    "státní archiv": "ARCHIVE",
    "oblastní archiv": "ARCHIVE",
    "okresní archiv": "ARCHIVE",
    "městský archiv": "ARCHIVE",
    "archiv města": "ARCHIVE",
    "muzeum": "MUSEUM",
    "galerie": "GALLERY",
    "knihovna": "LIBRARY",
    "univerzita": "EDUCATION_PROVIDER",
    "vysoká škola": "EDUCATION_PROVIDER",
    "památník": "MUSEUM",
}
def fetch_list_page(offset: int = 0) -> Dict[str, Any]:
    """
    Request one page of institution records from the ARON list endpoint.

    The POST body carries a type == INSTITUTION filter, so the response
    contains only institutions (not fonds or collections).

    Args:
        offset: Record offset for pagination

    Returns:
        Parsed JSON response with items and pagination info

    Raises:
        requests.RequestException: re-raised after logging, so the caller
        can decide whether to abort the whole run.
    """
    body = {
        "filters": [
            {"field": "type", "operation": "EQ", "value": "INSTITUTION"}
        ],
        "sort": [
            {"field": "name", "type": "SCORE", "order": "DESC", "sortMode": "MIN"}
        ],
        "offset": offset,
        "flipDirection": False,
        "size": BATCH_SIZE,
    }
    try:
        resp = requests.post(
            API_LIST,
            json=body,
            headers={"Content-Type": "application/json"},
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as e:
        # Log and propagate: a failed list page means pagination state is
        # unreliable, so the caller should stop rather than skip.
        print(f"Error fetching list page: {e}")
        raise
def fetch_institution_detail(uuid: str) -> Dict[str, Any]:
    """
    Fetch detailed metadata for a single institution.

    Unlike fetch_list_page, failures here are non-fatal: one broken
    record should not abort the whole scrape, so an empty dict is
    returned instead of raising.

    Args:
        uuid: Institution UUID

    Returns:
        Institution detail object, or {} on any request error
    """
    url = f"{API_DETAIL}/{uuid}"
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching detail for {uuid}: {e}")
        return {}
    return resp.json()
def classify_institution_type(name: str, description: str = "") -> str:
    """
    Classify an institution into the GLAM taxonomy.

    Performs a case-insensitive substring search of the known Czech
    keywords in TYPE_MAPPING against "<name> <description>"; the first
    matching keyword (in mapping insertion order) decides the type.

    Args:
        name: Institution name
        description: Institution description

    Returns:
        Institution type from GLAM taxonomy; defaults to "ARCHIVE"
        because the source portal is an archive registry.
    """
    haystack = f"{name} {description}".lower()
    return next(
        (kind for keyword, kind in TYPE_MAPPING.items() if keyword in haystack),
        "ARCHIVE",
    )
def extract_metadata(detail: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten an ARON institution detail response into a metadata dict.

    Top-level name/description/id are copied unconditionally (None when
    absent); known "INST~*" typed items found under parts[].items[] are
    mapped onto flat keys, unknown item types are ignored.

    Args:
        detail: Institution detail from API

    Returns:
        Extracted metadata dict
    """
    # Dispatch table: ARON item type -> output key.
    field_for_type = {
        "INST~CODE": "institution_code",
        "INST~SHORT~NAME": "short_name",
        "INST~ADDRESS": "address",
        "INST~PHONE": "phone",
        "INST~EMAIL": "email",
        "INST~URL": "website",
    }
    metadata: Dict[str, Any] = {
        "name": detail.get("name"),
        "description": detail.get("description"),
        "uuid": detail.get("id"),
    }
    for part in detail.get("parts", []):
        for entry in part.get("items", []):
            key = field_for_type.get(entry.get("type"))
            if key is not None:
                metadata[key] = entry.get("value")
    return metadata
def scrape_all_institutions():
    """
    Scrape all institutions from ARON portal using type filter.

    Two phases:
      1. Page through the filtered list endpoint collecting
         uuid/name/description stubs.
      2. Fetch each institution's detail record, flatten its metadata,
         and classify its GLAM type.

    Whatever was collected (including partial results after Ctrl-C or an
    error) is written to OUTPUT_FILE via save_institutions().
    """
    print("=" * 70)
    print("ARON Archive Institution Scraper")
    print("=" * 70)
    institutions = []
    offset = 0
    total_count = 0
    # Fetch institution list with type filter (no need for name filtering!)
    print("\nFetching institutions with API type filter...")
    print("(This API filter returns only institutions, not fonds or collections)")
    while True:
        try:
            # Fetch page
            response = fetch_list_page(offset)
            items = response.get("items", [])
            count = response.get("count", 0)
            # Store total count from first response
            if offset == 0:
                total_count = count
                print(f"\nTotal institutions: {total_count}")
            # Empty page means the server has no more records, regardless
            # of what total_count claimed.
            if not items:
                break
            # All items are institutions (API filtered)
            for item in items:
                institutions.append({
                    "uuid": item["id"],
                    "name": item["name"],
                    "description": item.get("description")
                })
            # Progress (overwritten in place via carriage return)
            fetched = offset + len(items)
            progress = (fetched / total_count * 100) if total_count > 0 else 0
            print(f"Progress: {fetched}/{total_count} institutions ({progress:.1f}%)", end="\r")
            # Check if more pages
            offset += len(items)
            if fetched >= total_count:
                break
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)
        except KeyboardInterrupt:
            # NOTE(review): Ctrl-C only ends phase 1; phase 2 still runs on
            # whatever was collected so partial results get detailed — a
            # second Ctrl-C is needed to skip phase 2 as well.
            print("\n\nInterrupted by user. Saving partial results...")
            break
        except Exception as e:
            # Any other failure aborts pagination but keeps what we have.
            print(f"\nError during scraping: {e}")
            break
    print(f"\n\nPhase 1 complete: {len(institutions)} institutions fetched")
    # Second pass: Fetch details for each institution
    print(f"\nPhase 2: Fetching detailed metadata for {len(institutions)} institutions...")
    detailed_institutions = []
    for idx, inst in enumerate(institutions, 1):
        try:
            # Fetch detail (returns {} on request error, which is skipped)
            detail = fetch_institution_detail(inst["uuid"])
            if detail:
                metadata = extract_metadata(detail)
                metadata["institution_type"] = classify_institution_type(
                    metadata.get("name", ""),
                    metadata.get("description", "")
                )
                detailed_institutions.append(metadata)
            # Progress
            print(f"Progress: {idx}/{len(institutions)} institutions processed", end="\r")
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)
        except KeyboardInterrupt:
            print("\n\nInterrupted by user. Saving partial results...")
            break
        except Exception as e:
            # Per-record failures are non-fatal: log and move on.
            print(f"\nError fetching detail for {inst['uuid']}: {e}")
            continue
    print(f"\n\nPhase 2 complete: {len(detailed_institutions)} institutions with details")
    # Save results
    print(f"\nSaving to {OUTPUT_FILE}...")
    save_institutions(detailed_institutions)
    print("\nScraping complete!")
    print(f"Total institutions: {len(detailed_institutions)}")
def save_institutions(institutions: List[Dict[str, Any]]):
    """
    Serialize institution metadata to OUTPUT_FILE as LinkML-style YAML.

    Each input dict becomes one record with a w3id identifier URI, typed
    identifier entries (ARON UUID, institution code, website — when
    present), and a provenance block.

    Args:
        institutions: List of institution metadata dicts
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    records = []
    for inst in institutions:
        # Collect all identifier entries first; the key is only set on the
        # record when at least one identifier exists.
        identifiers = []
        if inst.get("uuid"):
            identifiers.append({
                "identifier_scheme": "ARON_UUID",
                "identifier_value": inst["uuid"],
                "identifier_url": f"https://portal.nacr.cz/aron/apu/{inst['uuid']}",
            })
        if inst.get("institution_code"):
            identifiers.append({
                "identifier_scheme": "INSTITUTION_CODE",
                "identifier_value": inst["institution_code"],
            })
        if inst.get("website"):
            identifiers.append({
                "identifier_scheme": "Website",
                "identifier_value": inst["website"],
                "identifier_url": inst["website"],
            })

        record = {
            "id": f"https://w3id.org/heritage/custodian/cz/{inst['uuid']}",
            "name": inst.get("name"),
            "institution_type": inst.get("institution_type", "ARCHIVE"),
            "description": inst.get("description"),
        }
        if identifiers:
            record["identifiers"] = identifiers
        # Provenance
        record["provenance"] = {
            "data_source": "CONVERSATION_NLP",  # Will update to WEB_SCRAPING
            "data_tier": "TIER_1_AUTHORITATIVE",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "ARON API scraping via undocumented REST endpoint",
            "confidence_score": 0.85,
            "source_url": "https://portal.nacr.cz/aron/institution",
        }
        records.append(record)

    # Write YAML (sort_keys=False keeps the record key order defined above)
    import yaml
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(records, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(f"Saved {len(records)} institutions to {OUTPUT_FILE}")
# Script entry point: run the full two-phase scrape when executed directly.
if __name__ == "__main__":
    scrape_all_institutions()