365 lines
11 KiB
Python
365 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Scrape Czech Archive Institutions from ARON Portal API
|
||
|
||
The ARON (ARchiv ONline) portal has an undocumented REST API that provides
|
||
access to archive institutions and collections.
|
||
|
||
API Discovery:
|
||
- Found by reverse-engineering the portal.nacr.cz/aron/institution page
|
||
- Uses POST requests with JSON body containing filters
|
||
- Key discovery: type filter with value "INSTITUTION" returns only institutions
|
||
|
||
API Endpoints:
|
||
- List: POST https://portal.nacr.cz/aron/api/aron/apu/listview?listType=EVIDENCE-LIST
|
||
- Detail: GET https://portal.nacr.cz/aron/api/aron/apu/{uuid}
|
||
|
||
Filter Structure:
|
||
{
|
||
"filters": [{"field": "type", "operation": "EQ", "value": "INSTITUTION"}],
|
||
"offset": 0,
|
||
"size": 100
|
||
}
|
||
|
||
This script:
|
||
1. Fetches institutions using API type filter (~560 total)
|
||
2. Extracts metadata from each institution detail page
|
||
3. Outputs LinkML-compliant YAML for Czech archives
|
||
|
||
Estimated runtime: 10-15 minutes (560 institutions × 0.5s rate limit)
|
||
"""
|
||
|
||
import json
|
||
import time
|
||
import requests
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
from typing import List, Dict, Any, Optional
|
||
import sys
|
||
|
||
# Constants
API_BASE = "https://portal.nacr.cz/aron/api/aron"
# List endpoint: takes a POST with a JSON filter body (see fetch_list_page)
API_LIST = f"{API_BASE}/apu/listview?listType=EVIDENCE-LIST"
# Detail endpoint: GET {API_DETAIL}/{uuid}
API_DETAIL = f"{API_BASE}/apu"

BATCH_SIZE = 100  # Records per API request
RATE_LIMIT_DELAY = 0.5  # Seconds between requests (2 req/sec max)

OUTPUT_DIR = Path("data/instances")
OUTPUT_FILE = OUTPUT_DIR / "czech_archives_aron.yaml"

# Type mapping from ARON metadata types to GLAM taxonomy.
# Keys are lowercase Czech keywords matched as substrings of
# "<name> <description>" in classify_institution_type(); insertion order
# matters — the first key found in the text wins.
TYPE_MAPPING = {
    "archiv": "ARCHIVE",                   # archive
    "státní archiv": "ARCHIVE",            # state archive
    "oblastní archiv": "ARCHIVE",          # regional archive
    "okresní archiv": "ARCHIVE",           # district archive
    "městský archiv": "ARCHIVE",           # municipal archive
    "archiv města": "ARCHIVE",             # city archive
    "muzeum": "MUSEUM",                    # museum
    "galerie": "GALLERY",                  # gallery
    "knihovna": "LIBRARY",                 # library
    "univerzita": "EDUCATION_PROVIDER",    # university
    "vysoká škola": "EDUCATION_PROVIDER",  # college / higher-education school
    "památník": "MUSEUM",                  # memorial
}
|
||
|
||
|
||
def fetch_list_page(offset: int = 0) -> Dict[str, Any]:
    """
    Retrieve one page of institution records from the ARON list endpoint.

    The request body carries a type=INSTITUTION filter so the API returns
    only institutions (no fonds or other collection types).

    Args:
        offset: Record offset for pagination

    Returns:
        Parsed JSON response with items and pagination info

    Raises:
        requests.RequestException: re-raised after logging on any HTTP failure
    """
    type_filter = {"field": "type", "operation": "EQ", "value": "INSTITUTION"}
    name_sort = {
        "field": "name",
        "type": "SCORE",
        "order": "DESC",
        "sortMode": "MIN",
    }
    payload = {
        "filters": [type_filter],
        "sort": [name_sort],
        "offset": offset,
        "flipDirection": False,
        "size": BATCH_SIZE,
    }

    try:
        resp = requests.post(
            API_LIST,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as e:
        print(f"Error fetching list page: {e}")
        raise
|
||
|
||
|
||
def fetch_institution_detail(uuid: str) -> Dict[str, Any]:
    """
    Download the full detail record for a single institution.

    Args:
        uuid: Institution UUID

    Returns:
        Parsed detail object, or an empty dict on request failure
        (errors are logged, not raised, so one bad record does not
        abort the whole scrape).
    """
    url = f"{API_DETAIL}/{uuid}"
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as e:
        print(f"Error fetching detail for {uuid}: {e}")
        return {}
|
||
|
||
|
||
def classify_institution_type(name: str, description: str = "") -> str:
    """
    Classify an institution into the GLAM taxonomy by keyword matching.

    The combined name + description text is lowercased and scanned for
    the Czech keywords in TYPE_MAPPING; the first key found as a
    substring determines the type.

    Args:
        name: Institution name
        description: Institution description

    Returns:
        Institution type from the GLAM taxonomy. Falls back to "ARCHIVE"
        since the source is an archive portal.
    """
    haystack = f"{name} {description}".lower()
    return next(
        (kind for keyword, kind in TYPE_MAPPING.items() if keyword in haystack),
        "ARCHIVE",
    )
|
||
|
||
|
||
def extract_metadata(detail: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten an institution detail response into a simple metadata dict.

    Top-level name/description/id are copied directly; contact fields
    are scattered across the nested parts/items structure and are mapped
    to flat output keys via a small dispatch table.

    Args:
        detail: Institution detail from API

    Returns:
        Extracted metadata dict
    """
    # ARON item type -> flat output key; unknown types are ignored.
    item_keys = {
        "INST~CODE": "institution_code",
        "INST~SHORT~NAME": "short_name",
        "INST~ADDRESS": "address",
        "INST~PHONE": "phone",
        "INST~EMAIL": "email",
        "INST~URL": "website",
    }

    metadata: Dict[str, Any] = {
        "name": detail.get("name"),
        "description": detail.get("description"),
        "uuid": detail.get("id"),
    }

    for part in detail.get("parts", []):
        for item in part.get("items", []):
            key = item_keys.get(item.get("type"))
            if key is not None:
                # Later duplicates overwrite earlier ones, as before.
                metadata[key] = item.get("value")

    return metadata
|
||
|
||
|
||
def scrape_all_institutions():
    """
    Scrape all institutions from the ARON portal using the API type filter.

    Phase 1 pages through the filtered list endpoint to collect stub
    records (uuid/name/description); phase 2 fetches per-institution
    detail and classifies each one; finally everything collected is
    written to OUTPUT_FILE. A KeyboardInterrupt in either phase stops
    that phase but still saves the partial results.
    """
    print("=" * 70)
    print("ARON Archive Institution Scraper")
    print("=" * 70)

    institutions = _fetch_institution_list()
    print(f"\n\nPhase 1 complete: {len(institutions)} institutions fetched")

    print(f"\nPhase 2: Fetching detailed metadata for {len(institutions)} institutions...")
    detailed_institutions = _fetch_institution_details(institutions)
    print(f"\n\nPhase 2 complete: {len(detailed_institutions)} institutions with details")

    # Save results (also reached on interrupt, so partial data is kept)
    print(f"\nSaving to {OUTPUT_FILE}...")
    save_institutions(detailed_institutions)

    print("\nScraping complete!")
    print(f"Total institutions: {len(detailed_institutions)}")


def _fetch_institution_list() -> List[Dict[str, Any]]:
    """Phase 1: page through the list endpoint; return uuid/name/description stubs."""
    print("\nFetching institutions with API type filter...")
    print("(This API filter returns only institutions, not fonds or collections)")

    institutions: List[Dict[str, Any]] = []
    offset = 0
    total_count = 0

    while True:
        try:
            response = fetch_list_page(offset)
            items = response.get("items", [])

            # The total is reported on every page; remember the first one.
            if offset == 0:
                total_count = response.get("count", 0)
                print(f"\nTotal institutions: {total_count}")

            if not items:
                break

            # Every item is an institution thanks to the API-side filter.
            for item in items:
                institutions.append({
                    "uuid": item["id"],
                    "name": item["name"],
                    "description": item.get("description"),
                })

            fetched = offset + len(items)
            progress = (fetched / total_count * 100) if total_count > 0 else 0
            print(f"Progress: {fetched}/{total_count} institutions ({progress:.1f}%)", end="\r")

            offset += len(items)
            if fetched >= total_count:
                break

            # Rate limiting: stay well under ~2 requests/second
            time.sleep(RATE_LIMIT_DELAY)

        except KeyboardInterrupt:
            print("\n\nInterrupted by user. Saving partial results...")
            break
        except Exception as e:
            print(f"\nError during scraping: {e}")
            break

    return institutions


def _fetch_institution_details(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Phase 2: fetch detail for each stub, classify its type, return enriched records."""
    detailed: List[Dict[str, Any]] = []
    for idx, inst in enumerate(institutions, 1):
        try:
            detail = fetch_institution_detail(inst["uuid"])

            if detail:
                metadata = extract_metadata(detail)
                # `or ""` guards against keys present with a None value,
                # which .get(key, "") does not cover.
                metadata["institution_type"] = classify_institution_type(
                    metadata.get("name") or "",
                    metadata.get("description") or "",
                )
                detailed.append(metadata)

            print(f"Progress: {idx}/{len(institutions)} institutions processed", end="\r")

            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)

        except KeyboardInterrupt:
            print("\n\nInterrupted by user. Saving partial results...")
            break
        except Exception as e:
            print(f"\nError fetching detail for {inst['uuid']}: {e}")
            continue

    return detailed
|
||
|
||
|
||
def save_institutions(institutions: List[Dict[str, Any]]):
    """
    Save institutions to OUTPUT_FILE as LinkML-compliant YAML.

    Each record gets a w3id-based id, the classified institution type,
    any available identifiers (ARON UUID, institution code, website)
    and a provenance block describing this scrape run.

    Args:
        institutions: List of institution metadata dicts
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    records = []
    for inst in institutions:
        record = {
            "id": f"https://w3id.org/heritage/custodian/cz/{inst['uuid']}",
            "name": inst.get("name"),
            "institution_type": inst.get("institution_type", "ARCHIVE"),
            "description": inst.get("description"),
        }

        # Build all identifiers in one list (UUID, code, website) and only
        # attach the key when at least one exists — avoids the previous
        # duplicated "create the list if missing" handling for the website.
        identifiers = []
        if inst.get("uuid"):
            identifiers.append({
                "identifier_scheme": "ARON_UUID",
                "identifier_value": inst["uuid"],
                "identifier_url": f"https://portal.nacr.cz/aron/apu/{inst['uuid']}",
            })
        if inst.get("institution_code"):
            identifiers.append({
                "identifier_scheme": "INSTITUTION_CODE",
                "identifier_value": inst["institution_code"],
            })
        if inst.get("website"):
            identifiers.append({
                "identifier_scheme": "Website",
                "identifier_value": inst["website"],
                "identifier_url": inst["website"],
            })
        if identifiers:
            record["identifiers"] = identifiers

        # Provenance
        record["provenance"] = {
            "data_source": "CONVERSATION_NLP",  # Will update to WEB_SCRAPING
            "data_tier": "TIER_1_AUTHORITATIVE",
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "extraction_method": "ARON API scraping via undocumented REST endpoint",
            "confidence_score": 0.85,
            "source_url": "https://portal.nacr.cz/aron/institution",
        }

        records.append(record)

    # PyYAML is only needed here, so the import stays local to the function.
    import yaml
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(records, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(f"Saved {len(records)} institutions to {OUTPUT_FILE}")
|
||
|
||
|
||
if __name__ == "__main__":
    # Run the full two-phase scrape when invoked as a script.
    scrape_all_institutions()
|