glam/scripts/test_nationaalarchief_endpoints.py
2025-12-14 17:09:55 +01:00

796 lines
27 KiB
Python

#!/usr/bin/env python3
"""
Nationaal Archief Data Service Endpoint Testing Script
Tests all documented API endpoints for the Nationaal Archief (NL-ZH-DHA-A-NA):
1. OAI-PMH endpoint (EAD/XML harvesting)
2. METS API (metadata files)
3. File API (scan downloads)
4. SPARQL webservice (photo collection LOD)
5. Ontology endpoint (NAO)
6. Photo API (currently offline)
Usage:
python scripts/test_nationaalarchief_endpoints.py [--verbose] [--output-dir DIR]
References:
- Data file: data/custodian/NL-ZH-DHA-A-NA.yaml (lines 573-750)
- Documentation: https://www.nationaalarchief.nl/onderzoeken/open-data
"""
import argparse
import json
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from xml.etree import ElementTree as ET
import requests
# ============================================================================
# Configuration
# ============================================================================
# Static description of every endpoint this script knows about, keyed by the
# short name accepted by the --endpoint command-line option.
#
# Keys read by the test functions in this file:
#   id, name, protocol, expected_status -> copied into the EndpointReport
#   base_url      -> URL that is requested (None for the photo API, which is
#                    reported as offline without any request being made)
#   sample_uuid   -> appended to base_url by test_mets_api
#   sample_uuids  -> {file_type: uuid}; test_file_api requests
#                    base_url/<file_type>/<uuid> for each pair
# "deprecated_url" is informational only; nothing in this file reads it.
ENDPOINTS: dict[str, dict[str, Any]] = {
    "oai_pmh": {
        "id": "nationaalarchief-oai-pmh-ead",
        "name": "OAI-PMH Endpoint (EAD/XML)",
        "base_url": "https://service.archief.nl/gaf/oai/!open_oai.OAIHandler",
        "protocol": "OAI-PMH 2.0",
        "expected_status": "ACTIVE",
    },
    "mets_api": {
        "id": "nationaalarchief-mets-api",
        "name": "METS API",
        "base_url": "https://service.archief.nl/gaf/api/mets/v1",
        "protocol": "REST",
        "expected_status": "ACTIVE",
        "sample_uuid": "48f1f22f-1228-4b00-9720-5816a07b4003",
    },
    "file_api": {
        "id": "nationaalarchief-file-api",
        "name": "File Download API",
        "base_url": "https://service.archief.nl/gaf/api/file/v1",
        "protocol": "REST",
        "expected_status": "ACTIVE",
        "sample_uuids": {
            "default": "835776c2-fb57-47eb-b537-b82758b6558a",
            "thumb": "834a6c29-61f1-4926-94bd-674132d25fd5",
        },
    },
    "sparql": {
        "id": "nationaalarchief-sparql-webservice",
        "name": "SPARQL Webservice",
        "base_url": "https://www.nationaalarchief.nl/onderzoeken/sparql",
        "protocol": "SPARQL",
        "expected_status": "ACTIVE",
    },
    "ontology": {
        "id": "nationaalarchief-ontology",
        "name": "Nationaal Archief Ontologie (NAO)",
        "base_url": "https://raw.githubusercontent.com/NationaalArchief/LOD/master/archief.nl-def-ontologie.json",
        "deprecated_url": "https://archief.nl/def/ontologie/",  # SSL/403 issues
        "protocol": "HTTP",
        "expected_status": "ACTIVE",
    },
    "photo_api": {
        "id": "nationaalarchief-photo-api",
        "name": "Photo API",
        "base_url": None,  # Currently offline
        "protocol": "REST",
        "expected_status": "OFFLINE",
    },
}
# OAI-PMH verbs to test.
# Each entry is (verb, extra query parameters); test_oai_pmh merges the extras
# with {"verb": verb} and sends them as the GET query string.
OAI_PMH_VERBS: list[tuple[str, dict[str, str]]] = [
    ("Identify", {}),
    ("ListMetadataFormats", {}),
    ("ListSets", {}),
    ("ListIdentifiers", {"metadataPrefix": "oai_ead", "set": "2.21.205.69"}),
    ("GetRecord", {"metadataPrefix": "oai_ead", "identifier": "2.21.205.69"}),
]
# Sample SPARQL query for photo collection.
# test_sparql POSTs this text as the "query" form field; it selects up to five
# edm:ProvidedCHO resources with their optional title, creator and date.
SPARQL_QUERY: str = """
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX edm: <http://www.europeana.eu/schemas/edm/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?photo ?title ?creator ?date
WHERE {
?photo a edm:ProvidedCHO .
OPTIONAL { ?photo dc:title ?title }
OPTIONAL { ?photo dc:creator ?creator }
OPTIONAL { ?photo dc:date ?date }
}
LIMIT 5
"""
# ============================================================================
# Data Classes
# ============================================================================
@dataclass
class TestResult:
    """Result of a single endpoint test."""

    # "id" value of the ENDPOINTS entry the check belongs to.
    endpoint_id: str
    # Human-readable label of the individual check (e.g. "OAI-PMH Identify").
    test_name: str
    # True when the check passed.
    success: bool
    # HTTP status code of the response; stays None when no response was received.
    status_code: int | None = None
    # Request duration in milliseconds; stays None when no response was received.
    response_time_ms: float | None = None
    # Value of the response's Content-Type header, if present.
    content_type: str | None = None
    # Failure description (request exception text, OAI error, XML parse error).
    error_message: str | None = None
    # Free-form, check-specific extras such as counts, sample IDs or notes.
    details: dict[str, Any] = field(default_factory=dict)
@dataclass
class EndpointReport:
    """Aggregated report for an endpoint."""

    # "id" and "name" values copied from the endpoint's ENDPOINTS entry.
    endpoint_id: str
    endpoint_name: str
    # Protocol label copied from the ENDPOINTS entry (e.g. "REST", "SPARQL").
    protocol: str
    # Status the configuration expects for this endpoint.
    expected_status: str
    # Status derived from the checks; the test functions set it to one of
    # "ACTIVE", "DEGRADED" or "OFFLINE" ("UNKNOWN" is the initial placeholder).
    actual_status: str
    # Counters maintained by the test functions alongside ``tests``.
    tests_passed: int = 0
    tests_failed: int = 0
    # Individual check results, in execution order.
    tests: list[TestResult] = field(default_factory=list)
# ============================================================================
# Test Functions
# ============================================================================
def test_oai_pmh(verbose: bool = False) -> EndpointReport:
    """Test the OAI-PMH endpoint with every verb listed in ``OAI_PMH_VERBS``.

    Each verb is sent as a GET request with a 30 second timeout. A check
    passes when the server answers HTTP 200 with well-formed XML that does not
    contain an OAI-PMH ``error`` element.

    Args:
        verbose: Print one line per check (plus details/errors) while running.

    Returns:
        An ``EndpointReport`` whose ``actual_status`` is ``"ACTIVE"`` when all
        checks pass, ``"DEGRADED"`` when only some pass, and ``"OFFLINE"``
        when none pass.
    """
    config = ENDPOINTS["oai_pmh"]
    report = EndpointReport(
        endpoint_id=config["id"],
        endpoint_name=config["name"],
        protocol=config["protocol"],
        expected_status=config["expected_status"],
        actual_status="UNKNOWN",
    )
    base_url = config["base_url"]
    # Namespace map is loop-invariant, so build it once.
    ns = {"oai": "http://www.openarchives.org/OAI/2.0/"}

    for verb, params in OAI_PMH_VERBS:
        test_name = f"OAI-PMH {verb}"
        params_with_verb = {"verb": verb, **params}
        try:
            # perf_counter is monotonic; time.time() can jump when the system
            # clock is adjusted and would then report bogus latencies.
            start = time.perf_counter()
            response = requests.get(base_url, params=params_with_verb, timeout=30)
            elapsed_ms = (time.perf_counter() - start) * 1000
        except requests.RequestException as e:
            result = TestResult(
                endpoint_id=config["id"],
                test_name=test_name,
                success=False,
                error_message=str(e),
            )
            if verbose:
                print(f" [-] {test_name}: ERROR - {e}")
        else:
            result = TestResult(
                endpoint_id=config["id"],
                test_name=test_name,
                success=response.status_code == 200,
                status_code=response.status_code,
                response_time_ms=round(elapsed_ms, 2),
                content_type=response.headers.get("Content-Type"),
            )
            if response.status_code == 200:
                try:
                    root = ET.fromstring(response.content)
                except ET.ParseError as e:
                    result.success = False
                    result.error_message = f"XML parse error: {e}"
                else:
                    # OAI-PMH reports protocol errors inside a 200 response.
                    error = root.find(".//oai:error", ns)
                    if error is not None:
                        result.success = False
                        result.error_message = f"OAI error: {error.get('code')} - {error.text}"
                    elif verb == "Identify":
                        repo_name = root.find(".//oai:repositoryName", ns)
                        if repo_name is not None:
                            result.details["repository_name"] = repo_name.text
                    elif verb == "ListSets":
                        result.details["set_count"] = len(root.findall(".//oai:set", ns))
                    elif verb == "GetRecord":
                        record = root.find(".//oai:record", ns)
                        result.details["record_found"] = record is not None
            if verbose:
                print(f" [{'+' if result.success else '-'}] {test_name}: "
                      f"{result.status_code} ({result.response_time_ms:.0f}ms)")
                if result.details:
                    print(f" Details: {result.details}")
                if result.error_message:
                    print(f" Error: {result.error_message}")

        report.tests.append(result)
        if result.success:
            report.tests_passed += 1
        else:
            report.tests_failed += 1

    # Determine actual status from the number of checks that were run.
    if report.tests_passed == len(report.tests):
        report.actual_status = "ACTIVE"
    elif report.tests_passed > 0:
        report.actual_status = "DEGRADED"
    else:
        report.actual_status = "OFFLINE"
    return report
def test_mets_api(verbose: bool = False) -> EndpointReport:
    """Test the METS API by fetching the configured sample METS document.

    The check passes when ``base_url/<sample_uuid>`` answers HTTP 200 with
    well-formed XML. The number of ``mets:file`` elements and the ``ID``
    attribute of the first three are recorded as details.

    Args:
        verbose: Print the check outcome (plus details/errors) while running.

    Returns:
        An ``EndpointReport`` whose ``actual_status`` is ``"ACTIVE"`` when the
        check passes and ``"OFFLINE"`` otherwise.
    """
    config = ENDPOINTS["mets_api"]
    report = EndpointReport(
        endpoint_id=config["id"],
        endpoint_name=config["name"],
        protocol=config["protocol"],
        expected_status=config["expected_status"],
        actual_status="UNKNOWN",
    )
    sample_uuid = config["sample_uuid"]
    url = f"{config['base_url']}/{sample_uuid}"
    test_name = "METS file retrieval"

    try:
        # Monotonic clock: immune to system clock adjustments during the call.
        start = time.perf_counter()
        response = requests.get(url, timeout=30)
        elapsed_ms = (time.perf_counter() - start) * 1000
    except requests.RequestException as e:
        result = TestResult(
            endpoint_id=config["id"],
            test_name=test_name,
            success=False,
            error_message=str(e),
        )
        if verbose:
            print(f" [-] {test_name}: ERROR - {e}")
    else:
        result = TestResult(
            endpoint_id=config["id"],
            test_name=test_name,
            success=response.status_code == 200,
            status_code=response.status_code,
            response_time_ms=round(elapsed_ms, 2),
            content_type=response.headers.get("Content-Type"),
        )
        if response.status_code == 200:
            try:
                root = ET.fromstring(response.content)
            except ET.ParseError as e:
                result.success = False
                result.error_message = f"XML parse error: {e}"
            else:
                # Count file entries in METS
                ns = {"mets": "http://www.loc.gov/METS/"}
                files = root.findall(".//mets:file", ns)
                result.details["file_count"] = len(files)
                result.details["mets_uuid"] = sample_uuid
                # Keep the first few file IDs for reference
                file_ids = [f.get("ID", "") for f in files[:3]]
                if file_ids:
                    result.details["sample_file_ids"] = file_ids
        if verbose:
            print(f" [{'+' if result.success else '-'}] {test_name}: "
                  f"{result.status_code} ({result.response_time_ms:.0f}ms)")
            if result.details:
                print(f" Details: {result.details}")
            if result.error_message:
                print(f" Error: {result.error_message}")

    report.tests.append(result)
    if result.success:
        report.tests_passed += 1
        report.actual_status = "ACTIVE"
    else:
        report.tests_failed += 1
        report.actual_status = "OFFLINE"
    return report
def test_file_api(verbose: bool = False) -> EndpointReport:
    """Test the File Download API for every configured sample file.

    For each ``file_type -> uuid`` pair in the endpoint's ``sample_uuids`` a
    HEAD request is sent to ``base_url/<file_type>/<uuid>`` so that the file
    body is not downloaded. If the server rejects HEAD (405/501) the check is
    retried as a streamed GET whose body is never read. A check passes on
    HTTP 200.

    Args:
        verbose: Print one line per check (plus details) while running.

    Returns:
        An ``EndpointReport`` whose ``actual_status`` is ``"ACTIVE"`` when all
        checks pass, ``"DEGRADED"`` when only some pass, and ``"OFFLINE"``
        when none pass.
    """
    config = ENDPOINTS["file_api"]
    report = EndpointReport(
        endpoint_id=config["id"],
        endpoint_name=config["name"],
        protocol=config["protocol"],
        expected_status=config["expected_status"],
        actual_status="UNKNOWN",
    )

    for file_type, file_uuid in config["sample_uuids"].items():
        url = f"{config['base_url']}/{file_type}/{file_uuid}"
        test_name = f"File download ({file_type})"
        used_get_fallback = False
        try:
            # Monotonic clock: immune to system clock adjustments.
            start = time.perf_counter()
            # Use HEAD request first to avoid downloading large files
            response = requests.head(url, timeout=30, allow_redirects=True)
            if response.status_code in (405, 501):
                # HEAD not supported: a streamed GET fetches only the headers
                # as long as the body is never read.
                used_get_fallback = True
                response = requests.get(
                    url, timeout=30, allow_redirects=True, stream=True
                )
                response.close()
            elapsed_ms = (time.perf_counter() - start) * 1000
        except requests.RequestException as e:
            result = TestResult(
                endpoint_id=config["id"],
                test_name=test_name,
                success=False,
                error_message=str(e),
            )
            if verbose:
                print(f" [-] {test_name}: ERROR - {e}")
        else:
            result = TestResult(
                endpoint_id=config["id"],
                test_name=test_name,
                success=response.status_code == 200,
                status_code=response.status_code,
                response_time_ms=round(elapsed_ms, 2),
                content_type=response.headers.get("Content-Type"),
            )
            if response.status_code == 200:
                content_length = response.headers.get("Content-Length")
                # A malformed header must not crash the whole test run.
                if content_length and content_length.strip().isdigit():
                    result.details["content_length_bytes"] = int(content_length)
                result.details["file_type"] = file_type
                result.details["uuid"] = file_uuid
                if used_get_fallback:
                    result.details["method"] = "GET"
            if verbose:
                print(f" [{'+' if result.success else '-'}] {test_name}: "
                      f"{result.status_code} ({result.response_time_ms:.0f}ms)")
                if result.details:
                    print(f" Details: {result.details}")

        report.tests.append(result)
        if result.success:
            report.tests_passed += 1
        else:
            report.tests_failed += 1

    # Determine actual status
    if report.tests_passed == len(config["sample_uuids"]):
        report.actual_status = "ACTIVE"
    elif report.tests_passed > 0:
        report.actual_status = "DEGRADED"
    else:
        report.actual_status = "OFFLINE"
    return report
def test_sparql(verbose: bool = False) -> EndpointReport:
    """Test the SPARQL webservice endpoint.

    Two checks are run against the endpoint's ``base_url``:

    1. A plain GET (30 s timeout) that passes on HTTP 200.
    2. A POST of ``SPARQL_QUERY`` as the ``query`` form field (60 s timeout),
       asking for SPARQL JSON results. It passes on HTTP 200; when the body is
       not JSON a note is recorded instead of result details.

    Args:
        verbose: Print one line per check (plus details) while running.

    Returns:
        An ``EndpointReport`` whose ``actual_status`` is ``"ACTIVE"`` when
        both checks pass, ``"DEGRADED"`` when one passes, and ``"OFFLINE"``
        when none pass.
    """
    config = ENDPOINTS["sparql"]
    report = EndpointReport(
        endpoint_id=config["id"],
        endpoint_name=config["name"],
        protocol=config["protocol"],
        expected_status=config["expected_status"],
        actual_status="UNKNOWN",
    )

    def record(outcome: TestResult) -> None:
        # Append a finished check to the report and update its counters.
        report.tests.append(outcome)
        if outcome.success:
            report.tests_passed += 1
        else:
            report.tests_failed += 1

    def failed(name: str, exc: Exception) -> TestResult:
        # Build the result for a request that never produced a response.
        if verbose:
            print(f" [-] {name}: ERROR - {exc}")
        return TestResult(
            endpoint_id=config["id"],
            test_name=name,
            success=False,
            error_message=str(exc),
        )

    # Test 1: Check endpoint availability
    test_name = "SPARQL endpoint availability"
    try:
        # Monotonic clock: immune to system clock adjustments.
        start = time.perf_counter()
        response = requests.get(config["base_url"], timeout=30)
        elapsed_ms = (time.perf_counter() - start) * 1000
    except requests.RequestException as e:
        result = failed(test_name, e)
    else:
        result = TestResult(
            endpoint_id=config["id"],
            test_name=test_name,
            success=response.status_code == 200,
            status_code=response.status_code,
            response_time_ms=round(elapsed_ms, 2),
            content_type=response.headers.get("Content-Type"),
        )
        if verbose:
            print(f" [{'+' if result.success else '-'}] {test_name}: "
                  f"{result.status_code} ({result.response_time_ms:.0f}ms)")
    record(result)

    # Test 2: Execute sample query
    test_name = "SPARQL query execution"
    try:
        start = time.perf_counter()
        response = requests.post(
            config["base_url"],
            data={"query": SPARQL_QUERY},
            headers={"Accept": "application/sparql-results+json"},
            timeout=60,
        )
        elapsed_ms = (time.perf_counter() - start) * 1000
    except requests.RequestException as e:
        result = failed(test_name, e)
    else:
        result = TestResult(
            endpoint_id=config["id"],
            test_name=test_name,
            success=response.status_code == 200,
            status_code=response.status_code,
            response_time_ms=round(elapsed_ms, 2),
            content_type=response.headers.get("Content-Type"),
        )
        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError:
                # ValueError is the common base of every JSON decode error
                # response.json() can raise (stdlib json or simplejson), so
                # an HTML answer is noted instead of being treated as a
                # request failure.
                result.details["note"] = "Response not JSON, may be HTML form"
            else:
                bindings = data.get("results", {}).get("bindings", [])
                result.details["result_count"] = len(bindings)
                if bindings:
                    result.details["sample_result"] = {
                        k: v.get("value") for k, v in bindings[0].items()
                    }
        if verbose:
            print(f" [{'+' if result.success else '-'}] {test_name}: "
                  f"{result.status_code} ({result.response_time_ms:.0f}ms)")
            if result.details:
                print(f" Details: {result.details}")
    record(result)

    # Determine actual status from the number of checks that were run.
    if report.tests_passed == len(report.tests):
        report.actual_status = "ACTIVE"
    elif report.tests_passed > 0:
        report.actual_status = "DEGRADED"
    else:
        report.actual_status = "OFFLINE"
    return report
def test_ontology(verbose: bool = False) -> EndpointReport:
    """Test NAO ontology endpoint (JSON-LD from GitHub).

    Note: The original archief.nl/def/ontologie/ URL has SSL/403 issues.
    The ontology is now hosted on GitHub as JSON-LD only.

    The check passes when the configured ``base_url`` answers HTTP 200. When
    the body parses as JSON, summary metadata is recorded: for a top-level
    array the entity count and the first five ``@type`` values with their
    frequencies; for a top-level object the size of its ``@graph`` member.

    Args:
        verbose: Print the check outcome (plus details) while running.

    Returns:
        An ``EndpointReport`` whose ``actual_status`` is ``"ACTIVE"`` when the
        check passes and ``"OFFLINE"`` otherwise.
    """
    config = ENDPOINTS["ontology"]
    report = EndpointReport(
        endpoint_id=config["id"],
        endpoint_name=config["name"],
        protocol=config["protocol"],
        expected_status=config["expected_status"],
        actual_status="UNKNOWN",
    )

    # Test 1: Fetch JSON-LD ontology from GitHub
    test_name = "Ontology retrieval (JSON-LD from GitHub)"
    try:
        # Monotonic clock: immune to system clock adjustments.
        start = time.perf_counter()
        response = requests.get(
            config["base_url"],
            timeout=30,
            allow_redirects=True,
        )
        elapsed_ms = (time.perf_counter() - start) * 1000
    except requests.RequestException as e:
        result = TestResult(
            endpoint_id=config["id"],
            test_name=test_name,
            success=False,
            error_message=str(e),
        )
        if verbose:
            print(f" [-] {test_name}: ERROR - {e}")
    else:
        result = TestResult(
            endpoint_id=config["id"],
            test_name=test_name,
            success=response.status_code == 200,
            status_code=response.status_code,
            response_time_ms=round(elapsed_ms, 2),
            content_type=response.headers.get("Content-Type"),
        )
        if response.status_code == 200:
            result.details["response_size_bytes"] = len(response.content)
            result.details["source"] = "GitHub (NationaalArchief/LOD)"
            # Try to parse JSON-LD and extract some metadata
            try:
                data = response.json()
            except ValueError:
                # ValueError is the common base of every JSON decode error
                # response.json() can raise (stdlib json or simplejson).
                result.details["parse_note"] = "Content is not valid JSON"
            else:
                if isinstance(data, list):
                    # JSON-LD is an array of entities
                    result.details["entity_count"] = len(data)
                    # Count how often each @type occurs (first type only
                    # when an entity lists several).
                    types: dict[Any, int] = {}
                    for item in data:
                        if isinstance(item, dict) and "@type" in item:
                            t = item["@type"]
                            if isinstance(t, list):
                                t = t[0] if t else "unknown"
                            types[t] = types.get(t, 0) + 1
                    if types:
                        # Keep only the first five types encountered.
                        result.details["types_found"] = dict(list(types.items())[:5])
                elif isinstance(data, dict):
                    # Standard JSON-LD with @graph
                    graph = data.get("@graph", [])
                    result.details["graph_size"] = len(graph)
        if verbose:
            print(f" [{'+' if result.success else '-'}] {test_name}: "
                  f"{result.status_code} ({result.response_time_ms:.0f}ms)")
            if result.details:
                print(f" Details: {result.details}")

    report.tests.append(result)
    if result.success:
        report.tests_passed += 1
    else:
        report.tests_failed += 1

    # Determine actual status
    if report.tests_passed == 1:
        report.actual_status = "ACTIVE"
    else:
        report.actual_status = "OFFLINE"
    return report
def test_photo_api(verbose: bool = False) -> EndpointReport:
    """Report the Photo API as offline without issuing any request.

    The endpoint has no URL configured, so the single recorded check always
    succeeds: it merely confirms the documented offline state.

    Args:
        verbose: Print a one-line note about the offline state.

    Returns:
        An ``EndpointReport`` with ``actual_status`` ``"OFFLINE"`` and one
        passed check.
    """
    photo_cfg = ENDPOINTS["photo_api"]

    # No URL to test - documented as offline, which is the expected outcome.
    offline_check = TestResult(
        endpoint_id=photo_cfg["id"],
        test_name="Photo API status check",
        success=True,
        details={"status": "OFFLINE", "note": "API under development, no URL available"},
    )

    if verbose:
        print(" [~] Photo API: OFFLINE (expected - under development)")

    # One passed check: the endpoint was correctly identified as offline.
    return EndpointReport(
        endpoint_id=photo_cfg["id"],
        endpoint_name=photo_cfg["name"],
        protocol=photo_cfg["protocol"],
        expected_status=photo_cfg["expected_status"],
        actual_status="OFFLINE",
        tests_passed=1,
        tests=[offline_check],
    )
# ============================================================================
# Report Generation
# ============================================================================
def generate_report(reports: list[EndpointReport], output_dir: Path | None = None) -> dict:
"""Generate comprehensive test report."""
timestamp = datetime.now(timezone.utc).isoformat()
summary = {
"report_timestamp": timestamp,
"custodian_id": "NL-ZH-DHA-A-NA",
"custodian_name": "Nationaal Archief",
"total_endpoints": len(reports),
"endpoints_active": sum(1 for r in reports if r.actual_status == "ACTIVE"),
"endpoints_degraded": sum(1 for r in reports if r.actual_status == "DEGRADED"),
"endpoints_offline": sum(1 for r in reports if r.actual_status == "OFFLINE"),
"total_tests": sum(r.tests_passed + r.tests_failed for r in reports),
"tests_passed": sum(r.tests_passed for r in reports),
"tests_failed": sum(r.tests_failed for r in reports),
}
endpoint_results = []
for report in reports:
endpoint_results.append({
"endpoint_id": report.endpoint_id,
"endpoint_name": report.endpoint_name,
"protocol": report.protocol,
"expected_status": report.expected_status,
"actual_status": report.actual_status,
"status_match": report.expected_status == report.actual_status,
"tests_passed": report.tests_passed,
"tests_failed": report.tests_failed,
"tests": [
{
"name": t.test_name,
"success": t.success,
"status_code": t.status_code,
"response_time_ms": t.response_time_ms,
"content_type": t.content_type,
"error": t.error_message,
"details": t.details,
}
for t in report.tests
],
})
full_report = {
"summary": summary,
"endpoints": endpoint_results,
}
# Save to file if output directory specified
if output_dir:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
report_file = output_dir / f"nationaalarchief_endpoint_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(report_file, "w") as f:
json.dump(full_report, f, indent=2)
print(f"\nReport saved to: {report_file}")
return full_report
def print_summary(reports: list[EndpointReport]) -> None:
"""Print summary table."""
print("\n" + "=" * 70)
print("ENDPOINT TEST SUMMARY - Nationaal Archief (NL-ZH-DHA-A-NA)")
print("=" * 70)
total_passed = 0
total_failed = 0
for report in reports:
status_icon = {
"ACTIVE": "\u2705", # Green checkmark
"DEGRADED": "\u26A0\uFE0F", # Warning
"OFFLINE": "\u274C", # Red X
"UNKNOWN": "\u2753", # Question mark
}.get(report.actual_status, "?")
match_icon = "\u2705" if report.expected_status == report.actual_status else "\u26A0\uFE0F"
print(f"\n{report.endpoint_name}")
print(f" Protocol: {report.protocol}")
print(f" Expected: {report.expected_status} | Actual: {report.actual_status} {status_icon}")
print(f" Tests: {report.tests_passed} passed, {report.tests_failed} failed")
total_passed += report.tests_passed
total_failed += report.tests_failed
print("\n" + "-" * 70)
print(f"TOTAL: {total_passed} tests passed, {total_failed} tests failed")
print("=" * 70)
# ============================================================================
# Main
# ============================================================================
def main():
    """Command-line entry point: run the endpoint tests and report the outcome.

    Parses the options, runs either the single endpoint chosen with
    ``--endpoint`` or all of them, writes the JSON report (to ``--output-dir``
    or ``reports/endpoint_tests``), prints the summary and exits with status
    1 when any test failed, 0 otherwise.
    """
    cli = argparse.ArgumentParser(
        description="Test Nationaal Archief data service endpoints"
    )
    cli.add_argument("-v", "--verbose", action="store_true", help="Show detailed test output")
    cli.add_argument("-o", "--output-dir", type=str, help="Directory to save JSON report")
    cli.add_argument(
        "--endpoint",
        type=str,
        choices=list(ENDPOINTS.keys()),
        help="Test only a specific endpoint",
    )
    options = cli.parse_args()

    print("Nationaal Archief Endpoint Testing")
    print("=" * 40)
    print(f"Started: {datetime.now().isoformat()}")

    # Endpoint key -> function that tests it.
    runners = {
        "oai_pmh": test_oai_pmh,
        "mets_api": test_mets_api,
        "file_api": test_file_api,
        "sparql": test_sparql,
        "ontology": test_ontology,
        "photo_api": test_photo_api,
    }

    # Either the single requested endpoint or every known one, in order.
    selected = [options.endpoint] if options.endpoint else list(runners)

    reports = []
    for key in selected:
        endpoint_cfg = ENDPOINTS[key]
        print(f"\n[{key.upper()}] Testing {endpoint_cfg['name']}...")
        reports.append(runners[key](verbose=options.verbose))

    # Generate and print report
    if options.output_dir:
        destination = Path(options.output_dir)
    else:
        destination = Path("reports/endpoint_tests")
    full_report = generate_report(reports, output_dir=destination)
    print_summary(reports)

    # Exit status reflects whether any test failed.
    any_failed = full_report["summary"]["tests_failed"] > 0
    sys.exit(1 if any_failed else 0)
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()