#!/usr/bin/env python3
"""
Nationaal Archief Data Service Endpoint Testing Script

Tests all documented API endpoints for the Nationaal Archief (NL-ZH-DHA-A-NA):
1. OAI-PMH endpoint (EAD/XML harvesting)
2. METS API (metadata files)
3. File API (scan downloads)
4. SPARQL webservice (photo collection LOD)
5. Ontology endpoint (NAO)
6. Photo API (currently offline)

Usage:
    python scripts/test_nationaalarchief_endpoints.py [--verbose] [--output-dir DIR] [--endpoint NAME]

The process exits with status 1 when at least one test failed, 0 otherwise.

References:
- Data file: data/custodian/NL-ZH-DHA-A-NA.yaml (lines 573-750)
- Documentation: https://www.nationaalarchief.nl/onderzoeken/open-data
"""

import argparse
import json
import sys
import time
from collections import Counter
from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import datetime, timezone
from functools import partial
from itertools import islice
from pathlib import Path
from typing import Any
from xml.etree import ElementTree as ET

import requests

# ============================================================================
# Configuration
# ============================================================================

ENDPOINTS = {
    "oai_pmh": {
        "id": "nationaalarchief-oai-pmh-ead",
        "name": "OAI-PMH Endpoint (EAD/XML)",
        "base_url": "https://service.archief.nl/gaf/oai/!open_oai.OAIHandler",
        "protocol": "OAI-PMH 2.0",
        "expected_status": "ACTIVE",
    },
    "mets_api": {
        "id": "nationaalarchief-mets-api",
        "name": "METS API",
        "base_url": "https://service.archief.nl/gaf/api/mets/v1",
        "protocol": "REST",
        "expected_status": "ACTIVE",
        "sample_uuid": "48f1f22f-1228-4b00-9720-5816a07b4003",
    },
    "file_api": {
        "id": "nationaalarchief-file-api",
        "name": "File Download API",
        "base_url": "https://service.archief.nl/gaf/api/file/v1",
        "protocol": "REST",
        "expected_status": "ACTIVE",
        "sample_uuids": {
            "default": "835776c2-fb57-47eb-b537-b82758b6558a",
            "thumb": "834a6c29-61f1-4926-94bd-674132d25fd5",
        },
    },
    "sparql": {
        "id": "nationaalarchief-sparql-webservice",
        "name": "SPARQL Webservice",
        "base_url": "https://www.nationaalarchief.nl/onderzoeken/sparql",
        "protocol": "SPARQL",
        "expected_status": "ACTIVE",
    },
    "ontology": {
        "id": "nationaalarchief-ontology",
        "name": "Nationaal Archief Ontologie (NAO)",
        "base_url": "https://raw.githubusercontent.com/NationaalArchief/LOD/master/archief.nl-def-ontologie.json",
        "deprecated_url": "https://archief.nl/def/ontologie/",  # SSL/403 issues
        "protocol": "HTTP",
        "expected_status": "ACTIVE",
    },
    "photo_api": {
        "id": "nationaalarchief-photo-api",
        "name": "Photo API",
        "base_url": None,  # Currently offline
        "protocol": "REST",
        "expected_status": "OFFLINE",
    },
}

# OAI-PMH verbs to test, each paired with its extra query parameters
OAI_PMH_VERBS = [
    ("Identify", {}),
    ("ListMetadataFormats", {}),
    ("ListSets", {}),
    ("ListIdentifiers", {"metadataPrefix": "oai_ead", "set": "2.21.205.69"}),
    ("GetRecord", {"metadataPrefix": "oai_ead", "identifier": "2.21.205.69"}),
]

# Sample SPARQL query for photo collection.
# Every PREFIX declaration needs its namespace IRI in angle brackets; without
# them the query is not valid SPARQL and the endpoint cannot execute it.
SPARQL_QUERY = """
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX edm: <http://www.europeana.eu/schemas/edm/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?photo ?title ?creator ?date
WHERE {
  ?photo a edm:ProvidedCHO .
  OPTIONAL { ?photo dc:title ?title }
  OPTIONAL { ?photo dc:creator ?creator }
  OPTIONAL { ?photo dc:date ?date }
}
LIMIT 5
"""

# XML namespace maps used when searching parsed responses
_OAI_NS = {"oai": "http://www.openarchives.org/OAI/2.0/"}
_METS_NS = {"mets": "http://www.loc.gov/METS/"}

# Icons shown next to an endpoint's actual status in the console summary
_STATUS_ICONS = {
    "ACTIVE": "\u2705",  # Green checkmark
    "DEGRADED": "\u26A0\uFE0F",  # Warning
    "OFFLINE": "\u274C",  # Red X
    "UNKNOWN": "\u2753",  # Question mark
}


# ============================================================================
# Data Classes
# ============================================================================

@dataclass
class TestResult:
    """Result of a single endpoint test."""

    endpoint_id: str
    test_name: str
    success: bool
    status_code: int | None = None
    response_time_ms: float | None = None
    content_type: str | None = None
    error_message: str | None = None
    details: dict[str, Any] = field(default_factory=dict)


@dataclass
class EndpointReport:
    """Aggregated report for an endpoint."""

    endpoint_id: str
    endpoint_name: str
    protocol: str
    expected_status: str
    actual_status: str
    tests_passed: int = 0
    tests_failed: int = 0
    tests: list[TestResult] = field(default_factory=list)


# ============================================================================
# Shared test helpers
# ============================================================================

def _new_report(config: dict[str, Any], actual_status: str = "UNKNOWN") -> EndpointReport:
    """Create an empty report for the endpoint described by *config*."""
    return EndpointReport(
        endpoint_id=config["id"],
        endpoint_name=config["name"],
        protocol=config["protocol"],
        expected_status=config["expected_status"],
        actual_status=actual_status,
    )


def _tally(report: EndpointReport, result: TestResult) -> None:
    """Append *result* to *report* and bump the matching pass/fail counter."""
    report.tests.append(result)
    if result.success:
        report.tests_passed += 1
    else:
        report.tests_failed += 1


def _overall_status(passed: int, expected: int) -> str:
    """Map a pass count onto ACTIVE (all passed), DEGRADED (some) or OFFLINE (none)."""
    if passed == expected:
        return "ACTIVE"
    if passed > 0:
        return "DEGRADED"
    return "OFFLINE"


def _run_http_test(
    config: dict[str, Any],
    test_name: str,
    method: str,
    url: str,
    *,
    verbose: bool,
    timeout: float = 30,
    inspect: Callable[[requests.Response, TestResult], None] | None = None,
    **request_kwargs: Any,
) -> TestResult:
    """Issue one HTTP request and turn the outcome into a TestResult.

    Args:
        config: Endpoint entry from ENDPOINTS; only its "id" is read here.
        test_name: Human-readable label stored on the result and printed.
        method: HTTP method name passed to ``requests.request``.
        url: Target URL.
        verbose: When true, print a one-line outcome plus details/error.
        timeout: Request timeout in seconds.
        inspect: Optional callback invoked as ``inspect(response, result)``
            only for HTTP 200 responses; it may add details or mark the
            result as failed.
        **request_kwargs: Forwarded unchanged to ``requests.request``.

    Returns:
        A TestResult whose ``success`` is initially "status code == 200".
        Transport errors (``requests.RequestException``) yield a failed
        result carrying the exception text instead of propagating.
    """
    try:
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments during the request.
        started = time.perf_counter()
        response = requests.request(method, url, timeout=timeout, **request_kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000
    except requests.RequestException as exc:
        if verbose:
            print(f" [-] {test_name}: ERROR - {exc}")
        return TestResult(
            endpoint_id=config["id"],
            test_name=test_name,
            success=False,
            error_message=str(exc),
        )

    result = TestResult(
        endpoint_id=config["id"],
        test_name=test_name,
        success=response.status_code == 200,
        status_code=response.status_code,
        response_time_ms=round(elapsed_ms, 2),
        content_type=response.headers.get("Content-Type"),
    )

    if inspect is not None and response.status_code == 200:
        inspect(response, result)

    if verbose:
        marker = "+" if result.success else "-"
        print(f" [{marker}] {test_name}: "
              f"{result.status_code} ({result.response_time_ms:.0f}ms)")
        if result.details:
            print(f" Details: {result.details}")
        if result.error_message:
            print(f" Error: {result.error_message}")

    return result


def _inspect_oai_response(verb: str, response: requests.Response, result: TestResult) -> None:
    """Parse an OAI-PMH XML body; fail on parse errors or an <error> element."""
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        result.success = False
        result.error_message = f"XML parse error: {exc}"
        return

    # OAI-PMH reports protocol-level errors inside an HTTP 200 body.
    error = root.find(".//oai:error", _OAI_NS)
    if error is not None:
        result.success = False
        result.error_message = f"OAI error: {error.get('code')} - {error.text}"
        return

    # Extract some details based on verb
    if verb == "Identify":
        repo_name = root.find(".//oai:repositoryName", _OAI_NS)
        if repo_name is not None:
            result.details["repository_name"] = repo_name.text
    elif verb == "ListSets":
        result.details["set_count"] = len(root.findall(".//oai:set", _OAI_NS))
    elif verb == "GetRecord":
        result.details["record_found"] = root.find(".//oai:record", _OAI_NS) is not None


def _inspect_mets_response(uuid: str, response: requests.Response, result: TestResult) -> None:
    """Parse a METS XML body and record its file count and first file IDs."""
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        result.success = False
        result.error_message = f"XML parse error: {exc}"
        return

    files = root.findall(".//mets:file", _METS_NS)
    result.details["file_count"] = len(files)
    result.details["mets_uuid"] = uuid

    # Keep the first few file IDs for reference
    sample_ids = [entry.get("ID", "") for entry in files[:3]]
    if sample_ids:
        result.details["sample_file_ids"] = sample_ids


def _inspect_file_response(
    file_type: str, uuid: str, response: requests.Response, result: TestResult
) -> None:
    """Record size and identity details from a file API HEAD response."""
    content_length = response.headers.get("Content-Length")
    if content_length:
        try:
            result.details["content_length_bytes"] = int(content_length)
        except ValueError:
            # A malformed header must not abort the whole run; keep the raw
            # value so the anomaly is still visible in the report.
            result.details["content_length_raw"] = content_length
    result.details["file_type"] = file_type
    result.details["uuid"] = uuid


def _inspect_sparql_response(response: requests.Response, result: TestResult) -> None:
    """Record the binding count and first row of a SPARQL JSON result.

    A non-JSON body only adds a note; the result's success flag is left as
    determined by the HTTP status.
    """
    try:
        data = response.json()
    except ValueError:
        # ValueError is the common base of every JSON decode error that
        # requests can raise (stdlib json, simplejson, requests' own wrapper).
        result.details["note"] = "Response not JSON, may be HTML form"
        return

    bindings = data.get("results", {}).get("bindings", [])
    result.details["result_count"] = len(bindings)
    if bindings:
        result.details["sample_result"] = {
            key: value.get("value") for key, value in bindings[0].items()
        }


def _inspect_ontology_response(response: requests.Response, result: TestResult) -> None:
    """Record size and basic structure of the ontology JSON-LD document."""
    result.details["response_size_bytes"] = len(response.content)
    result.details["source"] = "GitHub (NationaalArchief/LOD)"

    # Try to parse JSON-LD and extract some metadata
    try:
        data = response.json()
    except ValueError:
        result.details["parse_note"] = "Content is not valid JSON"
        return

    if isinstance(data, list):
        # JSON-LD is an array of entities
        result.details["entity_count"] = len(data)

        # Count different types; a list-valued @type is represented by its
        # first entry, or "unknown" when the list is empty.
        type_counts: Counter = Counter()
        for item in data:
            if isinstance(item, dict) and "@type" in item:
                type_name = item["@type"]
                if isinstance(type_name, list):
                    type_name = type_name[0] if type_name else "unknown"
                type_counts[type_name] += 1
        if type_counts:
            # First five types in order of first appearance
            result.details["types_found"] = dict(islice(type_counts.items(), 5))
    elif isinstance(data, dict):
        # Standard JSON-LD with @graph
        result.details["graph_size"] = len(data.get("@graph", []))


# ============================================================================
# Test Functions
# ============================================================================

def test_oai_pmh(verbose: bool = False) -> EndpointReport:
    """Test OAI-PMH endpoint with all verbs.

    Sends one GET per entry in OAI_PMH_VERBS. The endpoint is ACTIVE when
    every verb succeeds, DEGRADED when only some do, OFFLINE when none do.
    """
    config = ENDPOINTS["oai_pmh"]
    report = _new_report(config)

    for verb, params in OAI_PMH_VERBS:
        result = _run_http_test(
            config,
            f"OAI-PMH {verb}",
            "GET",
            config["base_url"],
            verbose=verbose,
            inspect=partial(_inspect_oai_response, verb),
            params={"verb": verb, **params},
        )
        _tally(report, result)

    report.actual_status = _overall_status(report.tests_passed, len(OAI_PMH_VERBS))
    return report


def test_mets_api(verbose: bool = False) -> EndpointReport:
    """Test METS API endpoint by fetching the configured sample METS file."""
    config = ENDPOINTS["mets_api"]
    report = _new_report(config)

    uuid = config["sample_uuid"]
    result = _run_http_test(
        config,
        "METS file retrieval",
        "GET",
        f"{config['base_url']}/{uuid}",
        verbose=verbose,
        inspect=partial(_inspect_mets_response, uuid),
    )
    _tally(report, result)

    # Single test: either ACTIVE or OFFLINE
    report.actual_status = _overall_status(report.tests_passed, 1)
    return report


def test_file_api(verbose: bool = False) -> EndpointReport:
    """Test File Download API endpoint for each configured sample file."""
    config = ENDPOINTS["file_api"]
    report = _new_report(config)

    for file_type, uuid in config["sample_uuids"].items():
        # HEAD avoids downloading large files. requests.head does not follow
        # redirects by default, so that is requested explicitly.
        result = _run_http_test(
            config,
            f"File download ({file_type})",
            "HEAD",
            f"{config['base_url']}/{file_type}/{uuid}",
            verbose=verbose,
            inspect=partial(_inspect_file_response, file_type, uuid),
            allow_redirects=True,
        )
        _tally(report, result)

    report.actual_status = _overall_status(
        report.tests_passed, len(config["sample_uuids"])
    )
    return report


def test_sparql(verbose: bool = False) -> EndpointReport:
    """Test SPARQL webservice endpoint.

    Runs two tests: a plain GET for availability, then a POST of
    SPARQL_QUERY asking for JSON results.
    """
    config = ENDPOINTS["sparql"]
    report = _new_report(config)

    # Test 1: Check endpoint availability
    availability = _run_http_test(
        config,
        "SPARQL endpoint availability",
        "GET",
        config["base_url"],
        verbose=verbose,
    )
    _tally(report, availability)

    # Test 2: Execute sample query
    execution = _run_http_test(
        config,
        "SPARQL query execution",
        "POST",
        config["base_url"],
        verbose=verbose,
        timeout=60,
        inspect=_inspect_sparql_response,
        data={"query": SPARQL_QUERY},
        headers={"Accept": "application/sparql-results+json"},
    )
    _tally(report, execution)

    report.actual_status = _overall_status(report.tests_passed, 2)
    return report


def test_ontology(verbose: bool = False) -> EndpointReport:
    """Test NAO ontology endpoint (JSON-LD from GitHub).

    Note: The original archief.nl/def/ontologie/ URL has SSL/403 issues.
    The ontology is now hosted on GitHub as JSON-LD only.
    """
    config = ENDPOINTS["ontology"]
    report = _new_report(config)

    # Test 1: Fetch JSON-LD ontology from GitHub
    result = _run_http_test(
        config,
        "Ontology retrieval (JSON-LD from GitHub)",
        "GET",
        config["base_url"],
        verbose=verbose,
        inspect=_inspect_ontology_response,
        allow_redirects=True,
    )
    _tally(report, result)

    # Single test: either ACTIVE or OFFLINE
    report.actual_status = _overall_status(report.tests_passed, 1)
    return report


def test_photo_api(verbose: bool = False) -> EndpointReport:
    """Test Photo API (expected to be offline).

    No request is made because no URL is configured; the endpoint is
    reported as OFFLINE and the check counts as passed.
    """
    config = ENDPOINTS["photo_api"]
    report = _new_report(config, actual_status="OFFLINE")

    # No URL to test - documented as offline
    result = TestResult(
        endpoint_id=config["id"],
        test_name="Photo API status check",
        success=True,  # Expected to be offline
        details={"status": "OFFLINE", "note": "API under development, no URL available"},
    )

    if verbose:
        print(" [~] Photo API: OFFLINE (expected - under development)")

    report.tests.append(result)
    report.tests_passed += 1  # Correctly identified as offline
    return report


# ============================================================================
# Report Generation
# ============================================================================

def generate_report(reports: list[EndpointReport], output_dir: Path | None = None) -> dict:
    """Generate comprehensive test report.

    Args:
        reports: Per-endpoint reports to aggregate.
        output_dir: When given, the report is also written there as a
            timestamped JSON file (the directory is created if needed).

    Returns:
        A dict with a "summary" section and an "endpoints" list.
    """
    timestamp = datetime.now(timezone.utc).isoformat()

    summary = {
        "report_timestamp": timestamp,
        "custodian_id": "NL-ZH-DHA-A-NA",
        "custodian_name": "Nationaal Archief",
        "total_endpoints": len(reports),
        "endpoints_active": sum(1 for r in reports if r.actual_status == "ACTIVE"),
        "endpoints_degraded": sum(1 for r in reports if r.actual_status == "DEGRADED"),
        "endpoints_offline": sum(1 for r in reports if r.actual_status == "OFFLINE"),
        "total_tests": sum(r.tests_passed + r.tests_failed for r in reports),
        "tests_passed": sum(r.tests_passed for r in reports),
        "tests_failed": sum(r.tests_failed for r in reports),
    }

    endpoint_results = [
        {
            "endpoint_id": report.endpoint_id,
            "endpoint_name": report.endpoint_name,
            "protocol": report.protocol,
            "expected_status": report.expected_status,
            "actual_status": report.actual_status,
            "status_match": report.expected_status == report.actual_status,
            "tests_passed": report.tests_passed,
            "tests_failed": report.tests_failed,
            "tests": [
                {
                    "name": t.test_name,
                    "success": t.success,
                    "status_code": t.status_code,
                    "response_time_ms": t.response_time_ms,
                    "content_type": t.content_type,
                    "error": t.error_message,
                    "details": t.details,
                }
                for t in report.tests
            ],
        }
        for report in reports
    ]

    full_report = {
        "summary": summary,
        "endpoints": endpoint_results,
    }

    # Save to file if output directory specified
    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        report_file = output_dir / f"nationaalarchief_endpoint_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        # Explicit encoding keeps the output independent of the platform's
        # default locale encoding.
        with open(report_file, "w", encoding="utf-8") as f:
            json.dump(full_report, f, indent=2)

        print(f"\nReport saved to: {report_file}")

    return full_report


def print_summary(reports: list[EndpointReport]) -> None:
    """Print summary table."""
    print("\n" + "=" * 70)
    print("ENDPOINT TEST SUMMARY - Nationaal Archief (NL-ZH-DHA-A-NA)")
    print("=" * 70)

    total_passed = 0
    total_failed = 0

    for report in reports:
        status_icon = _STATUS_ICONS.get(report.actual_status, "?")

        print(f"\n{report.endpoint_name}")
        print(f" Protocol: {report.protocol}")
        print(f" Expected: {report.expected_status} | Actual: {report.actual_status} {status_icon}")
        print(f" Tests: {report.tests_passed} passed, {report.tests_failed} failed")

        total_passed += report.tests_passed
        total_failed += report.tests_failed

    print("\n" + "-" * 70)
    print(f"TOTAL: {total_passed} tests passed, {total_failed} tests failed")
    print("=" * 70)


# ============================================================================
# Main
# ============================================================================

def main():
    """Parse CLI arguments, run the selected endpoint tests and report.

    Exits with status 1 when any test failed, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description="Test Nationaal Archief data service endpoints"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Show detailed test output"
    )
    parser.add_argument(
        "-o", "--output-dir",
        type=str,
        help="Directory to save JSON report"
    )
    parser.add_argument(
        "--endpoint",
        type=str,
        choices=list(ENDPOINTS.keys()),
        help="Test only a specific endpoint"
    )
    args = parser.parse_args()

    print("Nationaal Archief Endpoint Testing")
    print("=" * 40)
    print(f"Started: {datetime.now().isoformat()}")

    reports = []

    # Define test functions
    test_functions = {
        "oai_pmh": test_oai_pmh,
        "mets_api": test_mets_api,
        "file_api": test_file_api,
        "sparql": test_sparql,
        "ontology": test_ontology,
        "photo_api": test_photo_api,
    }

    # Run tests
    if args.endpoint:
        endpoints_to_test = [args.endpoint]
    else:
        endpoints_to_test = list(test_functions)

    for endpoint_key in endpoints_to_test:
        config = ENDPOINTS[endpoint_key]
        print(f"\n[{endpoint_key.upper()}] Testing {config['name']}...")

        test_func = test_functions[endpoint_key]
        reports.append(test_func(verbose=args.verbose))

    # Generate and print report; without --output-dir the report goes to the
    # default directory below.
    output_path = Path(args.output_dir) if args.output_dir else Path("reports/endpoint_tests")
    full_report = generate_report(reports, output_dir=output_path)
    print_summary(reports)

    # Return exit code based on test results
    if full_report["summary"]["tests_failed"] > 0:
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()