#!/usr/bin/env python3 """ Mark entries with failed archiving attempts. Records why URLs couldn't be archived (HTTP 403, timeouts, invalid URLs, etc.) so future processing can skip known failures. Usage: python scripts/mark_archive_failures.py """ import sys from datetime import datetime, timezone from pathlib import Path import yaml ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries') # Map of entry filenames to failure reasons (from archiving log) FAILURES = { "0001_Q2679819.yaml": [ ("http://www.hunebedcentrum.eu/", "HTTP 403 - Forbidden"), ("https://twitter.com/hunebedcentrum", "Timeout - Twitter rate limiting"), ], "1044_unknown.yaml": [("https://www.lkca.nl/", "Timeout")], "1503_meertens_instituut_mi.yaml": [("http://www.meertens.knaw.nl/", "Timeout")], "1521_beemster_museum.yaml": [("https://www.historischgenootschapbeemster.nl/afdelingen/agrarisch-museum-westerhem/", "HTTP 404 - Page not found")], "1562_louwman_museum.yaml": [("https://louwmanmuseum.nl/", "Timeout")], "1582_museum_de_waag_deventer.yaml": [("https://museumdewaag.nl/", "Timeout")], "1593_levend_paardenmuseum_amsterdam.yaml": [("https://dehollandschemanege.nl/levend-paardenmuseum/", "HTTP 403 - Forbidden")], "1624_stedelijk_museum_kampen_incl_voormalige_synagoge.yaml": [("https://stedelijkmuseumkampen.nl/", "HTTP 403 - Forbidden")], "1652_rietveld_schr_derhuis.yaml": [("https://www.rietveldschroderhuis.nl/nl", "Timeout")], "1653_speelgoedmuseum_deventer.yaml": [("https://hetspeelgoedmuseum.nl/", "HTTP 403 - Forbidden")], "1673_museum_schoonewelle.yaml": [("http://www.schoonewelle.nl/", "SSL certificate expired")], "1679_aflegvereniging_rachel.yaml": [("http://www.aflegverenigingrachel@gmail.com", "Invalid URL - email address, not website")], "1685_ben_morshuis_stichting.yaml": [("http://www.bmsootmarsum.nl", "DNS resolution failed - domain not found")], "1694_cvah.yaml": [("http://www.cvah.nl", "HTTP 403 - Forbidden")], "1709_federatie_surinaamse_aflegverenigingen_nederland.yaml": [("http://www.federatieafleggers.nl", "SSL certificate error")], "1727_het_loterijfonds_ravenstein.yaml": [("http://www.loterijfondsravenstein.info", "Timeout")], "1736_knov_en_nbvk.yaml": [("http://www.dethuisbevalcultuur.nl", "DNS resolution failed - domain not found")], "1738_kvva.yaml": [("http://marcdenelzen.nl/assets/ontwijken-of-samenleven-(column-74).pdf", "DNS resolution failed - domain not found")], "1753_nederlands_steendrukmuseum.yaml": [("http://www.steendrukmuseum.nl", "Timeout")], "1762_ninsee.yaml": [("http://www.ninsee.nl", "HTTP 403 - Forbidden")], "1770_piratencultuur.yaml": [("http://www.piratencultuur.nl", "Timeout")], "1774_pottenbakkerij_hoogland.yaml": [("http://www.niekhoogland.nl", "HTTP 500 - Server error")], "1778_schola_cantorum_karolus_magnus.yaml": [("http://www.karolus-magnus.nl", "Timeout")], "1780_schuttersbroederschap_st_sebastianus_1617_kerkrade.yaml": [("http://Schuttersbroederschap%20St.%20Sebastianus", "Invalid URL - not a valid URL format")], "1783_sintrale_kommisje_skûtsjesilen.yaml": [("https://www.skutsjesilen.nl/", "Timeout")], "1808_stichting_het_lekkerste_brabantse_worstenbroodje.yaml": [("http://www.hetlekkerstebrabantseworstenbroodje.nl", "HTTP 403 - Forbidden")], "1853_vreugdevuur_scheveningen_noorderstrand.yaml": [("http://www.vreugdevuur-scheveningen.nl", "HTTP 427 - Client error")], "1859_oliemolensnl.yaml": [("https://oliemolens.nl", "SSL certificate error")], } def main(): timestamp = datetime.now(timezone.utc).isoformat() updated = 0 for filename, failures in FAILURES.items(): filepath = ENTRIES_DIR / filename if not filepath.exists(): print(f" ✗ Not found: {filename}") continue with open(filepath, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) if not data: continue # Add archive failure metadata if 'web_enrichment' not in data: data['web_enrichment'] = {} if 'archive_failures' not in data['web_enrichment']: data['web_enrichment']['archive_failures'] = [] for url, reason in failures: data['web_enrichment']['archive_failures'].append({ 'url': url, 'failure_reason': reason, 'attempt_timestamp': timestamp, 'archivable': False }) data['web_enrichment']['archive_failure_timestamp'] = timestamp with open(filepath, 'w', encoding='utf-8') as f: yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) print(f" ✓ Marked failures: {filename}") updated += 1 print() print(f"Updated {updated} entries with archive failure metadata") if __name__ == '__main__': main()