glam/scripts/mark_archive_failures.py
2025-12-05 16:25:39 +01:00

99 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""
Mark entries with failed archiving attempts.
Records why URLs couldn't be archived (HTTP 403, timeouts, invalid URLs, etc.)
so future processing can skip known failures.
Usage:
python scripts/mark_archive_failures.py
"""
import sys
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Directory containing the per-entry YAML files that main() reads and rewrites.
# NOTE(review): this is an absolute path into one developer's home directory;
# on any other machine every entry will be reported as "Not found".
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
# Map of entry filenames to failure reasons (from archiving log).
# Each key is a file name relative to ENTRIES_DIR; each value is a list of
# (url, failure_reason) pairs. main() copies both strings verbatim into the
# entry's web_enrichment.archive_failures list.
FAILURES = {
"0001_Q2679819.yaml": [
("http://www.hunebedcentrum.eu/", "HTTP 403 - Forbidden"),
("https://twitter.com/hunebedcentrum", "Timeout - Twitter rate limiting"),
],
"1044_unknown.yaml": [("https://www.lkca.nl/", "Timeout")],
"1503_meertens_instituut_mi.yaml": [("http://www.meertens.knaw.nl/", "Timeout")],
"1521_beemster_museum.yaml": [("https://www.historischgenootschapbeemster.nl/afdelingen/agrarisch-museum-westerhem/", "HTTP 404 - Page not found")],
"1562_louwman_museum.yaml": [("https://louwmanmuseum.nl/", "Timeout")],
"1582_museum_de_waag_deventer.yaml": [("https://museumdewaag.nl/", "Timeout")],
"1593_levend_paardenmuseum_amsterdam.yaml": [("https://dehollandschemanege.nl/levend-paardenmuseum/", "HTTP 403 - Forbidden")],
"1624_stedelijk_museum_kampen_incl_voormalige_synagoge.yaml": [("https://stedelijkmuseumkampen.nl/", "HTTP 403 - Forbidden")],
"1652_rietveld_schr_derhuis.yaml": [("https://www.rietveldschroderhuis.nl/nl", "Timeout")],
"1653_speelgoedmuseum_deventer.yaml": [("https://hetspeelgoedmuseum.nl/", "HTTP 403 - Forbidden")],
"1673_museum_schoonewelle.yaml": [("http://www.schoonewelle.nl/", "SSL certificate expired")],
# The next entry's "URL" is an e-mail address with an http:// prefix, as its reason string states.
"1679_aflegvereniging_rachel.yaml": [("http://www.aflegverenigingrachel@gmail.com", "Invalid URL - email address, not website")],
"1685_ben_morshuis_stichting.yaml": [("http://www.bmsootmarsum.nl", "DNS resolution failed - domain not found")],
"1694_cvah.yaml": [("http://www.cvah.nl", "HTTP 403 - Forbidden")],
"1709_federatie_surinaamse_aflegverenigingen_nederland.yaml": [("http://www.federatieafleggers.nl", "SSL certificate error")],
"1727_het_loterijfonds_ravenstein.yaml": [("http://www.loterijfondsravenstein.info", "Timeout")],
"1736_knov_en_nbvk.yaml": [("http://www.dethuisbevalcultuur.nl", "DNS resolution failed - domain not found")],
"1738_kvva.yaml": [("http://marcdenelzen.nl/assets/ontwijken-of-samenleven-(column-74).pdf", "DNS resolution failed - domain not found")],
"1753_nederlands_steendrukmuseum.yaml": [("http://www.steendrukmuseum.nl", "Timeout")],
"1762_ninsee.yaml": [("http://www.ninsee.nl", "HTTP 403 - Forbidden")],
"1770_piratencultuur.yaml": [("http://www.piratencultuur.nl", "Timeout")],
"1774_pottenbakkerij_hoogland.yaml": [("http://www.niekhoogland.nl", "HTTP 500 - Server error")],
"1778_schola_cantorum_karolus_magnus.yaml": [("http://www.karolus-magnus.nl", "Timeout")],
"1780_schuttersbroederschap_st_sebastianus_1617_kerkrade.yaml": [("http://Schuttersbroederschap%20St.%20Sebastianus", "Invalid URL - not a valid URL format")],
"1783_sintrale_kommisje_skûtsjesilen.yaml": [("https://www.skutsjesilen.nl/", "Timeout")],
"1808_stichting_het_lekkerste_brabantse_worstenbroodje.yaml": [("http://www.hetlekkerstebrabantseworstenbroodje.nl", "HTTP 403 - Forbidden")],
# NOTE(review): 427 is not an assigned HTTP status code (possibly a typo for
# 429 "Too Many Requests") - confirm against the archiving log.
"1853_vreugdevuur_scheveningen_noorderstrand.yaml": [("http://www.vreugdevuur-scheveningen.nl", "HTTP 427 - Client error")],
"1859_oliemolensnl.yaml": [("https://oliemolens.nl", "SSL certificate error")],
}
def _record_failures(data, failures, timestamp):
    """Merge *failures* into ``data['web_enrichment']['archive_failures']``.

    Args:
        data: The entry mapping loaded from YAML; modified in place.
        failures: Iterable of ``(url, failure_reason)`` pairs.
        timestamp: ISO-8601 string stored as ``attempt_timestamp`` on every
            new record and as ``archive_failure_timestamp`` on the section.

    Returns:
        The number of records actually added. Pairs that are already present
        with the same url and failure_reason are left untouched, so running
        the script twice does not duplicate records.
    """
    # A key that is present but empty in YAML (``web_enrichment:``) loads as
    # None, so treat None the same as a missing key.
    enrichment = data.get('web_enrichment')
    if enrichment is None:
        enrichment = data['web_enrichment'] = {}

    records = enrichment.get('archive_failures')
    if records is None:
        records = enrichment['archive_failures'] = []

    known = {
        (record.get('url'), record.get('failure_reason'))
        for record in records
        if isinstance(record, dict)
    }

    added = 0
    for url, reason in failures:
        if (url, reason) in known:
            continue
        records.append({
            'url': url,
            'failure_reason': reason,
            'attempt_timestamp': timestamp,
            'archivable': False
        })
        known.add((url, reason))
        added += 1

    # Only move the section timestamp when something new was recorded.
    if added:
        enrichment['archive_failure_timestamp'] = timestamp
    return added


def main():
    """Record the known archive failures in each affected entry file.

    For every filename in FAILURES, loads the YAML file from ENTRIES_DIR,
    adds any failure records that are not already present, and writes the
    file back. Missing files, empty or non-mapping documents, and entries
    that already carry all their failure records are reported and skipped.
    Prints one status line per entry and a final count of rewritten files.

    Note: files are round-tripped through yaml.safe_load / yaml.dump, so
    YAML comments and original formatting are presumably not preserved.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    updated = 0

    for filename, failures in FAILURES.items():
        filepath = ENTRIES_DIR / filename
        if not filepath.exists():
            print(f" ✗ Not found: {filename}")
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Empty files load as None; anything that is not a mapping cannot
        # hold a 'web_enrichment' section.
        if not data or not isinstance(data, dict):
            print(f" ✗ Skipped (empty or not a mapping): {filename}")
            continue

        # Add archive failure metadata
        if not _record_failures(data, failures, timestamp):
            print(f" - Already marked: {filename}")
            continue

        # Serialize before opening the file for writing so that a dump error
        # cannot leave a truncated entry behind.
        text = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
        filepath.write_text(text, encoding='utf-8')

        print(f" ✓ Marked failures: {filename}")
        updated += 1

    print()
    print(f"Updated {updated} entries with archive failure metadata")
# Run the marking pass only when executed as a script, not when imported.
if __name__ == "__main__":
    main()