99 lines
4.9 KiB
Python
99 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
Mark entries with failed archiving attempts.

Records why URLs couldn't be archived (HTTP 403, timeouts, invalid URLs, etc.)
so future processing can skip known failures.

Usage:
    python scripts/mark_archive_failures.py
"""
|
|
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
import yaml
|
|
|
|
# Directory holding the per-entry YAML files that this script edits in place.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')

# Map of entry filenames to failure reasons (from archiving log).
# Each value is a list of (url, failure_reason) pairs; one entry file may have
# several URLs that could not be archived.
FAILURES = {
    "0001_Q2679819.yaml": [
        ("http://www.hunebedcentrum.eu/", "HTTP 403 - Forbidden"),
        ("https://twitter.com/hunebedcentrum", "Timeout - Twitter rate limiting"),
    ],
    "1044_unknown.yaml": [
        ("https://www.lkca.nl/", "Timeout"),
    ],
    "1503_meertens_instituut_mi.yaml": [
        ("http://www.meertens.knaw.nl/", "Timeout"),
    ],
    "1521_beemster_museum.yaml": [
        (
            "https://www.historischgenootschapbeemster.nl/afdelingen/agrarisch-museum-westerhem/",
            "HTTP 404 - Page not found",
        ),
    ],
    "1562_louwman_museum.yaml": [
        ("https://louwmanmuseum.nl/", "Timeout"),
    ],
    "1582_museum_de_waag_deventer.yaml": [
        ("https://museumdewaag.nl/", "Timeout"),
    ],
    "1593_levend_paardenmuseum_amsterdam.yaml": [
        ("https://dehollandschemanege.nl/levend-paardenmuseum/", "HTTP 403 - Forbidden"),
    ],
    "1624_stedelijk_museum_kampen_incl_voormalige_synagoge.yaml": [
        ("https://stedelijkmuseumkampen.nl/", "HTTP 403 - Forbidden"),
    ],
    "1652_rietveld_schr_derhuis.yaml": [
        ("https://www.rietveldschroderhuis.nl/nl", "Timeout"),
    ],
    "1653_speelgoedmuseum_deventer.yaml": [
        ("https://hetspeelgoedmuseum.nl/", "HTTP 403 - Forbidden"),
    ],
    "1673_museum_schoonewelle.yaml": [
        ("http://www.schoonewelle.nl/", "SSL certificate expired"),
    ],
    "1679_aflegvereniging_rachel.yaml": [
        (
            "http://www.aflegverenigingrachel@gmail.com",
            "Invalid URL - email address, not website",
        ),
    ],
    "1685_ben_morshuis_stichting.yaml": [
        ("http://www.bmsootmarsum.nl", "DNS resolution failed - domain not found"),
    ],
    "1694_cvah.yaml": [
        ("http://www.cvah.nl", "HTTP 403 - Forbidden"),
    ],
    "1709_federatie_surinaamse_aflegverenigingen_nederland.yaml": [
        ("http://www.federatieafleggers.nl", "SSL certificate error"),
    ],
    "1727_het_loterijfonds_ravenstein.yaml": [
        ("http://www.loterijfondsravenstein.info", "Timeout"),
    ],
    "1736_knov_en_nbvk.yaml": [
        ("http://www.dethuisbevalcultuur.nl", "DNS resolution failed - domain not found"),
    ],
    "1738_kvva.yaml": [
        (
            "http://marcdenelzen.nl/assets/ontwijken-of-samenleven-(column-74).pdf",
            "DNS resolution failed - domain not found",
        ),
    ],
    "1753_nederlands_steendrukmuseum.yaml": [
        ("http://www.steendrukmuseum.nl", "Timeout"),
    ],
    "1762_ninsee.yaml": [
        ("http://www.ninsee.nl", "HTTP 403 - Forbidden"),
    ],
    "1770_piratencultuur.yaml": [
        ("http://www.piratencultuur.nl", "Timeout"),
    ],
    "1774_pottenbakkerij_hoogland.yaml": [
        ("http://www.niekhoogland.nl", "HTTP 500 - Server error"),
    ],
    "1778_schola_cantorum_karolus_magnus.yaml": [
        ("http://www.karolus-magnus.nl", "Timeout"),
    ],
    "1780_schuttersbroederschap_st_sebastianus_1617_kerkrade.yaml": [
        (
            "http://Schuttersbroederschap%20St.%20Sebastianus",
            "Invalid URL - not a valid URL format",
        ),
    ],
    "1783_sintrale_kommisje_skûtsjesilen.yaml": [
        ("https://www.skutsjesilen.nl/", "Timeout"),
    ],
    "1808_stichting_het_lekkerste_brabantse_worstenbroodje.yaml": [
        ("http://www.hetlekkerstebrabantseworstenbroodje.nl", "HTTP 403 - Forbidden"),
    ],
    "1853_vreugdevuur_scheveningen_noorderstrand.yaml": [
        ("http://www.vreugdevuur-scheveningen.nl", "HTTP 427 - Client error"),
    ],
    "1859_oliemolensnl.yaml": [
        ("https://oliemolens.nl", "SSL certificate error"),
    ],
}
|
|
|
|
|
|
def _record_failures(data, failures, timestamp):
    """Append not-yet-recorded failures to ``data['web_enrichment']``.

    Args:
        data: Parsed entry mapping; modified in place.
        failures: Iterable of ``(url, reason)`` pairs for this entry.
        timestamp: ISO-8601 string stored as ``attempt_timestamp`` on each
            new record and as ``archive_failure_timestamp`` on the section.

    Returns:
        The number of failure records that were appended (0 when every URL
        was already recorded).
    """
    # A key that is present with an empty YAML value loads as None; treat it
    # the same as a missing key instead of crashing on it.
    if data.get('web_enrichment') is None:
        data['web_enrichment'] = {}
    enrichment = data['web_enrichment']

    if enrichment.get('archive_failures') is None:
        enrichment['archive_failures'] = []
    recorded = enrichment['archive_failures']

    # Re-running the script must not pile up duplicate records for a URL.
    known_urls = {item.get('url') for item in recorded if isinstance(item, dict)}

    added = 0
    for url, reason in failures:
        if url in known_urls:
            continue
        recorded.append({
            'url': url,
            'failure_reason': reason,
            'attempt_timestamp': timestamp,
            'archivable': False
        })
        known_urls.add(url)
        added += 1

    if added:
        enrichment['archive_failure_timestamp'] = timestamp
    return added


def main():
    """Write the failures listed in ``FAILURES`` into their entry YAML files.

    For every filename in ``FAILURES`` the file under ``ENTRIES_DIR`` is
    loaded, new failure records are added under
    ``web_enrichment.archive_failures``, and the file is rewritten. Missing
    files, empty/non-mapping documents and entries that already contain all
    listed URLs are reported and left untouched. A summary count is printed
    at the end.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    updated = 0

    for filename, failures in FAILURES.items():
        filepath = ENTRIES_DIR / filename

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except FileNotFoundError:
            print(f" ✗ Not found: {filename}")
            continue

        # Only a non-empty mapping can carry the metadata section.
        if not data or not isinstance(data, dict):
            print(f" ✗ Skipped (empty or not a mapping): {filename}")
            continue

        if not _record_failures(data, failures, timestamp):
            print(f" - Already marked: {filename}")
            continue

        # Serialize before opening the file for writing, so a dump error
        # cannot leave the entry truncated.
        serialized = yaml.dump(
            data, default_flow_style=False, allow_unicode=True, sort_keys=False
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

        print(f" ✓ Marked failures: {filename}")
        updated += 1

    print()
    print(f"Updated {updated} entries with archive failure metadata")
|
|
|
|
|
|
# Run the update only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|