#!/usr/bin/env python3
"""
Fetch website content and store as markdown files.

This script:
1. Reads YAML entry files to find source URLs
2. Fetches each URL and converts to markdown
3. Stores markdown in data/nde/enriched/entries/web/{entry_number}/
4. Updates the YAML file with reference to stored markdown

Usage:
    python scripts/fetch_website_markdown.py [--dry-run] [--limit N] [--entry ENTRY_NUM]
"""

import argparse
import hashlib
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml

try:
    # All three third-party fetch/convert dependencies are guarded together:
    # bs4 is required by fetch_and_convert just as much as httpx/markdownify,
    # so a missing bs4 must trip HAS_DEPS up front rather than surfacing as a
    # confusing per-URL fetch error later.
    import httpx
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    HAS_DEPS = True
except ImportError:
    HAS_DEPS = False
    print("Warning: httpx, beautifulsoup4 and/or markdownify not installed. Install with:")
    print("  pip install httpx beautifulsoup4 markdownify")

# Directories
# NOTE(review): hard-coded absolute user path — consider deriving from the
# repository root or an environment variable; left unchanged to preserve behavior.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'


def sanitize_filename(url: str) -> str:
    """Create a safe filename from a URL (domain + path, unsafe chars replaced)."""
    parsed = urlparse(url)
    # Use domain + path, sanitized
    name = f"{parsed.netloc}{parsed.path}"
    # Replace unsafe characters
    name = re.sub(r'[^\w\-.]', '_', name)
    # Limit length
    if len(name) > 100:
        # Use hash for long names so distinct long URLs stay distinct
        name = name[:50] + '_' + hashlib.md5(name.encode()).hexdigest()[:16]
    return name


def fetch_and_convert(url: str, timeout: int = 30) -> tuple[str | None, str | None]:
    """
    Fetch URL and convert to markdown.

    Args:
        url: The http(s) URL to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        Tuple of (markdown_content, error_message); exactly one is non-None.
    """
    if not HAS_DEPS:
        return None, "Dependencies not installed (httpx, beautifulsoup4, markdownify)"

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; GLAMBot/1.0; heritage-data-collection)'
        }
        with httpx.Client(follow_redirects=True, timeout=timeout) as client:
            response = client.get(url, headers=headers)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '')
            if 'text/html' not in content_type.lower():
                return None, f"Not HTML content: {content_type}"

            html = response.text

        # Pre-process HTML to remove script content that might leak through
        soup = BeautifulSoup(html, 'html.parser')

        # Remove unwanted elements (scripts, chrome, interactive widgets)
        for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                      'aside', 'form', 'iframe', 'noscript', 'svg',
                                      'button', 'input', 'select', 'textarea']):
            element.decompose()

        # Remove elements with common ad/tracking classes
        for element in soup.find_all(class_=lambda x: x and any(
            term in str(x).lower()
            for term in ['cookie', 'gdpr', 'consent', 'tracking',
                         'analytics', 'advertisement']
        )):
            element.decompose()

        # Convert cleaned HTML to markdown
        markdown = md(
            str(soup),
            heading_style='atx',
            bullets='-',
            strip=[]  # Already stripped above
        )

        # Clean up excessive whitespace
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)
        markdown = markdown.strip()

        return markdown, None

    except httpx.TimeoutException:
        return None, f"Timeout fetching {url}"
    except httpx.HTTPStatusError as e:
        return None, f"HTTP {e.response.status_code}: {url}"
    except Exception as e:
        return None, f"Error fetching {url}: {str(e)}"


def get_urls_from_entry(data: dict) -> list[str]:
    """Extract all source URLs from an entry.

    Looks in web_enrichment.source_url, web_enrichment.raw_sources[].url and
    original_entry.webadres_organisatie. Returns a sorted, de-duplicated list
    of http(s) URLs so processing order is deterministic across runs.
    """
    urls = set()

    # Check web_enrichment
    if 'web_enrichment' in data:
        we = data['web_enrichment']
        if we.get('source_url'):
            urls.add(we['source_url'])
        # Check raw_sources
        for source in we.get('raw_sources', []):
            if source.get('url'):
                urls.add(source['url'])

    # Check original_entry for website
    if 'original_entry' in data:
        oe = data['original_entry']
        if oe.get('webadres_organisatie'):
            urls.add(oe['webadres_organisatie'])

    # Filter out non-http URLs; sort for a stable fetch order (sets are unordered)
    return sorted(u for u in urls if u.startswith('http'))


def extract_entry_number(filename: str) -> str:
    """Extract entry number from filename like '0034_rolder_historisch_gezelschap.yaml'."""
    match = re.match(r'^(\d+)', filename)
    return match.group(1) if match else filename.replace('.yaml', '')


def process_entry(filepath: Path, dry_run: bool = False) -> tuple[int, int, list[str]]:
    """
    Process a single entry file: fetch its URLs, store markdown, update the YAML.

    Args:
        filepath: Path to the entry YAML file.
        dry_run: When True, only report what would be fetched; no writes.

    Returns:
        Tuple of (urls_fetched, urls_failed, error_messages).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return 0, 0, ["Empty file"]

    urls = get_urls_from_entry(data)
    if not urls:
        return 0, 0, []

    entry_num = extract_entry_number(filepath.name)
    entry_web_dir = WEB_DIR / entry_num

    fetched = 0
    failed = 0
    errors = []
    markdown_files = []

    for url in urls:
        filename = sanitize_filename(url) + '.md'
        md_path = entry_web_dir / filename

        # Check if already fetched
        if md_path.exists():
            markdown_files.append(str(md_path.relative_to(ENTRIES_DIR)))
            continue

        if dry_run:
            print(f"  Would fetch: {url}")
            fetched += 1
            continue

        # Fetch and save
        markdown, error = fetch_and_convert(url)
        if error:
            errors.append(error)
            failed += 1
            continue

        # Reject trivially short pages (error stubs, empty shells)
        if not markdown or len(markdown) < 100:
            errors.append(f"Empty or too short content from {url}")
            failed += 1
            continue

        # Create directory and save
        entry_web_dir.mkdir(parents=True, exist_ok=True)

        # Add metadata header (YAML front matter) so each file is traceable
        header = f"""---
source_url: {url}
fetch_timestamp: {datetime.now(timezone.utc).isoformat()}
entry_file: {filepath.name}
---

"""
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(header + markdown)

        markdown_files.append(str(md_path.relative_to(ENTRIES_DIR)))
        fetched += 1

        # Rate limiting: be polite to remote servers
        time.sleep(1)

    # Update YAML with markdown file references
    if markdown_files and not dry_run:
        if 'web_enrichment' not in data:
            data['web_enrichment'] = {}
        data['web_enrichment']['markdown_files'] = markdown_files
        data['web_enrichment']['markdown_fetch_timestamp'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return fetched, failed, errors


def main():
    """CLI entry point. Returns 0 on full success, 1 if any URL failed."""
    parser = argparse.ArgumentParser(description='Fetch website content and store as markdown')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries to process')
    parser.add_argument('--entry', type=str, default=None, help='Process specific entry number')
    # BooleanOptionalAction keeps --skip-existing working and adds
    # --no-skip-existing; the old store_true/default=True combo made the
    # flag impossible to disable.
    parser.add_argument('--skip-existing', action=argparse.BooleanOptionalAction, default=True,
                        help='Skip entries that already have markdown files')
    args = parser.parse_args()

    if not HAS_DEPS and not args.dry_run:
        print("Error: Required dependencies not installed. Use --dry-run or install deps.")
        return 1

    # Find entry files
    if args.entry:
        files = list(ENTRIES_DIR.glob(f'{args.entry}*.yaml'))
    else:
        files = sorted(ENTRIES_DIR.glob('*.yaml'))

    # Explicit None check so --limit 0 means "process nothing", not "no limit"
    if args.limit is not None:
        files = files[:args.limit]

    total_fetched = 0
    total_failed = 0
    total_skipped = 0
    entries_processed = 0

    for filepath in files:
        # Skip if already has markdown files
        if args.skip_existing:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if data and data.get('web_enrichment', {}).get('markdown_files'):
                total_skipped += 1
                continue

        print(f"Processing: {filepath.name}")
        fetched, failed, errors = process_entry(filepath, dry_run=args.dry_run)

        if fetched or failed:
            entries_processed += 1
        total_fetched += fetched
        total_failed += failed

        if errors:
            for e in errors:
                print(f"  Error: {e}")

    print(f"\n{'DRY RUN ' if args.dry_run else ''}Summary:")
    print(f"  Entries processed: {entries_processed}")
    print(f"  Entries skipped (already have markdown): {total_skipped}")
    print(f"  URLs fetched: {total_fetched}")
    print(f"  URLs failed: {total_failed}")

    return 0 if total_failed == 0 else 1


if __name__ == '__main__':
    sys.exit(main())