#!/usr/bin/env python3
"""
Resume CONABIP scrape with progress saving.

Saves after every 50 profiles to avoid data loss on timeout. When a
checkpoint from an interrupted run is found, the profiles already saved to
the JSON export are restored before scraping continues, so the final export
still contains them.
"""

import sys
import json
import csv
from pathlib import Path

# Add this script's own directory to the path so the sibling scraper module
# can be imported regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))
from scrape_conabip_argentina import CONABIPScraper

# Number of profiles processed between two progress saves.
CHECKPOINT_INTERVAL = 50


def _load_checkpoint(checkpoint_file):
    """Return the resume index stored in *checkpoint_file*, or 0.

    A missing, unreadable, or malformed checkpoint is treated as "start from
    the beginning" instead of aborting the run.
    """
    if not checkpoint_file.exists():
        return 0
    try:
        with open(checkpoint_file, 'r', encoding='utf-8') as f:
            checkpoint = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Ignoring unreadable checkpoint: {e}")
        return 0
    if not isinstance(checkpoint, dict):
        return 0
    index = checkpoint.get('last_profile_index', 0)
    if isinstance(index, bool) or not isinstance(index, int) or index < 0:
        return 0
    return index


def _write_checkpoint(checkpoint_file, next_index):
    """Atomically record *next_index* as the position to resume from."""
    # Write to a sibling temp file and rename it over the checkpoint so that
    # an interruption mid-write cannot leave a truncated JSON file behind.
    tmp_file = checkpoint_file.with_name(checkpoint_file.name + '.tmp')
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump({'last_profile_index': next_index}, f)
    tmp_file.replace(checkpoint_file)


def _restore_saved_profiles(scraper, output_json, count):
    """Put the first *count* previously exported records back into *scraper*.

    Returns True when the records were restored (or there was nothing to
    restore) and False when the saved export cannot be trusted, in which
    case the caller should scrape from the beginning again.
    """
    if count == 0:
        return True
    try:
        with open(output_json, 'r', encoding='utf-8') as f:
            saved = json.load(f)
    except (OSError, ValueError):
        return False

    # NOTE(review): the layout written by CONABIPScraper.export_to_json is not
    # visible from this script. This assumes either a bare list of records or
    # a dict holding that list under "institutions", and that the exported
    # records have the same shape as the in-memory ones -- confirm against
    # the scraper.
    if isinstance(saved, dict):
        saved = saved.get('institutions')
    if not isinstance(saved, list):
        return False

    fresh = scraper.institutions
    if len(saved) < count or len(fresh) < count:
        return False

    # Only trust the saved data if it lines up with the freshly scraped
    # listing; otherwise the site changed and the indices no longer match.
    for old, new in zip(saved[:count], fresh[:count]):
        if not isinstance(old, dict) or not isinstance(new, dict):
            return False
        if old.get('name') != new.get('name'):
            return False

    fresh[:count] = saved[:count]
    return True


def main():
    """Scrape all CONABIP institutions and their profiles with checkpointing."""
    output_dir = Path("data/isil/AR")
    output_csv = output_dir / "conabip_libraries_full_with_profiles.csv"
    output_json = output_dir / "conabip_libraries_full_with_profiles.json"
    checkpoint_file = output_dir / "scrape_checkpoint.json"

    # The checkpoint is written by this script, so make sure its directory
    # exists before the first save.
    output_dir.mkdir(parents=True, exist_ok=True)

    scraper = CONABIPScraper(rate_limit_delay=2.5)

    # Load checkpoint if exists
    start_index = _load_checkpoint(checkpoint_file)
    if start_index:
        print(f"Resuming from institution #{start_index}")

    # Scrape main pages first
    print("Scraping main search pages...")
    scraper.scrape_all_pages()
    total = len(scraper.institutions)
    print(f"Found {total} institutions")

    # The listing above was rebuilt from scratch, so profiles scraped before
    # the interruption must be restored; otherwise the next export would
    # overwrite them with un-enhanced records.
    if start_index and not _restore_saved_profiles(scraper, output_json, start_index):
        print("Could not restore previously scraped profiles; restarting from the beginning")
        start_index = 0

    # Scrape profiles with checkpointing
    print(f"\nScraping profiles {start_index+1} to {total}...")
    for i in range(start_index, total):
        inst = scraper.institutions[i]
        print(f"[{i+1}/{total}] Scraping {inst['name']}")

        # Best effort per profile: a failure keeps the un-enhanced record and
        # moves on. The scraper's exception types are not visible here, hence
        # the broad catch at this boundary.
        try:
            scraper.institutions[i] = scraper.scrape_profile_page(inst)
        except Exception as e:
            print(f" ✗ Error scraping profile: {e}")

        # Save progress outside the scrape's try block so that a failed
        # profile on an interval boundary does not skip the checkpoint.
        if (i + 1) % CHECKPOINT_INTERVAL == 0:
            try:
                scraper.export_to_csv(str(output_csv))
                scraper.export_to_json(str(output_json))
                # Written last: the checkpoint must never point past data
                # that has not been exported.
                _write_checkpoint(checkpoint_file, i + 1)
                print(f" ✓ Checkpoint saved at {i+1}/{total}")
            except Exception as e:
                print(f" ✗ Error saving checkpoint: {e}")

    # Final save
    scraper.export_to_csv(str(output_csv))
    scraper.export_to_json(str(output_json))

    # Remove checkpoint on completion
    if checkpoint_file.exists():
        checkpoint_file.unlink()

    print(f"\n✓ Complete! Scraped {total} institutions")
    print(f" CSV: {output_csv}")
    print(f" JSON: {output_json}")


if __name__ == "__main__":
    main()