glam/scripts/scrapers/scrape_conabip_resume.py

#!/usr/bin/env python3
"""
Resume the CONABIP scrape with progress saving.
Exports results and writes a checkpoint after every 50 profiles so a
timeout does not lose work already done.
"""
import sys
import json
from pathlib import Path

# Add this script's directory to sys.path so the sibling scraper module can be
# imported when the file is run directly.
sys.path.insert(0, str(Path(__file__).parent))

from scrape_conabip_argentina import CONABIPScraper


def main():
    output_dir = Path("data/isil/AR")
    output_csv = output_dir / "conabip_libraries_full_with_profiles.csv"
    output_json = output_dir / "conabip_libraries_full_with_profiles.json"
    checkpoint_file = output_dir / "scrape_checkpoint.json"
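    # rate_limit_delay is presumably the pause between requests in seconds; 2.5
    # keeps the crawl rate modest (its exact semantics live in CONABIPScraper).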
    scraper = CONABIPScraper(rate_limit_delay=2.5)
    # Load checkpoint if exists
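    # (a checkpoint is a tiny JSON document of the form {"last_profile_index": N},
    #  meaning profiles 0..N-1 were already scraped and exported by an earlier run)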
    start_index = 0
    if checkpoint_file.exists():
        with open(checkpoint_file, 'r', encoding='utf-8') as f:
            checkpoint = json.load(f)
            start_index = checkpoint.get('last_profile_index', 0)
        print(f"Resuming from institution #{start_index}")
    # Scrape main pages first
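    # (the institution list is rebuilt on every run; only the per-profile scraping
    #  below resumes from the checkpoint, assuming the listing order is stable)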
    print("Scraping main search pages...")
    scraper.scrape_all_pages()
    total = len(scraper.institutions)
    print(f"Found {total} institutions")
    # Scrape profiles with checkpointing
    print(f"\nScraping profiles {start_index+1} to {total}...")
    for i in range(start_index, total):
        inst = scraper.institutions[i]
        print(f"[{i+1}/{total}] Scraping {inst['name']}")
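        # Failures on individual profiles are logged and skipped so a single bad
        # page does not abort the whole run.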
        try:
            enhanced = scraper.scrape_profile_page(inst)
            scraper.institutions[i] = enhanced

            # Save checkpoint every 50 institutions
            if (i + 1) % 50 == 0:
                # Save progress
                scraper.export_to_csv(str(output_csv))
                scraper.export_to_json(str(output_json))
                # Update checkpoint
                with open(checkpoint_file, 'w', encoding='utf-8') as f:
                    json.dump({'last_profile_index': i + 1}, f)
                print(f" ✓ Checkpoint saved at {i+1}/{total}")
        except Exception as e:
            print(f" ✗ Error scraping profile: {e}")
            continue
    # Final save
    scraper.export_to_csv(str(output_csv))
    scraper.export_to_json(str(output_json))

    # Remove checkpoint on completion
    if checkpoint_file.exists():
        checkpoint_file.unlink()

    print(f"\n✓ Complete! Scraped {total} institutions")
    print(f" CSV: {output_csv}")
    print(f" JSON: {output_json}")


if __name__ == "__main__":
    main()