#!/usr/bin/env python3
"""
Resume CONABIP scrape with progress saving.

Saves after every 50 profiles to avoid data loss on timeout.
"""
|
import csv
import json
import sys
from pathlib import Path

# Make the sibling scraper module importable when this file runs as a script.
sys.path.insert(0, str(Path(__file__).parent))

from scrape_conabip_argentina import CONABIPScraper
|
def main():
    """Resume the CONABIP profile scrape, checkpointing every 50 profiles.

    Reads a checkpoint file (if present) to determine where to resume,
    re-scrapes the main search pages to rebuild the institution list, then
    scrapes individual profile pages from the resume point onward.  Results
    are exported to CSV and JSON every 50 profiles so a timeout loses at
    most 50 profiles' worth of work.  The checkpoint file is removed on
    successful completion.
    """
    output_dir = Path("data/isil/AR")
    output_csv = output_dir / "conabip_libraries_full_with_profiles.csv"
    output_json = output_dir / "conabip_libraries_full_with_profiles.json"
    checkpoint_file = output_dir / "scrape_checkpoint.json"

    scraper = CONABIPScraper(rate_limit_delay=2.5)

    # Load checkpoint if it exists so a previous partial run can resume.
    start_index = 0
    if checkpoint_file.exists():
        with open(checkpoint_file, 'r', encoding='utf-8') as f:
            checkpoint = json.load(f)
        start_index = checkpoint.get('last_profile_index', 0)
        print(f"Resuming from institution #{start_index}")

    def _save_progress(next_index):
        """Export current results and record the next profile index to resume from."""
        scraper.export_to_csv(str(output_csv))
        scraper.export_to_json(str(output_json))
        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump({'last_profile_index': next_index}, f)

    # Scrape main pages first to (re)build the institution list.
    # NOTE(review): this rebuilds scraper.institutions from scratch, so
    # profile data scraped before start_index in an earlier run is NOT
    # restored from the previously saved JSON — confirm whether the final
    # export is expected to lose those enhanced fields on resume.
    print("Scraping main search pages...")
    scraper.scrape_all_pages()
    total = len(scraper.institutions)
    print(f"Found {total} institutions")

    # Scrape profiles with checkpointing.
    print(f"\nScraping profiles {start_index+1} to {total}...")
    for i in range(start_index, total):
        inst = scraper.institutions[i]
        # .get() instead of ['name'] so one malformed record (missing its
        # name) logs and continues rather than KeyError-ing the whole run.
        print(f"[{i+1}/{total}] Scraping {inst.get('name', '<unnamed>')}")

        try:
            enhanced = scraper.scrape_profile_page(inst)
            scraper.institutions[i] = enhanced

            # Save checkpoint every 50 institutions.
            if (i + 1) % 50 == 0:
                _save_progress(i + 1)
                print(f" ✓ Checkpoint saved at {i+1}/{total}")

        except Exception as e:
            # Best-effort scrape: log the failure and move to the next one.
            print(f" ✗ Error scraping profile: {e}")
            continue

    # Final save.
    scraper.export_to_csv(str(output_csv))
    scraper.export_to_json(str(output_json))

    # Remove checkpoint on completion so the next run starts fresh.
    if checkpoint_file.exists():
        checkpoint_file.unlink()

    print(f"\n✓ Complete! Scraped {total} institutions")
    print(f" CSV: {output_csv}")
    print(f" JSON: {output_json}")
|
|
|
|
# Script entry point: run the resumable scrape when executed directly.
if __name__ == "__main__":
    main()
|