#!/bin/bash
# Batch scrape CONABIP in 3 runs of ~100 institutions each
# Each run takes ~5 minutes (within timeout limit)

cd /Users/kempersc/apps/glam

echo "=== Batch 1: Institutions 1-100 ==="
python3 << 'PYTHON1'
import sys
sys.path.insert(0, 'scripts/scrapers')

from urllib.parse import urlparse

from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)  # Faster rate
scraper.scrape_all(scrape_profiles=False)  # Get basic data first

print(f"Total institutions: {len(scraper.institutions)}")

# Scrape first 100 profiles only
for i in range(min(100, len(scraper.institutions))):
    inst = scraper.institutions[i]
    if inst.get('profile_url'):
        try:
            profile_path = urlparse(inst['profile_url']).path
            profile_data = scraper._scrape_profile_page(profile_path)
            scraper.institutions[i].update(profile_data)
            print(f"[{i+1}/100] {inst['name']}")
        except Exception:
            pass  # Skip profiles that fail to load or parse

# Save batch 1
scraper.export_to_csv("data/isil/AR/conabip_batch1.csv")
scraper.export_to_json("data/isil/AR/conabip_batch1.json")
print("Batch 1 complete!")
PYTHON1

echo -e "\n=== Batch 2: Institutions 101-200 ==="
python3 << 'PYTHON2'
import sys
sys.path.insert(0, 'scripts/scrapers')

from urllib.parse import urlparse

from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)
scraper.scrape_all(scrape_profiles=False)

for i in range(100, min(200, len(scraper.institutions))):
    inst = scraper.institutions[i]
    if inst.get('profile_url'):
        try:
            profile_path = urlparse(inst['profile_url']).path
            profile_data = scraper._scrape_profile_page(profile_path)
            scraper.institutions[i].update(profile_data)
            print(f"[{i+1}/200] {inst['name']}")
        except Exception:
            pass

scraper.export_to_csv("data/isil/AR/conabip_batch2.csv")
scraper.export_to_json("data/isil/AR/conabip_batch2.json")
print("Batch 2 complete!")
PYTHON2

echo -e "\n=== Batch 3: Institutions 201-288 ==="
python3 << 'PYTHON3'
import sys
sys.path.insert(0, 'scripts/scrapers')

from urllib.parse import urlparse

from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)
scraper.scrape_all(scrape_profiles=False)

for i in range(200, len(scraper.institutions)):
    inst = scraper.institutions[i]
    if inst.get('profile_url'):
        try:
            profile_path = urlparse(inst['profile_url']).path
            profile_data = scraper._scrape_profile_page(profile_path)
            scraper.institutions[i].update(profile_data)
            print(f"[{i+1}/{len(scraper.institutions)}] {inst['name']}")
        except Exception:
            pass

scraper.export_to_csv("data/isil/AR/conabip_batch3.csv")
scraper.export_to_json("data/isil/AR/conabip_batch3.json")
print("Batch 3 complete!")
PYTHON3

echo -e "\n=== Merging batches ==="
python3 << 'PYTHONMERGE'
import json
from pathlib import Path

# Merge all JSON files
all_institutions = []
for i in [1, 2, 3]:
    file = Path(f"data/isil/AR/conabip_batch{i}.json")
    if file.exists():
        with open(file, encoding='utf-8') as f:
            data = json.load(f)
        all_institutions.extend(data['institutions'])

# Create final output
output = {
    "metadata": {
        "source": "CONABIP Argentina",
        "url": "https://www.conabip.gob.ar/buscador_bp",
        "total_institutions": len(all_institutions),
        "institutions_with_coordinates": sum(1 for i in all_institutions if i.get('latitude')),
        "institutions_with_services": sum(1 for i in all_institutions if i.get('services')),
    },
    "institutions": all_institutions
}

with open("data/isil/AR/conabip_libraries_full.json", 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print(f"✓ Merged {len(all_institutions)} institutions")
print("  Output: data/isil/AR/conabip_libraries_full.json")
PYTHONMERGE
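
# Optional sanity check (a minimal sketch, not part of the original batch runs):
# re-reads the merged JSON and reports coverage, assuming the 'latitude' field
# name used by the scraper above.
python3 << 'PYTHONCHECK'
import json

with open("data/isil/AR/conabip_libraries_full.json", encoding='utf-8') as f:
    merged = json.load(f)

institutions = merged['institutions']
with_coords = sum(1 for inst in institutions if inst.get('latitude'))
print(f"Merged file holds {len(institutions)} institutions, {with_coords} with coordinates")
PYTHONCHECK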