#!/bin/bash
# batch_scrape_conabip.sh: batch scrape CONABIP in 3 runs of ~100 institutions each.
# Each run takes ~5 minutes, which stays within the timeout limit.

# Run from the repository root (this script lives in scripts/scrapers/).
cd "$(dirname "$0")/../.."
echo "=== Batch 1: Institutions 1-100 ==="
python3 << 'PYTHON1'
import sys
sys.path.insert(0, 'scripts/scrapers')
from urllib.parse import urlparse
from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)  # faster rate than the default
scraper.scrape_all(scrape_profiles=False)  # fetch the basic listing first
print(f"Total institutions: {len(scraper.institutions)}")

# Scrape the first 100 profiles only
for i in range(min(100, len(scraper.institutions))):
    inst = scraper.institutions[i]
    if inst.get('profile_url'):
        try:
            profile_path = urlparse(inst['profile_url']).path
            profile_data = scraper._scrape_profile_page(profile_path)
            scraper.institutions[i].update(profile_data)
            print(f"[{i+1}/100] {inst['name']}")
        except Exception as e:
            # Log and keep going; one bad profile should not stop the batch
            print(f"[{i+1}/100] Failed: {inst['name']}: {e}")

# Save batch 1
scraper.export_to_csv("data/isil/AR/conabip_batch1.csv")
scraper.export_to_json("data/isil/AR/conabip_batch1.json")
print("Batch 1 complete!")
PYTHON1
echo -e "\n=== Batch 2: Institutions 101-200 ==="
python3 << 'PYTHON2'
import sys
sys.path.insert(0, 'scripts/scrapers')
from urllib.parse import urlparse
from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)
scraper.scrape_all(scrape_profiles=False)

# Scrape profiles 101-200
for i in range(100, min(200, len(scraper.institutions))):
    inst = scraper.institutions[i]
    if inst.get('profile_url'):
        try:
            profile_path = urlparse(inst['profile_url']).path
            profile_data = scraper._scrape_profile_page(profile_path)
            scraper.institutions[i].update(profile_data)
            print(f"[{i+1}/200] {inst['name']}")
        except Exception as e:
            print(f"[{i+1}/200] Failed: {inst['name']}: {e}")

scraper.export_to_csv("data/isil/AR/conabip_batch2.csv")
scraper.export_to_json("data/isil/AR/conabip_batch2.json")
print("Batch 2 complete!")
PYTHON2
echo -e "\n=== Batch 3: Institutions 201-288 ==="
python3 << 'PYTHON3'
import sys
sys.path.insert(0, 'scripts/scrapers')
from urllib.parse import urlparse
from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)
scraper.scrape_all(scrape_profiles=False)

# Scrape the remaining profiles (201 to the end)
total = len(scraper.institutions)
for i in range(200, total):
    inst = scraper.institutions[i]
    if inst.get('profile_url'):
        try:
            profile_path = urlparse(inst['profile_url']).path
            profile_data = scraper._scrape_profile_page(profile_path)
            scraper.institutions[i].update(profile_data)
            print(f"[{i+1}/{total}] {inst['name']}")
        except Exception as e:
            print(f"[{i+1}/{total}] Failed: {inst['name']}: {e}")

scraper.export_to_csv("data/isil/AR/conabip_batch3.csv")
scraper.export_to_json("data/isil/AR/conabip_batch3.json")
print("Batch 3 complete!")
PYTHON3
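
# Sanity check before merging: warn about any missing batch output.
# A minimal sketch; the merge step below already skips files that do not
# exist, so this only makes a failed batch visible in the log.
for n in 1 2 3; do
  if [ ! -f "data/isil/AR/conabip_batch${n}.json" ]; then
    echo "Warning: data/isil/AR/conabip_batch${n}.json is missing" >&2
  fi
done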
echo -e "\n=== Merging batches ==="
python3 << 'PYTHONMERGE'
import json
from pathlib import Path

# Merge the three batch JSON files
all_institutions = []
for i in [1, 2, 3]:
    file = Path(f"data/isil/AR/conabip_batch{i}.json")
    if file.exists():
        with file.open(encoding='utf-8') as f:
            data = json.load(f)
        all_institutions.extend(data['institutions'])

# Build the final output with summary metadata
output = {
    "metadata": {
        "source": "CONABIP Argentina",
        "url": "https://www.conabip.gob.ar/buscador_bp",
        "total_institutions": len(all_institutions),
        "institutions_with_coordinates": sum(1 for i in all_institutions if i.get('latitude')),
        "institutions_with_services": sum(1 for i in all_institutions if i.get('services')),
    },
    "institutions": all_institutions,
}

with open("data/isil/AR/conabip_libraries_full.json", 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print(f"✓ Merged {len(all_institutions)} institutions")
print("  Output: data/isil/AR/conabip_libraries_full.json")
PYTHONMERGE
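
echo -e "\n=== Merging batch CSVs (optional) ==="
# The batches above also export per-batch CSVs, but only the JSONs get merged.
# A minimal sketch for concatenating them as well; the combined filename
# conabip_libraries_full.csv is an assumption, not an existing output.
python3 << 'PYTHONCSV'
import csv
from pathlib import Path

rows, fieldnames = [], []
for i in [1, 2, 3]:
    file = Path(f"data/isil/AR/conabip_batch{i}.csv")
    if file.exists():
        with file.open(newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            # Collect the union of columns, preserving first-seen order
            for name in reader.fieldnames or []:
                if name not in fieldnames:
                    fieldnames.append(name)
            rows.extend(reader)

out = Path("data/isil/AR/conabip_libraries_full.csv")  # assumed filename
with out.open('w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)  # missing columns are written as empty cells
print(f"✓ Merged {len(rows)} rows into {out}")
PYTHONCSV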