#!/bin/bash
#
# Batch scrape CONABIP in 3 runs of ~100 institutions each.
# Each run takes ~5 minutes (within timeout limit).
#
# Output: per-batch CSV/JSON under data/isil/AR/, merged into
# conabip_libraries_full.json at the end.

# Abort immediately if the project root is missing: every batch below
# reads scripts/scrapers/ and writes data/isil/AR/ relative to this dir,
# so continuing after a failed cd would scrape into the wrong place.
cd /Users/kempersc/apps/glam || { echo "ERROR: cannot cd to project root" >&2; exit 1; }

echo "=== Batch 1: Institutions 1-100 ==="
python3 << 'PYTHON1'
"""Batch 1: fetch listing, then scrape profile pages for institutions 1-100."""
import sys
sys.path.insert(0, 'scripts/scrapers')

# Hoisted out of the loop (original re-imported urlparse every iteration).
from urllib.parse import urlparse

from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)  # Faster rate
scraper.scrape_all(scrape_profiles=False)  # Get basic data first
print(f"Total institutions: {len(scraper.institutions)}")

# Scrape first 100 profiles only
for i in range(min(100, len(scraper.institutions))):
    inst = scraper.institutions[i]
    if not inst.get('profile_url'):
        continue
    try:
        profile_path = urlparse(inst['profile_url']).path
        profile_data = scraper._scrape_profile_page(profile_path)
        scraper.institutions[i].update(profile_data)
        print(f"[{i+1}/100] {inst['name']}")
    except Exception as exc:
        # Best-effort per profile, but never a bare except: the original
        # silently swallowed everything, including KeyboardInterrupt.
        print(f"[{i+1}/100] profile failed for {inst.get('name', '?')}: {exc}",
              file=sys.stderr)

# Save batch 1
scraper.export_to_csv("data/isil/AR/conabip_batch1.csv")
scraper.export_to_json("data/isil/AR/conabip_batch1.json")
print("Batch 1 complete!")
PYTHON1
echo -e "\n=== Batch 2: Institutions 101-200 ==="
python3 << 'PYTHON2'
"""Batch 2: fetch listing, then scrape profile pages for institutions 101-200."""
import sys
sys.path.insert(0, 'scripts/scrapers')

from urllib.parse import urlparse

from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)
scraper.scrape_all(scrape_profiles=False)

for i in range(100, min(200, len(scraper.institutions))):
    inst = scraper.institutions[i]
    if not inst.get('profile_url'):
        continue
    try:
        profile_path = urlparse(inst['profile_url']).path
        profile_data = scraper._scrape_profile_page(profile_path)
        scraper.institutions[i].update(profile_data)
        print(f"[{i+1}/200] {inst['name']}")
    except Exception as exc:
        # Best-effort per profile; log instead of a silent bare except.
        print(f"[{i+1}/200] profile failed for {inst.get('name', '?')}: {exc}",
              file=sys.stderr)

scraper.export_to_csv("data/isil/AR/conabip_batch2.csv")
scraper.export_to_json("data/isil/AR/conabip_batch2.json")
print("Batch 2 complete!")
PYTHON2
echo -e "\n=== Batch 3: Institutions 201-288 ==="
python3 << 'PYTHON3'
"""Batch 3: fetch listing, then scrape profile pages for institutions 201-end."""
import sys
sys.path.insert(0, 'scripts/scrapers')

from urllib.parse import urlparse

from scrape_conabip_argentina import CONABIPScraper

scraper = CONABIPScraper(rate_limit_delay=2.0)
scraper.scrape_all(scrape_profiles=False)

total = len(scraper.institutions)
for i in range(200, total):
    inst = scraper.institutions[i]
    if not inst.get('profile_url'):
        continue
    try:
        profile_path = urlparse(inst['profile_url']).path
        profile_data = scraper._scrape_profile_page(profile_path)
        scraper.institutions[i].update(profile_data)
        print(f"[{i+1}/{total}] {inst['name']}")
    except Exception as exc:
        # Best-effort per profile; log instead of a silent bare except.
        print(f"[{i+1}/{total}] profile failed for {inst.get('name', '?')}: {exc}",
              file=sys.stderr)

scraper.export_to_csv("data/isil/AR/conabip_batch3.csv")
scraper.export_to_json("data/isil/AR/conabip_batch3.json")
print("Batch 3 complete!")
PYTHON3
echo -e "\n=== Merging batches ==="
python3 << 'PYTHONMERGE'
"""Merge the three batch JSON files into one combined output with metadata."""
import json
from pathlib import Path

# Merge all batch JSON files (skip any batch that failed to produce output).
all_institutions = []
for batch_num in [1, 2, 3]:
    batch_path = Path(f"data/isil/AR/conabip_batch{batch_num}.json")
    if batch_path.exists():
        # Context manager + explicit encoding: the original leaked the
        # handle via json.load(open(...)) and relied on the locale encoding.
        with batch_path.open(encoding='utf-8') as f:
            data = json.load(f)
        all_institutions.extend(data['institutions'])

# Create final output
output = {
    "metadata": {
        "source": "CONABIP Argentina",
        "url": "https://www.conabip.gob.ar/buscador_bp",
        "total_institutions": len(all_institutions),
        "institutions_with_coordinates": sum(
            1 for inst in all_institutions if inst.get('latitude')
        ),
        "institutions_with_services": sum(
            1 for inst in all_institutions if inst.get('services')
        ),
    },
    "institutions": all_institutions,
}

with open("data/isil/AR/conabip_libraries_full.json", 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print(f"✓ Merged {len(all_institutions)} institutions")
print(f"  Output: data/isil/AR/conabip_libraries_full.json")
PYTHONMERGE