#!/usr/bin/env python3
"""
CONABIP (Argentina) Popular Libraries Web Scraper
Extracts data from Argentina's National Commission on Popular Libraries (CONABIP).
Scrapes the searchable database at https://www.conabip.gob.ar/buscador_bp
Data Fields Extracted:
- Registration number (REG: XXXXX)
- Library name
- Province
- City/Locality
- Neighborhood (Barrio)
- Street address
- Geographic coordinates (from Google Maps links)
- Detail page URL
- Services offered (from profile pages)
Author: GLAM Data Extraction Project
Date: 2025-11-17
License: MIT
"""
import requests
from bs4 import BeautifulSoup
import time
import csv
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import logging
from urllib.parse import urljoin, urlparse, parse_qs
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Output directory
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "AR"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


class CONABIPScraper:
    """
    Scrapes Argentina's CONABIP popular library database.

    Respects server resources with rate limiting and error handling.
    """

    def __init__(self, rate_limit_delay: float = 2.0):
        """
        Initialize the scraper.

        Args:
            rate_limit_delay: Seconds to wait between requests (default: 2.0)
        """
        self.base_url = "https://www.conabip.gob.ar"
        self.search_url = f"{self.base_url}/buscador_bp"
        self.rate_limit_delay = rate_limit_delay

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Extraction Bot; Academic Research; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
        })

        self.institutions = []
        self.errors = []

    def _parse_registration_number(self, text: str) -> Optional[int]:
        """
        Extract registration number from text like "REG: 18" or "(REG: 18)".

        Args:
            text: Text containing registration number

        Returns:
            Registration number as integer, or None if not found
        """
        match = re.search(r'REG:\s*(\d+)', text, re.IGNORECASE)
        if match:
            return int(match.group(1))
        return None
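
    # Illustrative behaviour of the helper above (the library name is made up;
    # "REG: 18" follows the format shown in the docstring):
    #   "Biblioteca Popular Sarmiento (REG: 18)"   -> 18
    #   "Text without a registration token"        -> None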

    def _parse_coordinates(self, maps_url: str) -> Tuple[Optional[float], Optional[float]]:
        """
        Extract latitude/longitude from Google Maps URL.

        Example URL: http://www.google.com/maps/place/-34.598461,-58.494690

        Args:
            maps_url: Google Maps place URL

        Returns:
            Tuple of (latitude, longitude) or (None, None) if parsing fails
        """
        try:
            # Pattern: /maps/place/LAT,LON
            match = re.search(r'/maps/place/([-\d.]+),([-\d.]+)', maps_url)
            if match:
                lat = float(match.group(1))
                lon = float(match.group(2))
                return (lat, lon)
        except (ValueError, AttributeError) as e:
            logger.warning(f"Failed to parse coordinates from {maps_url}: {e}")
        return (None, None)
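
    # Illustrative round trip, using the example URL from the docstring above:
    #   _parse_coordinates("http://www.google.com/maps/place/-34.598461,-58.494690")
    #   -> (-34.598461, -58.49469)
    # Any URL the regex cannot match simply yields (None, None).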

    def _extract_services_from_images(self, soup: BeautifulSoup) -> List[str]:
        """
        Extract services offered from icon images on profile page.

        Args:
            soup: BeautifulSoup of profile page

        Returns:
            List of service names
        """
        services = []
        # Services are indicated by image icons with alt/title attributes
        service_images = soup.select('.bipopServices img')
        for img in service_images:
            service_name = img.get('title') or img.get('alt')
            if service_name:
                services.append(service_name.strip())
        return services

    def _scrape_profile_page(self, profile_url: str) -> Dict:
        """
        Scrape additional details from institution profile page.

        Args:
            profile_url: URL of profile page (e.g., /bipop/1342)

        Returns:
            Dictionary with additional fields (services, coordinates, etc.)
        """
        full_url = urljoin(self.base_url, profile_url)
        try:
            time.sleep(self.rate_limit_delay)  # Rate limiting
            response = self.session.get(full_url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            profile_data = {}

            # Extract services from icons
            services = self._extract_services_from_images(soup)
            if services:
                profile_data['services'] = services

            # Extract Google Maps coordinates
            maps_link = soup.select_one('.mapdata a[href*="google.com/maps"]')
            if maps_link:
                maps_url = maps_link['href']
                lat, lon = self._parse_coordinates(maps_url)
                # Explicit None checks so a legitimate 0.0 coordinate would not be dropped
                if lat is not None and lon is not None:
                    profile_data['latitude'] = lat
                    profile_data['longitude'] = lon
                    profile_data['maps_url'] = maps_url

            return profile_data
        except requests.RequestException as e:
            logger.error(f"Failed to scrape profile {full_url}: {e}")
            self.errors.append({
                'url': full_url,
                'error': str(e),
                'timestamp': datetime.now(timezone.utc).isoformat()
            })
            return {}

    def scrape_page(self, page_num: int = 0) -> List[Dict]:
        """
        Scrape a single page of search results.

        Args:
            page_num: Page number (0-indexed for query parameter)

        Returns:
            List of institution dictionaries
        """
        params = {
            'province': 'All',
            'city': '',
            'field_nombre_de_la_biblioteca_value': '',
            'field_n_mero_de_registro_value': '',
        }
        if page_num > 0:
            params['page'] = page_num

        logger.info(f"Scraping page {page_num + 1}...")

        try:
            time.sleep(self.rate_limit_delay)  # Rate limiting
            response = self.session.get(self.search_url, params=params, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find results table
            table = soup.select_one('table.views-table')
            if not table:
                logger.warning(f"No results table found on page {page_num + 1}")
                return []

            # Extract rows
            rows = table.select('tbody tr')
            if not rows:
                logger.info(f"No results on page {page_num + 1} (end of pagination)")
                return []

            page_institutions = []
            for row in rows:
                try:
                    cells = row.find_all('td')
                    if len(cells) < 6:
                        logger.warning(f"Row has fewer than 6 cells: {len(cells)}")
                        continue

                    # Column 0: Name and registration number
                    name_cell = cells[0]
                    name_strong = name_cell.find('b')
                    if name_strong:
                        full_name_text = name_strong.get_text(strip=True)
                        # Extract registration number from full text
                        reg_text = name_cell.get_text(strip=True)
                        reg_number = self._parse_registration_number(reg_text)
                        name = full_name_text
                    else:
                        name = name_cell.get_text(strip=True)
                        reg_number = None

                    # Column 1: Province
                    province = cells[1].get_text(strip=True)

                    # Column 2: City/Locality
                    city = cells[2].get_text(strip=True)

                    # Column 3: Neighborhood
                    neighborhood = cells[3].get_text(strip=True)

                    # Column 4: Street address (may contain <br> with duplicate neighborhood)
                    address_cell = cells[4]
                    # Get first line before <br> tag
                    address_parts = list(address_cell.stripped_strings)
                    street_address = address_parts[0] if address_parts else ''

                    # Column 5: Profile link
                    profile_link = cells[5].select_one('a[href^="/bipop/"]')
                    profile_url = profile_link['href'] if profile_link else None

                    institution = {
                        'conabip_reg': str(reg_number) if reg_number is not None else None,
                        'name': name,
                        'province': province if province else None,
                        'city': city if city else None,
                        'neighborhood': neighborhood if neighborhood else None,
                        'street_address': street_address if street_address else None,
                        'profile_url': urljoin(self.base_url, profile_url) if profile_url else None,
                        'extraction_date': datetime.now(timezone.utc).isoformat(),
                        'data_source': 'CONABIP',
                        'country': 'AR'
                    }
                    page_institutions.append(institution)
                except Exception as e:
                    logger.error(f"Error parsing row: {e}")
                    self.errors.append({
                        'page': page_num,
                        'error': str(e),
                        'timestamp': datetime.now(timezone.utc).isoformat()
                    })

            logger.info(f"Extracted {len(page_institutions)} institutions from page {page_num + 1}")
            return page_institutions
        except requests.RequestException as e:
            logger.error(f"Failed to scrape page {page_num + 1}: {e}")
            self.errors.append({
                'page': page_num,
                'error': str(e),
                'timestamp': datetime.now(timezone.utc).isoformat()
            })
            return []
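
    # Shape of one extracted record, for orientation (field values here are
    # invented; the keys match the dict built in scrape_page above):
    #   {
    #       'conabip_reg': '18', 'name': 'Biblioteca Popular ...',
    #       'province': 'Buenos Aires', 'city': '...', 'neighborhood': '...',
    #       'street_address': '...', 'profile_url': 'https://www.conabip.gob.ar/bipop/1342',
    #       'extraction_date': '<UTC ISO timestamp>', 'data_source': 'CONABIP', 'country': 'AR'
    #   }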

    def get_total_pages(self) -> int:
        """
        Determine the total number of pages in the search results.

        Returns:
            Total number of pages
        """
        try:
            response = self.session.get(self.search_url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find pagination
            pagination = soup.select('.pagination li a')
            max_page = 0
            for link in pagination:
                href = link.get('href', '')
                # Extract page parameter from URL
                match = re.search(r'page=(\d+)', href)
                if match:
                    page_num = int(match.group(1))
                    max_page = max(max_page, page_num)

            # Page numbers are 0-indexed, so total pages = max_page + 1
            total_pages = max_page + 1 if max_page > 0 else 1
            logger.info(f"Found {total_pages} total pages")
            return total_pages
        except requests.RequestException as e:
            logger.error(f"Failed to determine total pages: {e}")
            return 1  # Default to 1 page if detection fails

    def scrape_all(self, max_pages: Optional[int] = None, scrape_profiles: bool = False) -> List[Dict]:
        """
        Scrape all pages of the CONABIP database.

        Args:
            max_pages: Maximum number of pages to scrape (None = all pages)
            scrape_profiles: Whether to scrape individual profile pages for additional data

        Returns:
            List of all institution dictionaries
        """
        logger.info("Starting CONABIP scrape...")
        logger.info(f"Rate limit delay: {self.rate_limit_delay} seconds")
        logger.info(f"Scrape profiles: {scrape_profiles}")

        # Determine total pages
        total_pages = self.get_total_pages()
        if max_pages:
            total_pages = min(total_pages, max_pages)
            logger.info(f"Limiting scrape to {total_pages} pages")

        # Scrape each page
        for page_num in range(total_pages):
            page_institutions = self.scrape_page(page_num)
            self.institutions.extend(page_institutions)

        logger.info(f"Scraped {len(self.institutions)} institutions from {total_pages} pages")

        # Optionally scrape profile pages
        if scrape_profiles:
            logger.info("Scraping individual profile pages...")
            for idx, institution in enumerate(self.institutions, 1):
                profile_url = institution.get('profile_url')
                if profile_url:
                    logger.info(f"[{idx}/{len(self.institutions)}] Scraping profile: {institution['name']}")
                    # Extract relative URL path
                    parsed = urlparse(profile_url)
                    profile_path = parsed.path
                    profile_data = self._scrape_profile_page(profile_path)
                    institution.update(profile_data)

        logger.info(f"Scraping complete! Total institutions: {len(self.institutions)}")
        logger.info(f"Total errors: {len(self.errors)}")
        return self.institutions

    def export_to_csv(self, filename: str = "conabip_libraries.csv"):
        """
        Export scraped data to CSV.

        Args:
            filename: Output CSV filename
        """
        output_path = OUTPUT_DIR / filename

        if not self.institutions:
            logger.warning("No institutions to export")
            return

        # Determine all fields (union of all keys)
        all_fields = set()
        for inst in self.institutions:
            all_fields.update(inst.keys())
        # Sort fields for consistent output
        fieldnames = sorted(all_fields)

        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.institutions)

        logger.info(f"Exported {len(self.institutions)} institutions to {output_path}")

    def export_to_json(self, filename: str = "conabip_libraries.json"):
        """
        Export scraped data to JSON.

        Args:
            filename: Output JSON filename
        """
        output_path = OUTPUT_DIR / filename

        output_data = {
            'metadata': {
                'source': 'CONABIP - Comisión Nacional de Bibliotecas Populares',
                'source_url': self.search_url,
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'total_institutions': len(self.institutions),
                'total_errors': len(self.errors),
                'data_type': 'Popular libraries (bibliotecas populares)',
                'country': 'AR',
                'extractor': 'scrape_conabip_argentina.py'
            },
            'institutions': self.institutions,
            'errors': self.errors
        }

        with open(output_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(output_data, jsonfile, indent=2, ensure_ascii=False)

        logger.info(f"Exported {len(self.institutions)} institutions to {output_path}")

    def print_summary(self):
        """Print summary statistics of the scrape."""
        logger.info("\n" + "="*60)
        logger.info("CONABIP SCRAPE SUMMARY")
        logger.info("="*60)
        logger.info(f"Total institutions extracted: {len(self.institutions)}")
        logger.info(f"Total errors: {len(self.errors)}")

        if self.institutions:
            # Count by province
            provinces = {}
            cities = {}
            with_coords = 0
            with_services = 0

            for inst in self.institutions:
                # Province/city may be stored as None, so fall back to 'Unknown' explicitly
                prov = inst.get('province') or 'Unknown'
                city = inst.get('city') or 'Unknown'
                provinces[prov] = provinces.get(prov, 0) + 1
                cities[city] = cities.get(city, 0) + 1
                if inst.get('latitude') is not None and inst.get('longitude') is not None:
                    with_coords += 1
                if inst.get('services'):
                    with_services += 1

            logger.info(f"\nProvinces covered: {len(provinces)}")
            logger.info(f"Cities covered: {len(cities)}")
            logger.info(f"Institutions with coordinates: {with_coords}")
            logger.info(f"Institutions with services data: {with_services}")

            # Top 10 provinces
            logger.info("\nTop 10 provinces by institution count:")
            sorted_provinces = sorted(provinces.items(), key=lambda x: x[1], reverse=True)
            for prov, count in sorted_provinces[:10]:
                logger.info(f"  {prov}: {count}")

        logger.info("="*60 + "\n")


def main():
    """Main execution function."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Scrape Argentina's CONABIP popular library database"
    )
    parser.add_argument(
        '--max-pages',
        type=int,
        default=None,
        help='Maximum number of pages to scrape (default: all pages)'
    )
    parser.add_argument(
        '--scrape-profiles',
        action='store_true',
        help='Scrape individual profile pages for additional data (services, coordinates)'
    )
    parser.add_argument(
        '--rate-limit',
        type=float,
        default=2.0,
        help='Delay in seconds between requests (default: 2.0)'
    )
    parser.add_argument(
        '--output-csv',
        type=str,
        default='conabip_libraries.csv',
        help='Output CSV filename (default: conabip_libraries.csv)'
    )
    parser.add_argument(
        '--output-json',
        type=str,
        default='conabip_libraries.json',
        help='Output JSON filename (default: conabip_libraries.json)'
    )
    args = parser.parse_args()

    # Create scraper
    scraper = CONABIPScraper(rate_limit_delay=args.rate_limit)

    # Scrape data
    scraper.scrape_all(
        max_pages=args.max_pages,
        scrape_profiles=args.scrape_profiles
    )

    # Print summary
    scraper.print_summary()

    # Export
    scraper.export_to_csv(args.output_csv)
    scraper.export_to_json(args.output_json)

    logger.info("✅ CONABIP scraping complete!")


if __name__ == "__main__":
    main()