#!/usr/bin/env python3
"""
CONABIP (Argentina) Popular Libraries Web Scraper

Extracts data from Argentina's National Commission on Popular Libraries
(CONABIP). Scrapes the searchable database at
https://www.conabip.gob.ar/buscador_bp

Data Fields Extracted:
- Registration number (REG: XXXXX)
- Library name
- Province
- City/Locality
- Neighborhood (Barrio)
- Street address
- Geographic coordinates (from Google Maps links)
- Detail page URL
- Services offered (from profile pages)

Author: GLAM Data Extraction Project
Date: 2025-11-17
License: MIT
"""

import requests
from bs4 import BeautifulSoup
import time
import csv
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import logging
from urllib.parse import urljoin, urlparse, parse_qs

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Output directory
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "AR"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


class CONABIPScraper:
    """
    Scrapes Argentina's CONABIP popular library database.

    Respects server resources with rate limiting and error handling.
    """

    def __init__(self, rate_limit_delay: float = 2.0):
        """
        Initialize the scraper.

        Args:
            rate_limit_delay: Seconds to wait between requests (default: 2.0)
        """
        self.base_url = "https://www.conabip.gob.ar"
        self.search_url = f"{self.base_url}/buscador_bp"
        self.rate_limit_delay = rate_limit_delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Extraction Bot; Academic Research; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
        })
        # Accumulated results and per-request error records
        self.institutions: List[Dict] = []
        self.errors: List[Dict] = []

    def _parse_registration_number(self, text: str) -> Optional[int]:
        """
        Extract registration number from text like "REG: 18" or "(REG: 18)".

        Args:
            text: Text containing registration number

        Returns:
            Registration number as integer, or None if not found
        """
        match = re.search(r'REG:\s*(\d+)', text, re.IGNORECASE)
        if match:
            return int(match.group(1))
        return None

    def _parse_coordinates(self, maps_url: str) -> Tuple[Optional[float], Optional[float]]:
        """
        Extract latitude/longitude from Google Maps URL.

        Example URL: http://www.google.com/maps/place/-34.598461,-58.494690

        Args:
            maps_url: Google Maps place URL

        Returns:
            Tuple of (latitude, longitude) or (None, None) if parsing fails
        """
        try:
            # Pattern: /maps/place/LAT,LON
            match = re.search(r'/maps/place/([-\d.]+),([-\d.]+)', maps_url)
            if match:
                lat = float(match.group(1))
                lon = float(match.group(2))
                return (lat, lon)
        except (ValueError, AttributeError) as e:
            logger.warning(f"Failed to parse coordinates from {maps_url}: {e}")
        return (None, None)

    def _extract_services_from_images(self, soup: BeautifulSoup) -> List[str]:
        """
        Extract services offered from icon images on profile page.

        Args:
            soup: BeautifulSoup of profile page

        Returns:
            List of service names
        """
        services = []
        # Services are indicated by image icons with alt/title attributes
        service_images = soup.select('.bipopServices img')
        for img in service_images:
            service_name = img.get('title') or img.get('alt')
            if service_name:
                services.append(service_name.strip())
        return services

    def _scrape_profile_page(self, profile_url: str) -> Dict:
        """
        Scrape additional details from institution profile page.

        Args:
            profile_url: URL of profile page (e.g., /bipop/1342)

        Returns:
            Dictionary with additional fields (services, coordinates, etc.)
        """
        full_url = urljoin(self.base_url, profile_url)
        try:
            time.sleep(self.rate_limit_delay)  # Rate limiting
            response = self.session.get(full_url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')
            profile_data = {}

            # Extract services from icons
            services = self._extract_services_from_images(soup)
            if services:
                profile_data['services'] = services

            # Extract Google Maps coordinates
            maps_link = soup.select_one('.mapdata a[href*="google.com/maps"]')
            if maps_link:
                maps_url = maps_link['href']
                lat, lon = self._parse_coordinates(maps_url)
                # Explicit None checks: a legitimate coordinate of 0.0 is
                # falsy and would otherwise be dropped.
                if lat is not None and lon is not None:
                    profile_data['latitude'] = lat
                    profile_data['longitude'] = lon
                    profile_data['maps_url'] = maps_url

            return profile_data

        except requests.RequestException as e:
            logger.error(f"Failed to scrape profile {full_url}: {e}")
            self.errors.append({
                'url': full_url,
                'error': str(e),
                'timestamp': datetime.now(timezone.utc).isoformat()
            })
            return {}

    def scrape_page(self, page_num: int = 0) -> List[Dict]:
        """
        Scrape a single page of search results.

        Args:
            page_num: Page number (0-indexed for query parameter)

        Returns:
            List of institution dictionaries
        """
        params = {
            'province': 'All',
            'city': '',
            'field_nombre_de_la_biblioteca_value': '',
            'field_n_mero_de_registro_value': '',
        }
        if page_num > 0:
            params['page'] = page_num

        logger.info(f"Scraping page {page_num + 1}...")

        try:
            time.sleep(self.rate_limit_delay)  # Rate limiting
            response = self.session.get(self.search_url, params=params, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find results table
            table = soup.select_one('table.views-table')
            if not table:
                logger.warning(f"No results table found on page {page_num + 1}")
                return []

            # Extract rows
            rows = table.select('tbody tr')
            if not rows:
                logger.info(f"No results on page {page_num + 1} (end of pagination)")
                return []

            page_institutions = []
            for row in rows:
                try:
                    cells = row.find_all('td')
                    if len(cells) < 6:
                        logger.warning(f"Row has fewer than 6 cells: {len(cells)}")
                        continue

                    # Column 0: Name and registration number
                    name_cell = cells[0]
                    name_strong = name_cell.find('b')
                    if name_strong:
                        full_name_text = name_strong.get_text(strip=True)
                        # Extract registration number from full text
                        reg_text = name_cell.get_text(strip=True)
                        reg_number = self._parse_registration_number(reg_text)
                        name = full_name_text
                    else:
                        name = name_cell.get_text(strip=True)
                        reg_number = None

                    # Column 1: Province
                    province = cells[1].get_text(strip=True)

                    # Column 2: City/Locality
                    city = cells[2].get_text(strip=True)

                    # Column 3: Neighborhood
                    neighborhood = cells[3].get_text(strip=True)

                    # Column 4: Street address (may contain a <br> with a
                    # duplicate neighborhood)
                    address_cell = cells[4]
                    # Get first line, i.e. text before any <br> tag
                    address_parts = list(address_cell.stripped_strings)
                    street_address = address_parts[0] if address_parts else ''

                    # Column 5: Profile link
                    profile_link = cells[5].select_one('a[href^="/bipop/"]')
                    profile_url = profile_link['href'] if profile_link else None

                    institution = {
                        'conabip_reg': str(reg_number) if reg_number is not None else None,
                        'name': name,
                        'province': province if province else None,
                        'city': city if city else None,
                        'neighborhood': neighborhood if neighborhood else None,
                        'street_address': street_address if street_address else None,
                        'profile_url': urljoin(self.base_url, profile_url) if profile_url else None,
                        'extraction_date': datetime.now(timezone.utc).isoformat(),
                        'data_source': 'CONABIP',
                        'country': 'AR'
                    }

                    page_institutions.append(institution)

                except Exception as e:
                    logger.error(f"Error parsing row: {e}")
                    self.errors.append({
                        'page': page_num,
                        'error': str(e),
                        'timestamp': datetime.now(timezone.utc).isoformat()
                    })

            logger.info(f"Extracted {len(page_institutions)} institutions from page {page_num + 1}")
            return page_institutions

        except requests.RequestException as e:
            logger.error(f"Failed to scrape page {page_num + 1}: {e}")
            self.errors.append({
                'page': page_num,
                'error': str(e),
                'timestamp': datetime.now(timezone.utc).isoformat()
            })
            return []

    def get_total_pages(self) -> int:
        """
        Determine the total number of pages in the search results.

        Returns:
            Total number of pages
        """
        try:
            # Rate-limit this request too, consistent with every other
            # request path in this class.
            time.sleep(self.rate_limit_delay)
            response = self.session.get(self.search_url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find pagination
            pagination = soup.select('.pagination li a')
            max_page = 0
            for link in pagination:
                href = link.get('href', '')
                # Extract page parameter from URL
                match = re.search(r'page=(\d+)', href)
                if match:
                    page_num = int(match.group(1))
                    max_page = max(max_page, page_num)

            # Page numbers are 0-indexed, so total pages = max_page + 1
            total_pages = max_page + 1 if max_page > 0 else 1
            logger.info(f"Found {total_pages} total pages")
            return total_pages

        except requests.RequestException as e:
            logger.error(f"Failed to determine total pages: {e}")
            return 1  # Default to 1 page if detection fails

    def scrape_all(self, max_pages: Optional[int] = None,
                   scrape_profiles: bool = False) -> List[Dict]:
        """
        Scrape all pages of the CONABIP database.

        Args:
            max_pages: Maximum number of pages to scrape (None = all pages)
            scrape_profiles: Whether to scrape individual profile pages for
                additional data

        Returns:
            List of all institution dictionaries
        """
        logger.info("Starting CONABIP scrape...")
        logger.info(f"Rate limit delay: {self.rate_limit_delay} seconds")
        logger.info(f"Scrape profiles: {scrape_profiles}")

        # Determine total pages
        total_pages = self.get_total_pages()
        if max_pages:
            total_pages = min(total_pages, max_pages)
            logger.info(f"Limiting scrape to {total_pages} pages")

        # Scrape each page
        for page_num in range(total_pages):
            page_institutions = self.scrape_page(page_num)
            self.institutions.extend(page_institutions)

        logger.info(f"Scraped {len(self.institutions)} institutions from {total_pages} pages")

        # Optionally scrape profile pages
        if scrape_profiles:
            logger.info("Scraping individual profile pages...")
            for idx, institution in enumerate(self.institutions, 1):
                profile_url = institution.get('profile_url')
                if profile_url:
                    logger.info(f"[{idx}/{len(self.institutions)}] Scraping profile: {institution['name']}")
                    # Extract relative URL path
                    parsed = urlparse(profile_url)
                    profile_path = parsed.path
                    profile_data = self._scrape_profile_page(profile_path)
                    institution.update(profile_data)

        logger.info(f"Scraping complete! Total institutions: {len(self.institutions)}")
        logger.info(f"Total errors: {len(self.errors)}")

        return self.institutions

    def export_to_csv(self, filename: str = "conabip_libraries.csv"):
        """
        Export scraped data to CSV.

        Args:
            filename: Output CSV filename
        """
        output_path = OUTPUT_DIR / filename

        if not self.institutions:
            logger.warning("No institutions to export")
            return

        # Determine all fields (union of all keys)
        all_fields = set()
        for inst in self.institutions:
            all_fields.update(inst.keys())

        # Sort fields for consistent output
        fieldnames = sorted(all_fields)

        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.institutions)

        logger.info(f"Exported {len(self.institutions)} institutions to {output_path}")

    def export_to_json(self, filename: str = "conabip_libraries.json"):
        """
        Export scraped data to JSON.

        Args:
            filename: Output JSON filename
        """
        output_path = OUTPUT_DIR / filename

        output_data = {
            'metadata': {
                'source': 'CONABIP - Comisión Nacional de Bibliotecas Populares',
                'source_url': self.search_url,
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'total_institutions': len(self.institutions),
                'total_errors': len(self.errors),
                'data_type': 'Popular libraries (bibliotecas populares)',
                'country': 'AR',
                'extractor': 'scrape_conabip_argentina.py'
            },
            'institutions': self.institutions,
            'errors': self.errors
        }

        with open(output_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(output_data, jsonfile, indent=2, ensure_ascii=False)

        logger.info(f"Exported {len(self.institutions)} institutions to {output_path}")

    def print_summary(self):
        """Print summary statistics of the scrape."""
        logger.info("\n" + "="*60)
        logger.info("CONABIP SCRAPE SUMMARY")
        logger.info("="*60)
        logger.info(f"Total institutions extracted: {len(self.institutions)}")
        logger.info(f"Total errors: {len(self.errors)}")

        if self.institutions:
            # Count by province
            provinces = {}
            cities = {}
            with_coords = 0
            with_services = 0

            for inst in self.institutions:
                prov = inst.get('province', 'Unknown')
                city = inst.get('city', 'Unknown')
                provinces[prov] = provinces.get(prov, 0) + 1
                cities[city] = cities.get(city, 0) + 1
                # Explicit None checks so a 0.0 coordinate still counts
                if inst.get('latitude') is not None and inst.get('longitude') is not None:
                    with_coords += 1
                if inst.get('services'):
                    with_services += 1

            logger.info(f"\nProvinces covered: {len(provinces)}")
            logger.info(f"Cities covered: {len(cities)}")
            logger.info(f"Institutions with coordinates: {with_coords}")
            logger.info(f"Institutions with services data: {with_services}")

            # Top 10 provinces
            logger.info("\nTop 10 provinces by institution count:")
            sorted_provinces = sorted(provinces.items(), key=lambda x: x[1], reverse=True)
            for prov, count in sorted_provinces[:10]:
                logger.info(f"  {prov}: {count}")

        logger.info("="*60 + "\n")


def main():
    """Main execution function."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Scrape Argentina's CONABIP popular library database"
    )
    parser.add_argument(
        '--max-pages',
        type=int,
        default=None,
        help='Maximum number of pages to scrape (default: all pages)'
    )
    parser.add_argument(
        '--scrape-profiles',
        action='store_true',
        help='Scrape individual profile pages for additional data (services, coordinates)'
    )
    parser.add_argument(
        '--rate-limit',
        type=float,
        default=2.0,
        help='Delay in seconds between requests (default: 2.0)'
    )
    parser.add_argument(
        '--output-csv',
        type=str,
        default='conabip_libraries.csv',
        help='Output CSV filename (default: conabip_libraries.csv)'
    )
    parser.add_argument(
        '--output-json',
        type=str,
        default='conabip_libraries.json',
        help='Output JSON filename (default: conabip_libraries.json)'
    )

    args = parser.parse_args()

    # Create scraper
    scraper = CONABIPScraper(rate_limit_delay=args.rate_limit)

    # Scrape data
    scraper.scrape_all(
        max_pages=args.max_pages,
        scrape_profiles=args.scrape_profiles
    )

    # Print summary
    scraper.print_summary()

    # Export
    scraper.export_to_csv(args.output_csv)
    scraper.export_to_json(args.output_json)

    logger.info("✅ CONABIP scraping complete!")


if __name__ == "__main__":
    main()