#!/usr/bin/env python3
"""
CONABIP (Argentina) Popular Libraries Web Scraper
Extracts data from Argentina's National Commission on Popular Libraries (CONABIP).
Scrapes the searchable database at https://www.conabip.gob.ar/buscador_bp
Data Fields Extracted:
- Registration number (REG: XXXXX)
- Library name
- Province
- City/Locality
- Neighborhood (Barrio)
- Street address
- Geographic coordinates (from Google Maps links)
- Detail page URL
- Services offered (from profile pages)
Author: GLAM Data Extraction Project
Date: 2025-11-17
License: MIT
"""
import requests
from bs4 import BeautifulSoup
import time
import csv
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import logging
from urllib.parse import urljoin, urlparse, parse_qs
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Output directory
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "AR"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


class CONABIPScraper:
    """
    Scrapes Argentina's CONABIP popular library database.

    Respects server resources with rate limiting and error handling.
    """

    def __init__(self, rate_limit_delay: float = 2.0):
        """
        Initialize the scraper.

        Args:
            rate_limit_delay: Seconds to wait between requests (default: 2.0)
        """
        self.base_url = "https://www.conabip.gob.ar"
        self.search_url = f"{self.base_url}/buscador_bp"
        self.rate_limit_delay = rate_limit_delay

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Extraction Bot; Academic Research; +https://github.com/kempersc/glam)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
        })

        self.institutions = []
        self.errors = []

    def _parse_registration_number(self, text: str) -> Optional[int]:
        """
        Extract registration number from text like "REG: 18" or "(REG: 18)".

        Args:
            text: Text containing registration number

        Returns:
            Registration number as integer, or None if not found
        """
        match = re.search(r'REG:\s*(\d+)', text, re.IGNORECASE)
        if match:
            return int(match.group(1))
        return None
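
    # Illustrative behaviour of the helper above (the library name is made up;
    # "REG: 18" follows the format shown in the docstring):
    #   "Biblioteca Popular Sarmiento (REG: 18)"   -> 18
    #   "Text without a registration token"        -> None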

    def _parse_coordinates(self, maps_url: str) -> Tuple[Optional[float], Optional[float]]:
        """
        Extract latitude/longitude from Google Maps URL.

        Example URL: http://www.google.com/maps/place/-34.598461,-58.494690

        Args:
            maps_url: Google Maps place URL

        Returns:
            Tuple of (latitude, longitude) or (None, None) if parsing fails
        """
        try:
            # Pattern: /maps/place/LAT,LON
            match = re.search(r'/maps/place/([-\d.]+),([-\d.]+)', maps_url)
            if match:
                lat = float(match.group(1))
                lon = float(match.group(2))
                return (lat, lon)
        except (ValueError, AttributeError) as e:
            logger.warning(f"Failed to parse coordinates from {maps_url}: {e}")
        return (None, None)
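
    # Illustrative round trip, using the example URL from the docstring above:
    #   _parse_coordinates("http://www.google.com/maps/place/-34.598461,-58.494690")
    #   -> (-34.598461, -58.49469)
    # Any URL the regex cannot match simply yields (None, None).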

    def _extract_services_from_images(self, soup: BeautifulSoup) -> List[str]:
        """
        Extract services offered from icon images on profile page.

        Args:
            soup: BeautifulSoup of profile page

        Returns:
            List of service names
        """
        services = []
        # Services are indicated by image icons with alt/title attributes
        service_images = soup.select('.bipopServices img')
        for img in service_images:
            service_name = img.get('title') or img.get('alt')
            if service_name:
                services.append(service_name.strip())
        return services

    def _scrape_profile_page(self, profile_url: str) -> Dict:
        """
        Scrape additional details from institution profile page.

        Args:
            profile_url: URL of profile page (e.g., /bipop/1342)

        Returns:
            Dictionary with additional fields (services, coordinates, etc.)
        """
        full_url = urljoin(self.base_url, profile_url)
        try:
            time.sleep(self.rate_limit_delay)  # Rate limiting
            response = self.session.get(full_url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            profile_data = {}

            # Extract services from icons
            services = self._extract_services_from_images(soup)
            if services:
                profile_data['services'] = services

            # Extract Google Maps coordinates
            maps_link = soup.select_one('.mapdata a[href*="google.com/maps"]')
            if maps_link:
                maps_url = maps_link['href']
                lat, lon = self._parse_coordinates(maps_url)
                # Explicit None checks so a legitimate 0.0 coordinate would not be dropped
                if lat is not None and lon is not None:
                    profile_data['latitude'] = lat
                    profile_data['longitude'] = lon
                    profile_data['maps_url'] = maps_url

            return profile_data
        except requests.RequestException as e:
            logger.error(f"Failed to scrape profile {full_url}: {e}")
            self.errors.append({
                'url': full_url,
                'error': str(e),
                'timestamp': datetime.now(timezone.utc).isoformat()
            })
            return {}

    def scrape_page(self, page_num: int = 0) -> List[Dict]:
        """
        Scrape a single page of search results.

        Args:
            page_num: Page number (0-indexed for query parameter)

        Returns:
            List of institution dictionaries
        """
        params = {
            'province': 'All',
            'city': '',
            'field_nombre_de_la_biblioteca_value': '',
            'field_n_mero_de_registro_value': '',
        }
        if page_num > 0:
            params['page'] = page_num

        logger.info(f"Scraping page {page_num + 1}...")

        try:
            time.sleep(self.rate_limit_delay)  # Rate limiting
            response = self.session.get(self.search_url, params=params, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find results table
            table = soup.select_one('table.views-table')
            if not table:
                logger.warning(f"No results table found on page {page_num + 1}")
                return []

            # Extract rows
            rows = table.select('tbody tr')
            if not rows:
                logger.info(f"No results on page {page_num + 1} (end of pagination)")
                return []

            page_institutions = []
            for row in rows:
                try:
                    cells = row.find_all('td')
                    if len(cells) < 6:
                        logger.warning(f"Row has fewer than 6 cells: {len(cells)}")
                        continue

                    # Column 0: Name and registration number
                    name_cell = cells[0]
                    name_strong = name_cell.find('b')
                    if name_strong:
                        full_name_text = name_strong.get_text(strip=True)
                        # Extract registration number from full text
                        reg_text = name_cell.get_text(strip=True)
                        reg_number = self._parse_registration_number(reg_text)
                        name = full_name_text
                    else:
                        name = name_cell.get_text(strip=True)
                        reg_number = None

                    # Column 1: Province
                    province = cells[1].get_text(strip=True)

                    # Column 2: City/Locality
                    city = cells[2].get_text(strip=True)

                    # Column 3: Neighborhood
                    neighborhood = cells[3].get_text(strip=True)

                    # Column 4: Street address (may contain <br> with duplicate neighborhood)
                    address_cell = cells[4]
                    # Get first line before <br> tag
                    address_parts = list(address_cell.stripped_strings)
                    street_address = address_parts[0] if address_parts else ''

                    # Column 5: Profile link
                    profile_link = cells[5].select_one('a[href^="/bipop/"]')
                    profile_url = profile_link['href'] if profile_link else None

                    institution = {
                        'conabip_reg': str(reg_number) if reg_number is not None else None,
                        'name': name,
                        'province': province if province else None,
                        'city': city if city else None,
                        'neighborhood': neighborhood if neighborhood else None,
                        'street_address': street_address if street_address else None,
                        'profile_url': urljoin(self.base_url, profile_url) if profile_url else None,
                        'extraction_date': datetime.now(timezone.utc).isoformat(),
                        'data_source': 'CONABIP',
                        'country': 'AR'
                    }
                    page_institutions.append(institution)
                except Exception as e:
                    logger.error(f"Error parsing row: {e}")
                    self.errors.append({
                        'page': page_num,
                        'error': str(e),
                        'timestamp': datetime.now(timezone.utc).isoformat()
                    })

            logger.info(f"Extracted {len(page_institutions)} institutions from page {page_num + 1}")
            return page_institutions
        except requests.RequestException as e:
            logger.error(f"Failed to scrape page {page_num + 1}: {e}")
            self.errors.append({
                'page': page_num,
                'error': str(e),
                'timestamp': datetime.now(timezone.utc).isoformat()
            })
            return []
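
    # Shape of one extracted record, for orientation (field values here are
    # invented; the keys match the dict built in scrape_page above):
    #   {
    #       'conabip_reg': '18', 'name': 'Biblioteca Popular ...',
    #       'province': 'Buenos Aires', 'city': '...', 'neighborhood': '...',
    #       'street_address': '...', 'profile_url': 'https://www.conabip.gob.ar/bipop/1342',
    #       'extraction_date': '<UTC ISO timestamp>', 'data_source': 'CONABIP', 'country': 'AR'
    #   }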

    def get_total_pages(self) -> int:
        """
        Determine the total number of pages in the search results.

        Returns:
            Total number of pages
        """
        try:
            response = self.session.get(self.search_url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find pagination
            pagination = soup.select('.pagination li a')
            max_page = 0
            for link in pagination:
                href = link.get('href', '')
                # Extract page parameter from URL
                match = re.search(r'page=(\d+)', href)
                if match:
                    page_num = int(match.group(1))
                    max_page = max(max_page, page_num)

            # Page numbers are 0-indexed, so total pages = max_page + 1
            total_pages = max_page + 1 if max_page > 0 else 1
            logger.info(f"Found {total_pages} total pages")
            return total_pages
        except requests.RequestException as e:
            logger.error(f"Failed to determine total pages: {e}")
            return 1  # Default to 1 page if detection fails

    def scrape_all(self, max_pages: Optional[int] = None, scrape_profiles: bool = False) -> List[Dict]:
        """
        Scrape all pages of the CONABIP database.

        Args:
            max_pages: Maximum number of pages to scrape (None = all pages)
            scrape_profiles: Whether to scrape individual profile pages for additional data

        Returns:
            List of all institution dictionaries
        """
        logger.info("Starting CONABIP scrape...")
        logger.info(f"Rate limit delay: {self.rate_limit_delay} seconds")
        logger.info(f"Scrape profiles: {scrape_profiles}")

        # Determine total pages
        total_pages = self.get_total_pages()
        if max_pages:
            total_pages = min(total_pages, max_pages)
            logger.info(f"Limiting scrape to {total_pages} pages")

        # Scrape each page
        for page_num in range(total_pages):
            page_institutions = self.scrape_page(page_num)
            self.institutions.extend(page_institutions)

        logger.info(f"Scraped {len(self.institutions)} institutions from {total_pages} pages")

        # Optionally scrape profile pages
        if scrape_profiles:
            logger.info("Scraping individual profile pages...")
            for idx, institution in enumerate(self.institutions, 1):
                profile_url = institution.get('profile_url')
                if profile_url:
                    logger.info(f"[{idx}/{len(self.institutions)}] Scraping profile: {institution['name']}")
                    # Extract relative URL path
                    parsed = urlparse(profile_url)
                    profile_path = parsed.path
                    profile_data = self._scrape_profile_page(profile_path)
                    institution.update(profile_data)

        logger.info(f"Scraping complete! Total institutions: {len(self.institutions)}")
        logger.info(f"Total errors: {len(self.errors)}")
        return self.institutions

    def export_to_csv(self, filename: str = "conabip_libraries.csv"):
        """
        Export scraped data to CSV.

        Args:
            filename: Output CSV filename
        """
        output_path = OUTPUT_DIR / filename

        if not self.institutions:
            logger.warning("No institutions to export")
            return

        # Determine all fields (union of all keys)
        all_fields = set()
        for inst in self.institutions:
            all_fields.update(inst.keys())
        # Sort fields for consistent output
        fieldnames = sorted(all_fields)

        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.institutions)

        logger.info(f"Exported {len(self.institutions)} institutions to {output_path}")

    def export_to_json(self, filename: str = "conabip_libraries.json"):
        """
        Export scraped data to JSON.

        Args:
            filename: Output JSON filename
        """
        output_path = OUTPUT_DIR / filename

        output_data = {
            'metadata': {
                'source': 'CONABIP - Comisión Nacional de Bibliotecas Populares',
                'source_url': self.search_url,
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'total_institutions': len(self.institutions),
                'total_errors': len(self.errors),
                'data_type': 'Popular libraries (bibliotecas populares)',
                'country': 'AR',
                'extractor': 'scrape_conabip_argentina.py'
            },
            'institutions': self.institutions,
            'errors': self.errors
        }

        with open(output_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(output_data, jsonfile, indent=2, ensure_ascii=False)

        logger.info(f"Exported {len(self.institutions)} institutions to {output_path}")

    def print_summary(self):
        """Print summary statistics of the scrape."""
        logger.info("\n" + "="*60)
        logger.info("CONABIP SCRAPE SUMMARY")
        logger.info("="*60)
        logger.info(f"Total institutions extracted: {len(self.institutions)}")
        logger.info(f"Total errors: {len(self.errors)}")

        if self.institutions:
            # Count by province
            provinces = {}
            cities = {}
            with_coords = 0
            with_services = 0

            for inst in self.institutions:
                # Province/city may be stored as None, so fall back to 'Unknown' explicitly
                prov = inst.get('province') or 'Unknown'
                city = inst.get('city') or 'Unknown'
                provinces[prov] = provinces.get(prov, 0) + 1
                cities[city] = cities.get(city, 0) + 1
                if inst.get('latitude') is not None and inst.get('longitude') is not None:
                    with_coords += 1
                if inst.get('services'):
                    with_services += 1

            logger.info(f"\nProvinces covered: {len(provinces)}")
            logger.info(f"Cities covered: {len(cities)}")
            logger.info(f"Institutions with coordinates: {with_coords}")
            logger.info(f"Institutions with services data: {with_services}")

            # Top 10 provinces
            logger.info("\nTop 10 provinces by institution count:")
            sorted_provinces = sorted(provinces.items(), key=lambda x: x[1], reverse=True)
            for prov, count in sorted_provinces[:10]:
                logger.info(f"  {prov}: {count}")

        logger.info("="*60 + "\n")


def main():
    """Main execution function."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Scrape Argentina's CONABIP popular library database"
    )
    parser.add_argument(
        '--max-pages',
        type=int,
        default=None,
        help='Maximum number of pages to scrape (default: all pages)'
    )
    parser.add_argument(
        '--scrape-profiles',
        action='store_true',
        help='Scrape individual profile pages for additional data (services, coordinates)'
    )
    parser.add_argument(
        '--rate-limit',
        type=float,
        default=2.0,
        help='Delay in seconds between requests (default: 2.0)'
    )
    parser.add_argument(
        '--output-csv',
        type=str,
        default='conabip_libraries.csv',
        help='Output CSV filename (default: conabip_libraries.csv)'
    )
    parser.add_argument(
        '--output-json',
        type=str,
        default='conabip_libraries.json',
        help='Output JSON filename (default: conabip_libraries.json)'
    )
    args = parser.parse_args()

    # Create scraper
    scraper = CONABIPScraper(rate_limit_delay=args.rate_limit)

    # Scrape data
    scraper.scrape_all(
        max_pages=args.max_pages,
        scrape_profiles=args.scrape_profiles
    )

    # Print summary
    scraper.print_summary()

    # Export
    scraper.export_to_csv(args.output_csv)
    scraper.export_to_json(args.output_json)

    logger.info("✅ CONABIP scraping complete!")


if __name__ == "__main__":
    main()