#!/usr/bin/env python3
"""
|
|
CONABIP (Argentina) Popular Libraries Web Scraper
|
|
|
|
Extracts data from Argentina's National Commission on Popular Libraries (CONABIP).
|
|
Scrapes the searchable database at https://www.conabip.gob.ar/buscador_bp
|
|
|
|
Data Fields Extracted:
|
|
- Registration number (REG: XXXXX)
|
|
- Library name
|
|
- Province
|
|
- City/Locality
|
|
- Neighborhood (Barrio)
|
|
- Street address
|
|
- Geographic coordinates (from Google Maps links)
|
|
- Detail page URL
|
|
- Services offered (from profile pages)
|
|
|
|
Author: GLAM Data Extraction Project
|
|
Date: 2025-11-17
|
|
License: MIT
|
|
"""

import requests
from bs4 import BeautifulSoup
import time
import csv
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import logging
from urllib.parse import urljoin, urlparse, parse_qs

# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Output directory
|
|
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data" / "isil" / "AR"
|
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


class CONABIPScraper:
|
|
"""
|
|
Scrapes Argentina's CONABIP popular library database.
|
|
|
|
Respects server resources with rate limiting and error handling.
|
|
"""
|
|
|
|
def __init__(self, rate_limit_delay: float = 2.0):
|
|
"""
|
|
Initialize the scraper.
|
|
|
|
Args:
|
|
rate_limit_delay: Seconds to wait between requests (default: 2.0)
|
|
"""
|
|
self.base_url = "https://www.conabip.gob.ar"
|
|
self.search_url = f"{self.base_url}/buscador_bp"
|
|
self.rate_limit_delay = rate_limit_delay
|
|
|
|
self.session = requests.Session()
|
|
self.session.headers.update({
|
|
'User-Agent': 'Mozilla/5.0 (GLAM Heritage Data Extraction Bot; Academic Research; +https://github.com/kempersc/glam)',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
|
|
'Accept-Encoding': 'gzip, deflate',
|
|
})
|
|
|
|
self.institutions = []
|
|
self.errors = []
|
|
|
|
def _parse_registration_number(self, text: str) -> Optional[int]:
|
|
"""
|
|
Extract registration number from text like "REG: 18" or "(REG: 18)".
|
|
|
|
Args:
|
|
text: Text containing registration number
|
|
|
|
Returns:
|
|
Registration number as integer, or None if not found
|
|
"""
|
|
match = re.search(r'REG:\s*(\d+)', text, re.IGNORECASE)
|
|
if match:
|
|
return int(match.group(1))
|
|
return None
|
|
|
|
def _parse_coordinates(self, maps_url: str) -> Tuple[Optional[float], Optional[float]]:
|
|
"""
|
|
Extract latitude/longitude from Google Maps URL.
|
|
|
|
Example URL: http://www.google.com/maps/place/-34.598461,-58.494690
|
|
|
|
Args:
|
|
maps_url: Google Maps place URL
|
|
|
|
Returns:
|
|
Tuple of (latitude, longitude) or (None, None) if parsing fails
|
|
"""
|
|
try:
|
|
# Pattern: /maps/place/LAT,LON
|
|
match = re.search(r'/maps/place/([-\d.]+),([-\d.]+)', maps_url)
|
|
if match:
|
|
lat = float(match.group(1))
|
|
lon = float(match.group(2))
|
|
return (lat, lon)
|
|
except (ValueError, AttributeError) as e:
|
|
logger.warning(f"Failed to parse coordinates from {maps_url}: {e}")
|
|
|
|
return (None, None)
|
|
|
|
def _extract_services_from_images(self, soup: BeautifulSoup) -> List[str]:
|
|
"""
|
|
Extract services offered from icon images on profile page.
|
|
|
|
Args:
|
|
soup: BeautifulSoup of profile page
|
|
|
|
Returns:
|
|
List of service names
|
|
"""
|
|
services = []
|
|
|
|
# Services are indicated by image icons with alt/title attributes
|
|
service_images = soup.select('.bipopServices img')
|
|
|
|
for img in service_images:
|
|
service_name = img.get('title') or img.get('alt')
|
|
if service_name:
|
|
services.append(service_name.strip())
|
|
|
|
return services
|
|
|
|
def _scrape_profile_page(self, profile_url: str) -> Dict:
|
|
"""
|
|
Scrape additional details from institution profile page.
|
|
|
|
Args:
|
|
profile_url: URL of profile page (e.g., /bipop/1342)
|
|
|
|
Returns:
|
|
Dictionary with additional fields (services, coordinates, etc.)
|
|
"""
|
|
full_url = urljoin(self.base_url, profile_url)
|
|
|
|
try:
|
|
time.sleep(self.rate_limit_delay) # Rate limiting
|
|
|
|
response = self.session.get(full_url, timeout=30)
|
|
response.raise_for_status()
|
|
response.encoding = 'utf-8'
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
profile_data = {}
|
|
|
|
# Extract services from icons
|
|
services = self._extract_services_from_images(soup)
|
|
if services:
|
|
profile_data['services'] = services
|
|
|
|
# Extract Google Maps coordinates
|
|
maps_link = soup.select_one('.mapdata a[href*="google.com/maps"]')
|
|
if maps_link:
|
|
maps_url = maps_link['href']
|
|
lat, lon = self._parse_coordinates(maps_url)
|
|
if lat and lon:
|
|
profile_data['latitude'] = lat
|
|
profile_data['longitude'] = lon
|
|
profile_data['maps_url'] = maps_url
|
|
|
|
return profile_data
|
|
|
|
except requests.RequestException as e:
|
|
logger.error(f"Failed to scrape profile {full_url}: {e}")
|
|
self.errors.append({
|
|
'url': full_url,
|
|
'error': str(e),
|
|
'timestamp': datetime.now(timezone.utc).isoformat()
|
|
})
|
|
return {}
|
|
|
|
def scrape_page(self, page_num: int = 0) -> List[Dict]:
|
|
"""
|
|
Scrape a single page of search results.
|
|
|
|
Args:
|
|
page_num: Page number (0-indexed for query parameter)
|
|
|
|
Returns:
|
|
List of institution dictionaries
|
|
"""
|
|
params = {
|
|
'province': 'All',
|
|
'city': '',
|
|
'field_nombre_de_la_biblioteca_value': '',
|
|
'field_n_mero_de_registro_value': '',
|
|
}
|
|
|
|
if page_num > 0:
|
|
params['page'] = page_num
|
|
|
|
logger.info(f"Scraping page {page_num + 1}...")
|
|
|
|
try:
|
|
time.sleep(self.rate_limit_delay) # Rate limiting
|
|
|
|
response = self.session.get(self.search_url, params=params, timeout=30)
|
|
response.raise_for_status()
|
|
response.encoding = 'utf-8'
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
# Find results table
|
|
table = soup.select_one('table.views-table')
|
|
if not table:
|
|
logger.warning(f"No results table found on page {page_num + 1}")
|
|
return []
|
|
|
|
# Extract rows
|
|
rows = table.select('tbody tr')
|
|
|
|
if not rows:
|
|
logger.info(f"No results on page {page_num + 1} (end of pagination)")
|
|
return []
|
|
|
|
page_institutions = []
|
|
|
|
for row in rows:
|
|
try:
|
|
cells = row.find_all('td')
|
|
if len(cells) < 6:
|
|
logger.warning(f"Row has fewer than 6 cells: {len(cells)}")
|
|
continue
|
|
|
|
# Column 0: Name and registration number
|
|
name_cell = cells[0]
|
|
name_strong = name_cell.find('b')
|
|
if name_strong:
|
|
full_name_text = name_strong.get_text(strip=True)
|
|
# Extract registration number from full text
|
|
reg_text = name_cell.get_text(strip=True)
|
|
reg_number = self._parse_registration_number(reg_text)
|
|
name = full_name_text
|
|
else:
|
|
name = name_cell.get_text(strip=True)
|
|
reg_number = None
|
|
|
|
# Column 1: Province
|
|
province = cells[1].get_text(strip=True)
|
|
|
|
# Column 2: City/Locality
|
|
city = cells[2].get_text(strip=True)
|
|
|
|
# Column 3: Neighborhood
|
|
neighborhood = cells[3].get_text(strip=True)
|
|
|
|
# Column 4: Street address (may contain <br> with duplicate neighborhood)
|
|
address_cell = cells[4]
|
|
# Get first line before <br> tag
|
|
address_parts = list(address_cell.stripped_strings)
|
|
street_address = address_parts[0] if address_parts else ''
|
|
|
|
# Column 5: Profile link
|
|
profile_link = cells[5].select_one('a[href^="/bipop/"]')
|
|
profile_url = profile_link['href'] if profile_link else None
|
|
|
|
institution = {
|
|
'conabip_reg': str(reg_number) if reg_number is not None else None,
|
|
'name': name,
|
|
'province': province if province else None,
|
|
'city': city if city else None,
|
|
'neighborhood': neighborhood if neighborhood else None,
|
|
'street_address': street_address if street_address else None,
|
|
'profile_url': urljoin(self.base_url, profile_url) if profile_url else None,
|
|
'extraction_date': datetime.now(timezone.utc).isoformat(),
|
|
'data_source': 'CONABIP',
|
|
'country': 'AR'
|
|
}
|
|
|
|
page_institutions.append(institution)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error parsing row: {e}")
|
|
self.errors.append({
|
|
'page': page_num,
|
|
'error': str(e),
|
|
'timestamp': datetime.now(timezone.utc).isoformat()
|
|
})
|
|
|
|
logger.info(f"Extracted {len(page_institutions)} institutions from page {page_num + 1}")
|
|
return page_institutions
|
|
|
|
except requests.RequestException as e:
|
|
logger.error(f"Failed to scrape page {page_num + 1}: {e}")
|
|
self.errors.append({
|
|
'page': page_num,
|
|
'error': str(e),
|
|
'timestamp': datetime.now(timezone.utc).isoformat()
|
|
})
|
|
return []
|
|
|
|
def get_total_pages(self) -> int:
|
|
"""
|
|
Determine the total number of pages in the search results.
|
|
|
|
Returns:
|
|
Total number of pages
|
|
"""
|
|
try:
|
|
response = self.session.get(self.search_url, timeout=30)
|
|
response.raise_for_status()
|
|
response.encoding = 'utf-8'
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
# Find pagination
|
|
pagination = soup.select('.pagination li a')
|
|
|
|
max_page = 0
|
|
for link in pagination:
|
|
href = link.get('href', '')
|
|
# Extract page parameter from URL
|
|
match = re.search(r'page=(\d+)', href)
|
|
if match:
|
|
page_num = int(match.group(1))
|
|
max_page = max(max_page, page_num)
|
|
|
|
# Page numbers are 0-indexed, so total pages = max_page + 1
|
|
total_pages = max_page + 1 if max_page > 0 else 1
|
|
|
|
logger.info(f"Found {total_pages} total pages")
|
|
return total_pages
|
|
|
|
except requests.RequestException as e:
|
|
logger.error(f"Failed to determine total pages: {e}")
|
|
return 1 # Default to 1 page if detection fails
|
|
|
|
def scrape_all(self, max_pages: Optional[int] = None, scrape_profiles: bool = False) -> List[Dict]:
|
|
"""
|
|
Scrape all pages of the CONABIP database.
|
|
|
|
Args:
|
|
max_pages: Maximum number of pages to scrape (None = all pages)
|
|
scrape_profiles: Whether to scrape individual profile pages for additional data
|
|
|
|
Returns:
|
|
List of all institution dictionaries
|
|
"""
|
|
logger.info("Starting CONABIP scrape...")
|
|
logger.info(f"Rate limit delay: {self.rate_limit_delay} seconds")
|
|
logger.info(f"Scrape profiles: {scrape_profiles}")
|
|
|
|
# Determine total pages
|
|
total_pages = self.get_total_pages()
|
|
|
|
if max_pages:
|
|
total_pages = min(total_pages, max_pages)
|
|
logger.info(f"Limiting scrape to {total_pages} pages")
|
|
|
|
# Scrape each page
|
|
for page_num in range(total_pages):
|
|
page_institutions = self.scrape_page(page_num)
|
|
self.institutions.extend(page_institutions)
|
|
|
|
logger.info(f"Scraped {len(self.institutions)} institutions from {total_pages} pages")
|
|
|
|
# Optionally scrape profile pages
|
|
if scrape_profiles:
|
|
logger.info("Scraping individual profile pages...")
|
|
|
|
for idx, institution in enumerate(self.institutions, 1):
|
|
profile_url = institution.get('profile_url')
|
|
|
|
if profile_url:
|
|
logger.info(f"[{idx}/{len(self.institutions)}] Scraping profile: {institution['name']}")
|
|
|
|
# Extract relative URL path
|
|
parsed = urlparse(profile_url)
|
|
profile_path = parsed.path
|
|
|
|
profile_data = self._scrape_profile_page(profile_path)
|
|
institution.update(profile_data)
|
|
|
|
logger.info(f"Scraping complete! Total institutions: {len(self.institutions)}")
|
|
logger.info(f"Total errors: {len(self.errors)}")
|
|
|
|
return self.institutions
|
|
|
|
def export_to_csv(self, filename: str = "conabip_libraries.csv"):
|
|
"""
|
|
Export scraped data to CSV.
|
|
|
|
Args:
|
|
filename: Output CSV filename
|
|
"""
|
|
output_path = OUTPUT_DIR / filename
|
|
|
|
if not self.institutions:
|
|
logger.warning("No institutions to export")
|
|
return
|
|
|
|
# Determine all fields (union of all keys)
|
|
all_fields = set()
|
|
for inst in self.institutions:
|
|
all_fields.update(inst.keys())
|
|
|
|
# Sort fields for consistent output
|
|
fieldnames = sorted(all_fields)
|
|
|
|
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
|
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
|
writer.writeheader()
|
|
writer.writerows(self.institutions)
|
|
|
|
logger.info(f"Exported {len(self.institutions)} institutions to {output_path}")
|
|
|
|
def export_to_json(self, filename: str = "conabip_libraries.json"):
|
|
"""
|
|
Export scraped data to JSON.
|
|
|
|
Args:
|
|
filename: Output JSON filename
|
|
"""
|
|
output_path = OUTPUT_DIR / filename
|
|
|
|
output_data = {
|
|
'metadata': {
|
|
'source': 'CONABIP - Comisión Nacional de Bibliotecas Populares',
|
|
'source_url': self.search_url,
|
|
'extraction_date': datetime.now(timezone.utc).isoformat(),
|
|
'total_institutions': len(self.institutions),
|
|
'total_errors': len(self.errors),
|
|
'data_type': 'Popular libraries (bibliotecas populares)',
|
|
'country': 'AR',
|
|
'extractor': 'scrape_conabip_argentina.py'
|
|
},
|
|
'institutions': self.institutions,
|
|
'errors': self.errors
|
|
}
|
|
|
|
with open(output_path, 'w', encoding='utf-8') as jsonfile:
|
|
json.dump(output_data, jsonfile, indent=2, ensure_ascii=False)
|
|
|
|
logger.info(f"Exported {len(self.institutions)} institutions to {output_path}")
|
|
|
|
def print_summary(self):
|
|
"""Print summary statistics of the scrape."""
|
|
logger.info("\n" + "="*60)
|
|
logger.info("CONABIP SCRAPE SUMMARY")
|
|
logger.info("="*60)
|
|
logger.info(f"Total institutions extracted: {len(self.institutions)}")
|
|
logger.info(f"Total errors: {len(self.errors)}")
|
|
|
|
if self.institutions:
|
|
# Count by province
|
|
provinces = {}
|
|
cities = {}
|
|
with_coords = 0
|
|
with_services = 0
|
|
|
|
for inst in self.institutions:
|
|
prov = inst.get('province', 'Unknown')
|
|
city = inst.get('city', 'Unknown')
|
|
|
|
provinces[prov] = provinces.get(prov, 0) + 1
|
|
cities[city] = cities.get(city, 0) + 1
|
|
|
|
if inst.get('latitude') and inst.get('longitude'):
|
|
with_coords += 1
|
|
|
|
if inst.get('services'):
|
|
with_services += 1
|
|
|
|
logger.info(f"\nProvinces covered: {len(provinces)}")
|
|
logger.info(f"Cities covered: {len(cities)}")
|
|
logger.info(f"Institutions with coordinates: {with_coords}")
|
|
logger.info(f"Institutions with services data: {with_services}")
|
|
|
|
# Top 10 provinces
|
|
logger.info("\nTop 10 provinces by institution count:")
|
|
sorted_provinces = sorted(provinces.items(), key=lambda x: x[1], reverse=True)
|
|
for prov, count in sorted_provinces[:10]:
|
|
logger.info(f" {prov}: {count}")
|
|
|
|
logger.info("="*60 + "\n")


def main():
|
|
"""Main execution function."""
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(
|
|
description="Scrape Argentina's CONABIP popular library database"
|
|
)
|
|
parser.add_argument(
|
|
'--max-pages',
|
|
type=int,
|
|
default=None,
|
|
help='Maximum number of pages to scrape (default: all pages)'
|
|
)
|
|
parser.add_argument(
|
|
'--scrape-profiles',
|
|
action='store_true',
|
|
help='Scrape individual profile pages for additional data (services, coordinates)'
|
|
)
|
|
parser.add_argument(
|
|
'--rate-limit',
|
|
type=float,
|
|
default=2.0,
|
|
help='Delay in seconds between requests (default: 2.0)'
|
|
)
|
|
parser.add_argument(
|
|
'--output-csv',
|
|
type=str,
|
|
default='conabip_libraries.csv',
|
|
help='Output CSV filename (default: conabip_libraries.csv)'
|
|
)
|
|
parser.add_argument(
|
|
'--output-json',
|
|
type=str,
|
|
default='conabip_libraries.json',
|
|
help='Output JSON filename (default: conabip_libraries.json)'
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Create scraper
|
|
scraper = CONABIPScraper(rate_limit_delay=args.rate_limit)
|
|
|
|
# Scrape data
|
|
scraper.scrape_all(
|
|
max_pages=args.max_pages,
|
|
scrape_profiles=args.scrape_profiles
|
|
)
|
|
|
|
# Print summary
|
|
scraper.print_summary()
|
|
|
|
# Export
|
|
scraper.export_to_csv(args.output_csv)
|
|
scraper.export_to_json(args.output_json)
|
|
|
|
logger.info("✅ CONABIP scraping complete!")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|