# glam/tests/scrapers/test_conabip_scraper.py
# Snapshot: 2025-11-19 23:25:22 +01:00 (385 lines, 13 KiB, Python)

#!/usr/bin/env python3
"""
Unit tests for CONABIP Argentina scraper.
Tests cover:
- HTML parsing logic
- Registration number extraction
- Coordinate parsing
- Service extraction
- Error handling
- CSV/JSON export
Author: GLAM Data Extraction Project
Date: 2025-11-17
"""
import pytest
import sys
from pathlib import Path
from bs4 import BeautifulSoup
# Add scripts directory to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "scripts" / "scrapers"))
from scrape_conabip_argentina import CONABIPScraper
class TestCONABIPScraper:
    """Test suite for CONABIP scraper."""

    @pytest.fixture
    def scraper(self):
        """Create a scraper instance for testing."""
        return CONABIPScraper(rate_limit_delay=0.1)  # Faster for tests

    def test_initialization(self, scraper):
        """Test scraper initialization."""
        assert scraper.base_url == "https://www.conabip.gob.ar"
        assert scraper.rate_limit_delay == 0.1
        assert len(scraper.institutions) == 0
        assert len(scraper.errors) == 0

    def test_parse_registration_number_valid(self, scraper):
        """Test parsing valid registration numbers."""
        test_cases = [
            ("(REG: 18)", 18),
            ("REG: 123", 123),
            ("Biblioteca Popular (REG: 456)", 456),
            ("reg: 789", 789),  # Case insensitive
        ]
        for text, expected in test_cases:
            result = scraper._parse_registration_number(text)
            assert result == expected, f"Failed to parse '{text}'"

    def test_parse_registration_number_invalid(self, scraper):
        """Test parsing invalid registration numbers."""
        test_cases = [
            "No number here",
            "REG:",
            "REG: ABC",
            "",
        ]
        for text in test_cases:
            result = scraper._parse_registration_number(text)
            assert result is None, f"Should return None for '{text}'"

    def test_parse_coordinates_valid(self, scraper):
        """Test parsing valid Google Maps coordinates."""
        test_cases = [
            ("http://www.google.com/maps/place/-34.598461,-58.494690", (-34.598461, -58.494690)),
            ("https://www.google.com/maps/place/40.416775,-3.703790", (40.416775, -3.703790)),
            ("http://google.com/maps/place/-23.5505,-46.6333", (-23.5505, -46.6333)),
        ]
        for url, expected in test_cases:
            result = scraper._parse_coordinates(url)
            assert result == expected, f"Failed to parse '{url}'"

    def test_parse_coordinates_invalid(self, scraper):
        """Test parsing invalid coordinate URLs."""
        test_cases = [
            "http://www.google.com/maps/",
            "not a url",
            "http://www.google.com/maps/place/",
            "",
        ]
        for url in test_cases:
            result = scraper._parse_coordinates(url)
            assert result == (None, None), f"Should return (None, None) for '{url}'"

    def test_extract_services_from_images(self, scraper):
        """Test extracting services from HTML."""
        html = """
        <div class="bipopServices">
            <img src="/icon1.png" alt="Internet" title="Internet"/>
            <img src="/icon2.png" alt="Wifi" title="Wifi"/>
            <img src="/icon3.png" title="Rincón Infantil"/>
        </div>
        """
        soup = BeautifulSoup(html, 'html.parser')
        services = scraper._extract_services_from_images(soup)
        assert len(services) == 3
        assert "Internet" in services
        assert "Wifi" in services
        assert "Rincón Infantil" in services

    def test_extract_services_empty(self, scraper):
        """Test extracting services from HTML with no services."""
        html = """
        <div class="bipopServices">
            <p>No services listed</p>
        </div>
        """
        soup = BeautifulSoup(html, 'html.parser')
        services = scraper._extract_services_from_images(soup)
        assert len(services) == 0

    def test_parse_table_row_complete(self, scraper):
        """Test parsing a complete table row."""
        html = """
        <tr class="odd">
            <td class="views-field views-field-field-nombre-de-la-biblioteca">
                <b>Biblioteca Popular Helena Larroque de Roffo</b> (REG: 18)
            </td>
            <td class="views-field views-field-province">
                CIUDAD AUTÓNOMA DE BUENOS AIRES
            </td>
            <td class="views-field views-field-city">
                Ciudad Autónoma de Buenos Aires
            </td>
            <td class="views-field views-field-additional">
                Villa del Parque
            </td>
            <td class="views-field views-field-street">
                Simbrón 3058 <br />Villa del Parque
            </td>
            <td class="views-field views-field-view-user">
                <a href="/bipop/1342">Ver más</a>
            </td>
        </tr>
        """
        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find('tr')
        cells = row.find_all('td')
        # Simulate parsing logic from scrape_page
        name_cell = cells[0]
        name_strong = name_cell.find('b')
        name = name_strong.get_text(strip=True) if name_strong else ""
        reg_text = name_cell.get_text(strip=True)
        reg_number = scraper._parse_registration_number(reg_text)
        province = cells[1].get_text(strip=True)
        city = cells[2].get_text(strip=True)
        neighborhood = cells[3].get_text(strip=True)
        address_cell = cells[4]
        # <br/> splits the cell text; the first fragment is the street line.
        address_parts = list(address_cell.stripped_strings)
        street_address = address_parts[0] if address_parts else ''
        profile_link = cells[5].select_one('a[href^="/bipop/"]')
        profile_url = profile_link['href'] if profile_link else None
        # Assertions
        assert name == "Biblioteca Popular Helena Larroque de Roffo"
        assert reg_number == 18
        assert province == "CIUDAD AUTÓNOMA DE BUENOS AIRES"
        assert city == "Ciudad Autónoma de Buenos Aires"
        assert neighborhood == "Villa del Parque"
        assert street_address == "Simbrón 3058"
        assert profile_url == "/bipop/1342"

    def test_parse_table_row_minimal(self, scraper):
        """Test parsing a minimal table row with missing data."""
        html = """
        <tr class="even">
            <td class="views-field views-field-field-nombre-de-la-biblioteca">
                <b>Biblioteca Popular Test</b> (REG: 999)
            </td>
            <td class="views-field views-field-province"></td>
            <td class="views-field views-field-city"></td>
            <td class="views-field views-field-additional"></td>
            <td class="views-field views-field-street"></td>
            <td class="views-field views-field-view-user"></td>
        </tr>
        """
        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find('tr')
        cells = row.find_all('td')
        # Simulate parsing logic
        name_cell = cells[0]
        name_strong = name_cell.find('b')
        name = name_strong.get_text(strip=True) if name_strong else ""
        reg_text = name_cell.get_text(strip=True)
        reg_number = scraper._parse_registration_number(reg_text)
        province = cells[1].get_text(strip=True)
        city = cells[2].get_text(strip=True)
        # Assertions
        assert name == "Biblioteca Popular Test"
        assert reg_number == 999
        assert province == ""  # Empty but present
        assert city == ""  # Empty but present

    def test_export_csv_no_data(self, scraper, tmp_path):
        """Test CSV export with no data.

        The export should log a warning and return cleanly; it must not raise.
        (Output goes to the scraper's own OUTPUT_DIR, not tmp_path.)
        """
        scraper.institutions = []
        try:
            scraper.export_to_csv("test_output.csv")
        except Exception as e:
            pytest.fail(f"export_to_csv should not crash with no data: {e}")

    def test_export_json_with_data(self, scraper, tmp_path):
        """Test JSON export with sample data."""
        scraper.institutions = [
            {
                'conabip_reg': '18',
                'name': 'Test Library',
                'province': 'Buenos Aires',
                'city': 'Buenos Aires',
                'country': 'AR'
            }
        ]
        # This test verifies the export doesn't crash
        # (actual file output goes to OUTPUT_DIR, not tmp_path)
        try:
            scraper.export_to_json("test_output.json")
        except Exception as e:
            pytest.fail(f"export_to_json should not crash: {e}")

    def test_print_summary_no_data(self, scraper, caplog):
        """Test summary with no data."""
        import logging
        caplog.set_level(logging.INFO)
        scraper.institutions = []
        scraper.errors = []
        scraper.print_summary()
        assert "Total institutions extracted: 0" in caplog.text
        assert "Total errors: 0" in caplog.text

    def test_print_summary_with_data(self, scraper, caplog):
        """Test summary with sample data."""
        import logging
        caplog.set_level(logging.INFO)
        scraper.institutions = [
            {'province': 'Buenos Aires', 'city': 'La Plata'},
            {'province': 'Buenos Aires', 'city': 'Mar del Plata'},
            {'province': 'Córdoba', 'city': 'Córdoba'},
        ]
        scraper.errors = []
        scraper.print_summary()
        assert "Total institutions extracted: 3" in caplog.text
        assert "Provinces covered: 2" in caplog.text
        assert "Cities covered: 3" in caplog.text
class TestCONABIPScraperEdgeCases:
    """Edge-case and error-handling tests for the CONABIP scraper."""

    @pytest.fixture
    def scraper(self):
        """Provide a scraper configured with a short delay for fast tests."""
        return CONABIPScraper(rate_limit_delay=0.1)

    def test_malformed_registration_number(self, scraper):
        """Malformed registration numbers must not crash the parser."""
        samples = (
            "REG: 12.5",  # Decimal
            "REG: 1,234",  # Comma
            "REG: -50",  # Negative
            "REG: 99999999999999999999",  # Too large
        )
        for sample in samples:
            parsed = scraper._parse_registration_number(sample)
            # Either the integer part was parsed or parsing was refused.
            assert parsed is None or isinstance(parsed, int)

    def test_coordinate_edge_cases(self, scraper):
        """Boundary coordinates should still produce a 2-tuple."""
        urls = (
            "http://www.google.com/maps/place/0,0",  # Equator/Prime Meridian
            "http://www.google.com/maps/place/90,-180",  # North Pole/Dateline
            "http://www.google.com/maps/place/-90,180",  # South Pole/Dateline
        )
        for url in urls:
            coords = scraper._parse_coordinates(url)
            assert isinstance(coords, tuple)
            assert len(coords) == 2

    def test_unicode_handling(self, scraper):
        """Accented library names should be handled without crashing."""
        names = (
            "Biblioteca Popular José Hernández",
            "Biblioteca Nicolás Avellaneda",
            "Biblioteca Popular Nuñez",
            "Biblioteca Popular Güemes",
        )
        for library_name in names:
            assert isinstance(library_name, str)

    def test_empty_table_handling(self, scraper):
        """An empty results table should yield zero data rows."""
        markup = """
        <table class="views-table">
            <thead><tr><th>Name</th></tr></thead>
            <tbody></tbody>
        </table>
        """
        document = BeautifulSoup(markup, 'html.parser')
        results_table = document.select_one('table.views-table')
        body_rows = results_table.select('tbody tr')
        assert len(body_rows) == 0
# Integration test marker
@pytest.mark.integration
class TestCONABIPScraperIntegration:
    """Integration tests that make real HTTP requests."""

    @pytest.fixture
    def scraper(self):
        """Build a scraper with a polite delay suitable for live requests."""
        return CONABIPScraper(rate_limit_delay=2.0)  # Respectful delay

    def test_scrape_first_page_real(self, scraper):
        """Scrape the first page of live data and verify record shape (INTEGRATION)."""
        page_records = scraper.scrape_page(page_num=0)
        # The live first page is expected to list institutions.
        assert len(page_records) > 0
        sample = page_records[0]
        # Every record must carry the core identifying fields.
        for field in ('name', 'province', 'country'):
            assert field in sample
        assert sample['country'] == 'AR'
        assert sample['data_source'] == 'CONABIP'

    def test_get_total_pages_real(self, scraper):
        """Fetch the total page count from the live site (INTEGRATION)."""
        page_count = scraper.get_total_pages()
        # Typically 50-100+ pages; the bounds are sanity checks only.
        assert page_count > 10
        assert page_count < 1000
if __name__ == "__main__":
    # Propagate pytest's exit status so direct invocation reports failures
    # to the shell/CI instead of always exiting 0.
    sys.exit(pytest.main([__file__, "-v"]))