#!/usr/bin/env python3
"""
Unit tests for CONABIP Argentina scraper.

Tests cover:
- HTML parsing logic
- Registration number extraction
- Coordinate parsing
- Service extraction
- Error handling
- CSV/JSON export

Author: GLAM Data Extraction Project
Date: 2025-11-17
"""
|
|
|
|
import pytest
|
|
import sys
|
|
from pathlib import Path
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Add scripts directory to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "scripts" / "scrapers"))
|
|
|
|
from scrape_conabip_argentina import CONABIPScraper
|
|
|
|
|
|
class TestCONABIPScraper:
    """Test suite for CONABIP scraper unit behavior (no network access)."""

    @pytest.fixture
    def scraper(self):
        """Create a scraper instance for testing."""
        return CONABIPScraper(rate_limit_delay=0.1)  # Faster for tests

    def _parse_row_cells(self, scraper, cells):
        """Replicate scrape_page's per-row parsing for a row's <td> cells.

        Shared by the table-row tests so the simulated parsing logic is
        written once instead of being duplicated per test.

        Returns a dict with keys: name, reg_number, province, city,
        neighborhood, street_address, profile_url.
        """
        name_cell = cells[0]
        name_tag = name_cell.find('b')
        address_parts = list(cells[4].stripped_strings)
        profile_link = cells[5].select_one('a[href^="/bipop/"]')

        return {
            'name': name_tag.get_text(strip=True) if name_tag else "",
            'reg_number': scraper._parse_registration_number(
                name_cell.get_text(strip=True)
            ),
            'province': cells[1].get_text(strip=True),
            'city': cells[2].get_text(strip=True),
            'neighborhood': cells[3].get_text(strip=True),
            'street_address': address_parts[0] if address_parts else '',
            'profile_url': profile_link['href'] if profile_link else None,
        }

    def test_initialization(self, scraper):
        """Test scraper initialization."""
        assert scraper.base_url == "https://www.conabip.gob.ar"
        assert scraper.rate_limit_delay == 0.1
        assert len(scraper.institutions) == 0
        assert len(scraper.errors) == 0

    def test_parse_registration_number_valid(self, scraper):
        """Test parsing valid registration numbers."""
        test_cases = [
            ("(REG: 18)", 18),
            ("REG: 123", 123),
            ("Biblioteca Popular (REG: 456)", 456),
            ("reg: 789", 789),  # Case insensitive
        ]

        for text, expected in test_cases:
            result = scraper._parse_registration_number(text)
            assert result == expected, f"Failed to parse '{text}'"

    def test_parse_registration_number_invalid(self, scraper):
        """Test parsing invalid registration numbers."""
        test_cases = [
            "No number here",
            "REG:",
            "REG: ABC",
            "",
        ]

        for text in test_cases:
            result = scraper._parse_registration_number(text)
            assert result is None, f"Should return None for '{text}'"

    def test_parse_coordinates_valid(self, scraper):
        """Test parsing valid Google Maps coordinates."""
        test_cases = [
            ("http://www.google.com/maps/place/-34.598461,-58.494690",
             (-34.598461, -58.494690)),
            ("https://www.google.com/maps/place/40.416775,-3.703790",
             (40.416775, -3.703790)),
            ("http://google.com/maps/place/-23.5505,-46.6333",
             (-23.5505, -46.6333)),
        ]

        for url, expected in test_cases:
            result = scraper._parse_coordinates(url)
            # pytest.approx avoids brittle exact float equality on parsed values.
            assert result == pytest.approx(expected), f"Failed to parse '{url}'"

    def test_parse_coordinates_invalid(self, scraper):
        """Test parsing invalid coordinate URLs."""
        test_cases = [
            "http://www.google.com/maps/",
            "not a url",
            "http://www.google.com/maps/place/",
            "",
        ]

        for url in test_cases:
            result = scraper._parse_coordinates(url)
            assert result == (None, None), f"Should return (None, None) for '{url}'"

    def test_extract_services_from_images(self, scraper):
        """Test extracting services from HTML."""
        html = """
        <div class="bipopServices">
            <img src="/icon1.png" alt="Internet" title="Internet"/>
            <img src="/icon2.png" alt="Wifi" title="Wifi"/>
            <img src="/icon3.png" title="Rincón Infantil"/>
        </div>
        """

        soup = BeautifulSoup(html, 'html.parser')
        services = scraper._extract_services_from_images(soup)

        assert len(services) == 3
        assert "Internet" in services
        assert "Wifi" in services
        assert "Rincón Infantil" in services

    def test_extract_services_empty(self, scraper):
        """Test extracting services from HTML with no services."""
        html = """
        <div class="bipopServices">
            <p>No services listed</p>
        </div>
        """

        soup = BeautifulSoup(html, 'html.parser')
        services = scraper._extract_services_from_images(soup)

        assert len(services) == 0

    def test_parse_table_row_complete(self, scraper):
        """Test parsing a complete table row."""
        html = """
        <tr class="odd">
            <td class="views-field views-field-field-nombre-de-la-biblioteca">
                <b>Biblioteca Popular Helena Larroque de Roffo</b> (REG: 18)
            </td>
            <td class="views-field views-field-province">
                CIUDAD AUTÓNOMA DE BUENOS AIRES
            </td>
            <td class="views-field views-field-city">
                Ciudad Autónoma de Buenos Aires
            </td>
            <td class="views-field views-field-additional">
                Villa del Parque
            </td>
            <td class="views-field views-field-street">
                Simbrón 3058 <br />Villa del Parque
            </td>
            <td class="views-field views-field-view-user">
                <a href="/bipop/1342">Ver más</a>
            </td>
        </tr>
        """

        soup = BeautifulSoup(html, 'html.parser')
        cells = soup.find('tr').find_all('td')

        parsed = self._parse_row_cells(scraper, cells)

        assert parsed['name'] == "Biblioteca Popular Helena Larroque de Roffo"
        assert parsed['reg_number'] == 18
        assert parsed['province'] == "CIUDAD AUTÓNOMA DE BUENOS AIRES"
        assert parsed['city'] == "Ciudad Autónoma de Buenos Aires"
        assert parsed['neighborhood'] == "Villa del Parque"
        assert parsed['street_address'] == "Simbrón 3058"
        assert parsed['profile_url'] == "/bipop/1342"

    def test_parse_table_row_minimal(self, scraper):
        """Test parsing a minimal table row with missing data."""
        html = """
        <tr class="even">
            <td class="views-field views-field-field-nombre-de-la-biblioteca">
                <b>Biblioteca Popular Test</b> (REG: 999)
            </td>
            <td class="views-field views-field-province"></td>
            <td class="views-field views-field-city"></td>
            <td class="views-field views-field-additional"></td>
            <td class="views-field views-field-street"></td>
            <td class="views-field views-field-view-user"></td>
        </tr>
        """

        soup = BeautifulSoup(html, 'html.parser')
        cells = soup.find('tr').find_all('td')

        parsed = self._parse_row_cells(scraper, cells)

        assert parsed['name'] == "Biblioteca Popular Test"
        assert parsed['reg_number'] == 999
        assert parsed['province'] == ""  # Empty but present
        assert parsed['city'] == ""  # Empty but present

    def test_export_csv_no_data(self, scraper):
        """Test CSV export with no data."""
        scraper.institutions = []

        # Export with an empty dataset should log a warning but not crash.
        # (Actual file output goes to the scraper's OUTPUT_DIR, not tmp_path.)
        try:
            scraper.export_to_csv("test_output.csv")
        except Exception as e:
            pytest.fail(f"export_to_csv should not crash with no data: {e}")

    def test_export_json_with_data(self, scraper):
        """Test JSON export with sample data."""
        scraper.institutions = [
            {
                'conabip_reg': '18',
                'name': 'Test Library',
                'province': 'Buenos Aires',
                'city': 'Buenos Aires',
                'country': 'AR'
            }
        ]

        # This test verifies the export doesn't crash
        # (actual file output goes to OUTPUT_DIR, not tmp_path)
        try:
            scraper.export_to_json("test_output.json")
        except Exception as e:
            pytest.fail(f"export_to_json should not crash: {e}")

    def test_print_summary_no_data(self, scraper, caplog):
        """Test summary with no data."""
        import logging
        caplog.set_level(logging.INFO)

        scraper.institutions = []
        scraper.errors = []

        scraper.print_summary()

        assert "Total institutions extracted: 0" in caplog.text
        assert "Total errors: 0" in caplog.text

    def test_print_summary_with_data(self, scraper, caplog):
        """Test summary with sample data."""
        import logging
        caplog.set_level(logging.INFO)

        scraper.institutions = [
            {'province': 'Buenos Aires', 'city': 'La Plata'},
            {'province': 'Buenos Aires', 'city': 'Mar del Plata'},
            {'province': 'Córdoba', 'city': 'Córdoba'},
        ]
        scraper.errors = []

        scraper.print_summary()

        assert "Total institutions extracted: 3" in caplog.text
        assert "Provinces covered: 2" in caplog.text
        assert "Cities covered: 3" in caplog.text
|
|
|
class TestCONABIPScraperEdgeCases:
    """Test edge cases and error handling."""

    @pytest.fixture
    def scraper(self):
        """Create a scraper instance for testing."""
        return CONABIPScraper(rate_limit_delay=0.1)

    def test_malformed_registration_number(self, scraper):
        """Test handling of malformed registration numbers."""
        test_cases = [
            "REG: 12.5",  # Decimal
            "REG: 1,234",  # Comma
            "REG: -50",  # Negative
            "REG: 99999999999999999999",  # Too large
        ]

        for text in test_cases:
            result = scraper._parse_registration_number(text)
            # Should either parse the integer part or return None
            assert result is None or isinstance(result, int)

    def test_coordinate_edge_cases(self, scraper):
        """Test coordinate parsing edge cases."""
        test_cases = [
            "http://www.google.com/maps/place/0,0",  # Equator/Prime Meridian
            "http://www.google.com/maps/place/90,-180",  # North Pole/Dateline
            "http://www.google.com/maps/place/-90,180",  # South Pole/Dateline
        ]

        for url in test_cases:
            result = scraper._parse_coordinates(url)
            assert isinstance(result, tuple)
            assert len(result) == 2

    def test_unicode_handling(self, scraper):
        """Test handling of Unicode characters in names.

        The previous version of this test only checked that string literals
        were instances of str, which never exercised the scraper at all.
        Instead, feed accented names through the registration parser and
        verify the trailing number is still extracted.
        """
        names = [
            "Biblioteca Popular José Hernández",
            "Biblioteca Nicolás Avellaneda",
            "Biblioteca Popular Nuñez",
            "Biblioteca Popular Güemes",
        ]

        for name in names:
            result = scraper._parse_registration_number(f"{name} (REG: 42)")
            assert result == 42, f"Unicode name broke registration parsing: '{name}'"

    def test_empty_table_handling(self, scraper):
        """Test handling of empty results table."""
        html = """
        <table class="views-table">
            <thead><tr><th>Name</th></tr></thead>
            <tbody></tbody>
        </table>
        """

        soup = BeautifulSoup(html, 'html.parser')
        table = soup.select_one('table.views-table')
        rows = table.select('tbody tr')

        assert len(rows) == 0
|
|
|
|
|
# Integration test marker
@pytest.mark.integration
class TestCONABIPScraperIntegration:
    """Integration tests that make real HTTP requests against the live site."""

    @pytest.fixture
    def scraper(self):
        """Scraper configured with a polite request delay for live tests."""
        return CONABIPScraper(rate_limit_delay=2.0)  # Respectful delay

    def test_scrape_first_page_real(self, scraper):
        """Test scraping the first page of real data (INTEGRATION)."""
        results = scraper.scrape_page(page_num=0)

        # The first results page should never be empty.
        assert len(results) > 0

        # Spot-check the shape of the first extracted record.
        record = results[0]
        for key in ('name', 'province', 'country'):
            assert key in record
        assert record['country'] == 'AR'
        assert record['data_source'] == 'CONABIP'

    def test_get_total_pages_real(self, scraper):
        """Test getting total page count from real site (INTEGRATION)."""
        page_count = scraper.get_total_pages()

        # The catalogue typically spans 50-100+ pages; bound both sides
        # as a sanity check against a broken pager parse.
        assert page_count > 10
        assert page_count < 1000
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this test module directly (python test_file.py)
    # without invoking pytest from the command line.
    pytest.main([__file__, "-v"])
|