#!/usr/bin/env python3
"""
Unit tests for CONABIP Argentina scraper.

Tests cover:
- HTML parsing logic
- Registration number extraction
- Coordinate parsing
- Service extraction
- Error handling
- CSV/JSON export

Author: GLAM Data Extraction Project
Date: 2025-11-17
"""

import logging
import sys
from pathlib import Path

import pytest
from bs4 import BeautifulSoup

# Make the scraper module importable without installing the project.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "scripts" / "scrapers"))

from scrape_conabip_argentina import CONABIPScraper


class TestCONABIPScraper:
    """Test suite for CONABIP scraper."""

    @pytest.fixture
    def scraper(self):
        """Create a scraper instance for testing."""
        return CONABIPScraper(rate_limit_delay=0.1)  # Faster for tests

    def test_initialization(self, scraper):
        """Test scraper initialization."""
        assert scraper.base_url == "https://www.conabip.gob.ar"
        assert scraper.rate_limit_delay == 0.1
        assert len(scraper.institutions) == 0
        assert len(scraper.errors) == 0

    def test_parse_registration_number_valid(self, scraper):
        """Test parsing valid registration numbers."""
        test_cases = [
            ("(REG: 18)", 18),
            ("REG: 123", 123),
            ("Biblioteca Popular (REG: 456)", 456),
            ("reg: 789", 789),  # Case insensitive
        ]
        for text, expected in test_cases:
            result = scraper._parse_registration_number(text)
            assert result == expected, f"Failed to parse '{text}'"

    def test_parse_registration_number_invalid(self, scraper):
        """Test parsing invalid registration numbers."""
        test_cases = [
            "No number here",
            "REG:",
            "REG: ABC",
            "",
        ]
        for text in test_cases:
            result = scraper._parse_registration_number(text)
            assert result is None, f"Should return None for '{text}'"

    def test_parse_coordinates_valid(self, scraper):
        """Test parsing valid Google Maps coordinates."""
        test_cases = [
            ("http://www.google.com/maps/place/-34.598461,-58.494690",
             (-34.598461, -58.494690)),
            ("https://www.google.com/maps/place/40.416775,-3.703790",
             (40.416775, -3.703790)),
            ("http://google.com/maps/place/-23.5505,-46.6333",
             (-23.5505, -46.6333)),
        ]
        for url, expected in test_cases:
            result = scraper._parse_coordinates(url)
            assert result == expected, f"Failed to parse '{url}'"

    def test_parse_coordinates_invalid(self, scraper):
        """Test parsing invalid coordinate URLs."""
        test_cases = [
            "http://www.google.com/maps/",
            "not a url",
            "http://www.google.com/maps/place/",
            "",
        ]
        for url in test_cases:
            result = scraper._parse_coordinates(url)
            assert result == (None, None), f"Should return (None, None) for '{url}'"

    def test_extract_services_from_images(self, scraper):
        """Test extracting services from HTML."""
        # NOTE(review): the original fixture markup was lost in this copy of
        # the file; reconstructed as <img> tags carrying the service name in
        # the title/alt attributes. TODO: confirm against the attribute that
        # _extract_services_from_images actually reads.
        html = """
        <td>
            <img src="/img/internet.png" title="Internet" alt="Internet">
            <img src="/img/wifi.png" title="Wifi" alt="Wifi">
            <img src="/img/rincon.png" title="Rincón Infantil" alt="Rincón Infantil">
        </td>
        """
        soup = BeautifulSoup(html, 'html.parser')
        services = scraper._extract_services_from_images(soup)
        assert len(services) == 3
        assert "Internet" in services
        assert "Wifi" in services
        assert "Rincón Infantil" in services

    def test_extract_services_empty(self, scraper):
        """Test extracting services from HTML with no services."""
        # Cell with text only — no service <img> tags to extract.
        html = """
        <td>
            <p>No services listed</p>
        </td>
        """
        soup = BeautifulSoup(html, 'html.parser')
        services = scraper._extract_services_from_images(soup)
        assert len(services) == 0

    def test_parse_table_row_complete(self, scraper):
        """Test parsing a complete table row."""
        # NOTE(review): reconstructed fixture — six cells matching the
        # assertions below (name + REG, province, city, neighborhood,
        # street address + neighborhood, profile link). TODO: confirm the
        # real CONABIP row markup.
        html = """
        <table><tr>
            <td><b>Biblioteca Popular Helena Larroque de Roffo</b> (REG: 18)</td>
            <td>CIUDAD AUTÓNOMA DE BUENOS AIRES</td>
            <td>Ciudad Autónoma de Buenos Aires</td>
            <td>Villa del Parque</td>
            <td>Simbrón 3058<br>Villa del Parque</td>
            <td><a href="/bipop/1342">Ver más</a></td>
        </tr></table>
        """
        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find('tr')
        cells = row.find_all('td')

        # Simulate parsing logic from scrape_page
        name_cell = cells[0]
        name_strong = name_cell.find('b')
        name = name_strong.get_text(strip=True) if name_strong else ""
        reg_text = name_cell.get_text(strip=True)
        reg_number = scraper._parse_registration_number(reg_text)
        province = cells[1].get_text(strip=True)
        city = cells[2].get_text(strip=True)
        neighborhood = cells[3].get_text(strip=True)
        address_cell = cells[4]
        # stripped_strings yields each text fragment; the street is the first.
        address_parts = list(address_cell.stripped_strings)
        street_address = address_parts[0] if address_parts else ''
        profile_link = cells[5].select_one('a[href^="/bipop/"]')
        profile_url = profile_link['href'] if profile_link else None

        # Assertions
        assert name == "Biblioteca Popular Helena Larroque de Roffo"
        assert reg_number == 18
        assert province == "CIUDAD AUTÓNOMA DE BUENOS AIRES"
        assert city == "Ciudad Autónoma de Buenos Aires"
        assert neighborhood == "Villa del Parque"
        assert street_address == "Simbrón 3058"
        assert profile_url == "/bipop/1342"

    def test_parse_table_row_minimal(self, scraper):
        """Test parsing a minimal table row with missing data."""
        # NOTE(review): reconstructed fixture — name cell plus two empty
        # cells, matching the "empty but present" assertions below.
        html = """
        <table><tr>
            <td><b>Biblioteca Popular Test</b> (REG: 999)</td>
            <td></td>
            <td></td>
        </tr></table>
        """
        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find('tr')
        cells = row.find_all('td')

        # Simulate parsing logic
        name_cell = cells[0]
        name_strong = name_cell.find('b')
        name = name_strong.get_text(strip=True) if name_strong else ""
        reg_text = name_cell.get_text(strip=True)
        reg_number = scraper._parse_registration_number(reg_text)
        province = cells[1].get_text(strip=True)
        city = cells[2].get_text(strip=True)

        # Assertions
        assert name == "Biblioteca Popular Test"
        assert reg_number == 999
        assert province == ""  # Empty but present
        assert city == ""  # Empty but present

    def test_export_csv_no_data(self, scraper, tmp_path):
        """Test CSV export with no data."""
        scraper.institutions = []
        # NOTE(review): output goes to the scraper's own OUTPUT_DIR, not
        # tmp_path; the previous attempt to derive that directory from
        # scraper.__module__ (a string) was broken and unused, so it was
        # removed. The export should log a warning but not crash.
        try:
            scraper.export_to_csv("test_output.csv")
        except Exception as e:
            pytest.fail(f"export_to_csv should not crash with no data: {e}")

    def test_export_json_with_data(self, scraper, tmp_path):
        """Test JSON export with sample data."""
        scraper.institutions = [
            {
                'conabip_reg': '18',
                'name': 'Test Library',
                'province': 'Buenos Aires',
                'city': 'Buenos Aires',
                'country': 'AR'
            }
        ]
        # This test verifies the export doesn't crash
        # (actual file output goes to OUTPUT_DIR, not tmp_path)
        try:
            scraper.export_to_json("test_output.json")
        except Exception as e:
            pytest.fail(f"export_to_json should not crash: {e}")

    def test_print_summary_no_data(self, scraper, caplog):
        """Test summary with no data."""
        caplog.set_level(logging.INFO)
        scraper.institutions = []
        scraper.errors = []
        scraper.print_summary()
        assert "Total institutions extracted: 0" in caplog.text
        assert "Total errors: 0" in caplog.text

    def test_print_summary_with_data(self, scraper, caplog):
        """Test summary with sample data."""
        caplog.set_level(logging.INFO)
        scraper.institutions = [
            {'province': 'Buenos Aires', 'city': 'La Plata'},
            {'province': 'Buenos Aires', 'city': 'Mar del Plata'},
            {'province': 'Córdoba', 'city': 'Córdoba'},
        ]
        scraper.errors = []
        scraper.print_summary()
        assert "Total institutions extracted: 3" in caplog.text
        assert "Provinces covered: 2" in caplog.text
        assert "Cities covered: 3" in caplog.text


class TestCONABIPScraperEdgeCases:
    """Test edge cases and error handling."""

    @pytest.fixture
    def scraper(self):
        """Create a scraper instance for testing."""
        return CONABIPScraper(rate_limit_delay=0.1)

    def test_malformed_registration_number(self, scraper):
        """Test handling of malformed registration numbers."""
        test_cases = [
            "REG: 12.5",  # Decimal
            "REG: 1,234",  # Comma
            "REG: -50",  # Negative
            "REG: 99999999999999999999",  # Too large
        ]
        for text in test_cases:
            result = scraper._parse_registration_number(text)
            # Should either parse the integer part or return None
            assert result is None or isinstance(result, int)

    def test_coordinate_edge_cases(self, scraper):
        """Test coordinate parsing edge cases."""
        test_cases = [
            "http://www.google.com/maps/place/0,0",  # Equator/Prime Meridian
            "http://www.google.com/maps/place/90,-180",  # North Pole/Dateline
            "http://www.google.com/maps/place/-90,180",  # South Pole/Dateline
        ]
        for url in test_cases:
            result = scraper._parse_coordinates(url)
            assert isinstance(result, tuple)
            assert len(result) == 2

    def test_unicode_handling(self, scraper):
        """Test handling of Unicode characters in names."""
        test_cases = [
            "Biblioteca Popular José Hernández",
            "Biblioteca Nicolás Avellaneda",
            "Biblioteca Popular Nuñez",
            "Biblioteca Popular Güemes",
        ]
        # Should not crash on Unicode
        for name in test_cases:
            assert isinstance(name, str)

    def test_empty_table_handling(self, scraper):
        """Test handling of empty results table."""
        # NOTE(review): reconstructed fixture — a views-table with a header
        # row but an empty tbody, matching the zero-rows assertion.
        html = """
        <table class="views-table">
            <thead><tr><th>Name</th></tr></thead>
            <tbody></tbody>
        </table>
        """
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.select_one('table.views-table')
        rows = table.select('tbody tr')
        assert len(rows) == 0


# Integration test marker
@pytest.mark.integration
class TestCONABIPScraperIntegration:
    """Integration tests that make real HTTP requests."""

    @pytest.fixture
    def scraper(self):
        """Create a scraper instance for integration testing."""
        return CONABIPScraper(rate_limit_delay=2.0)  # Respectful delay

    def test_scrape_first_page_real(self, scraper):
        """Test scraping the first page of real data (INTEGRATION)."""
        institutions = scraper.scrape_page(page_num=0)
        # Should find institutions on first page
        assert len(institutions) > 0
        # Check structure of first institution
        first = institutions[0]
        assert 'name' in first
        assert 'province' in first
        assert 'country' in first
        assert first['country'] == 'AR'
        assert first['data_source'] == 'CONABIP'

    def test_get_total_pages_real(self, scraper):
        """Test getting total page count from real site (INTEGRATION)."""
        total_pages = scraper.get_total_pages()
        # Should find many pages (typically 50-100+)
        assert total_pages > 10
        assert total_pages < 1000  # Sanity check


if __name__ == "__main__":
    pytest.main([__file__, "-v"])