# glam/tests/scrapers/test_conabip_scraper.py
# Snapshot: 2025-11-19 23:25:22 +01:00 (385 lines, 13 KiB, Python)

#!/usr/bin/env python3
"""
Unit tests for CONABIP Argentina scraper.
Tests cover:
- HTML parsing logic
- Registration number extraction
- Coordinate parsing
- Service extraction
- Error handling
- CSV/JSON export
Author: GLAM Data Extraction Project
Date: 2025-11-17
"""
import pytest
import sys
from pathlib import Path
from bs4 import BeautifulSoup
# Add scripts directory to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "scripts" / "scrapers"))
from scrape_conabip_argentina import CONABIPScraper
class TestCONABIPScraper:
    """Test suite for CONABIP scraper."""

    @pytest.fixture
    def scraper(self):
        """Create a scraper instance for testing."""
        return CONABIPScraper(rate_limit_delay=0.1)  # Faster for tests

    def test_initialization(self, scraper):
        """Test scraper initialization."""
        assert scraper.base_url == "https://www.conabip.gob.ar"
        assert scraper.rate_limit_delay == 0.1
        assert len(scraper.institutions) == 0
        assert len(scraper.errors) == 0

    def test_parse_registration_number_valid(self, scraper):
        """Test parsing valid registration numbers."""
        test_cases = [
            ("(REG: 18)", 18),
            ("REG: 123", 123),
            ("Biblioteca Popular (REG: 456)", 456),
            ("reg: 789", 789),  # Case insensitive
        ]
        for text, expected in test_cases:
            result = scraper._parse_registration_number(text)
            assert result == expected, f"Failed to parse '{text}'"

    def test_parse_registration_number_invalid(self, scraper):
        """Test parsing invalid registration numbers."""
        test_cases = [
            "No number here",
            "REG:",
            "REG: ABC",
            "",
        ]
        for text in test_cases:
            result = scraper._parse_registration_number(text)
            assert result is None, f"Should return None for '{text}'"

    def test_parse_coordinates_valid(self, scraper):
        """Test parsing valid Google Maps coordinates."""
        test_cases = [
            ("http://www.google.com/maps/place/-34.598461,-58.494690", (-34.598461, -58.494690)),
            ("https://www.google.com/maps/place/40.416775,-3.703790", (40.416775, -3.703790)),
            ("http://google.com/maps/place/-23.5505,-46.6333", (-23.5505, -46.6333)),
        ]
        for url, expected in test_cases:
            result = scraper._parse_coordinates(url)
            assert result == expected, f"Failed to parse '{url}'"

    def test_parse_coordinates_invalid(self, scraper):
        """Test parsing invalid coordinate URLs."""
        test_cases = [
            "http://www.google.com/maps/",
            "not a url",
            "http://www.google.com/maps/place/",
            "",
        ]
        for url in test_cases:
            result = scraper._parse_coordinates(url)
            assert result == (None, None), f"Should return (None, None) for '{url}'"

    def test_extract_services_from_images(self, scraper):
        """Test extracting services from HTML."""
        html = """
        <div class="bipopServices">
            <img src="/icon1.png" alt="Internet" title="Internet"/>
            <img src="/icon2.png" alt="Wifi" title="Wifi"/>
            <img src="/icon3.png" title="Rincón Infantil"/>
        </div>
        """
        soup = BeautifulSoup(html, 'html.parser')
        services = scraper._extract_services_from_images(soup)
        assert len(services) == 3
        assert "Internet" in services
        assert "Wifi" in services
        assert "Rincón Infantil" in services

    def test_extract_services_empty(self, scraper):
        """Test extracting services from HTML with no services."""
        html = """
        <div class="bipopServices">
            <p>No services listed</p>
        </div>
        """
        soup = BeautifulSoup(html, 'html.parser')
        services = scraper._extract_services_from_images(soup)
        assert len(services) == 0

    def test_parse_table_row_complete(self, scraper):
        """Test parsing a complete table row."""
        html = """
        <tr class="odd">
            <td class="views-field views-field-field-nombre-de-la-biblioteca">
                <b>Biblioteca Popular Helena Larroque de Roffo</b> (REG: 18)
            </td>
            <td class="views-field views-field-province">
                CIUDAD AUTÓNOMA DE BUENOS AIRES
            </td>
            <td class="views-field views-field-city">
                Ciudad Autónoma de Buenos Aires
            </td>
            <td class="views-field views-field-additional">
                Villa del Parque
            </td>
            <td class="views-field views-field-street">
                Simbrón 3058 <br />Villa del Parque
            </td>
            <td class="views-field views-field-view-user">
                <a href="/bipop/1342">Ver más</a>
            </td>
        </tr>
        """
        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find('tr')
        cells = row.find_all('td')
        # Simulate parsing logic from scrape_page
        name_cell = cells[0]
        name_strong = name_cell.find('b')
        name = name_strong.get_text(strip=True) if name_strong else ""
        reg_text = name_cell.get_text(strip=True)
        reg_number = scraper._parse_registration_number(reg_text)
        province = cells[1].get_text(strip=True)
        city = cells[2].get_text(strip=True)
        neighborhood = cells[3].get_text(strip=True)
        address_cell = cells[4]
        # <br/> splits the cell text; the first fragment is the street line.
        address_parts = list(address_cell.stripped_strings)
        street_address = address_parts[0] if address_parts else ''
        profile_link = cells[5].select_one('a[href^="/bipop/"]')
        profile_url = profile_link['href'] if profile_link else None
        # Assertions
        assert name == "Biblioteca Popular Helena Larroque de Roffo"
        assert reg_number == 18
        assert province == "CIUDAD AUTÓNOMA DE BUENOS AIRES"
        assert city == "Ciudad Autónoma de Buenos Aires"
        assert neighborhood == "Villa del Parque"
        assert street_address == "Simbrón 3058"
        assert profile_url == "/bipop/1342"

    def test_parse_table_row_minimal(self, scraper):
        """Test parsing a minimal table row with missing data."""
        html = """
        <tr class="even">
            <td class="views-field views-field-field-nombre-de-la-biblioteca">
                <b>Biblioteca Popular Test</b> (REG: 999)
            </td>
            <td class="views-field views-field-province"></td>
            <td class="views-field views-field-city"></td>
            <td class="views-field views-field-additional"></td>
            <td class="views-field views-field-street"></td>
            <td class="views-field views-field-view-user"></td>
        </tr>
        """
        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find('tr')
        cells = row.find_all('td')
        # Simulate parsing logic
        name_cell = cells[0]
        name_strong = name_cell.find('b')
        name = name_strong.get_text(strip=True) if name_strong else ""
        reg_text = name_cell.get_text(strip=True)
        reg_number = scraper._parse_registration_number(reg_text)
        province = cells[1].get_text(strip=True)
        city = cells[2].get_text(strip=True)
        # Assertions
        assert name == "Biblioteca Popular Test"
        assert reg_number == 999
        assert province == ""  # Empty but present
        assert city == ""  # Empty but present

    def test_export_csv_no_data(self, scraper, tmp_path):
        """Test CSV export with no data.

        The export should log a warning and return cleanly; it must not raise.
        (Output goes to the scraper's own OUTPUT_DIR, not tmp_path.)
        """
        scraper.institutions = []
        try:
            scraper.export_to_csv("test_output.csv")
        except Exception as e:
            pytest.fail(f"export_to_csv should not crash with no data: {e}")

    def test_export_json_with_data(self, scraper, tmp_path):
        """Test JSON export with sample data."""
        scraper.institutions = [
            {
                'conabip_reg': '18',
                'name': 'Test Library',
                'province': 'Buenos Aires',
                'city': 'Buenos Aires',
                'country': 'AR'
            }
        ]
        # This test verifies the export doesn't crash
        # (actual file output goes to OUTPUT_DIR, not tmp_path)
        try:
            scraper.export_to_json("test_output.json")
        except Exception as e:
            pytest.fail(f"export_to_json should not crash: {e}")

    def test_print_summary_no_data(self, scraper, caplog):
        """Test summary with no data."""
        import logging
        caplog.set_level(logging.INFO)
        scraper.institutions = []
        scraper.errors = []
        scraper.print_summary()
        assert "Total institutions extracted: 0" in caplog.text
        assert "Total errors: 0" in caplog.text

    def test_print_summary_with_data(self, scraper, caplog):
        """Test summary with sample data."""
        import logging
        caplog.set_level(logging.INFO)
        scraper.institutions = [
            {'province': 'Buenos Aires', 'city': 'La Plata'},
            {'province': 'Buenos Aires', 'city': 'Mar del Plata'},
            {'province': 'Córdoba', 'city': 'Córdoba'},
        ]
        scraper.errors = []
        scraper.print_summary()
        assert "Total institutions extracted: 3" in caplog.text
        assert "Provinces covered: 2" in caplog.text
        assert "Cities covered: 3" in caplog.text
class TestCONABIPScraperEdgeCases:
    """Edge-case and error-handling tests for the CONABIP scraper."""

    @pytest.fixture
    def scraper(self):
        """Provide a scraper configured with a short delay for fast tests."""
        return CONABIPScraper(rate_limit_delay=0.1)

    def test_malformed_registration_number(self, scraper):
        """Malformed registration numbers must not crash the parser."""
        samples = (
            "REG: 12.5",  # Decimal
            "REG: 1,234",  # Comma
            "REG: -50",  # Negative
            "REG: 99999999999999999999",  # Too large
        )
        for sample in samples:
            parsed = scraper._parse_registration_number(sample)
            # Either the integer part was parsed or parsing was refused.
            assert parsed is None or isinstance(parsed, int)

    def test_coordinate_edge_cases(self, scraper):
        """Boundary coordinates should still produce a 2-tuple."""
        urls = (
            "http://www.google.com/maps/place/0,0",  # Equator/Prime Meridian
            "http://www.google.com/maps/place/90,-180",  # North Pole/Dateline
            "http://www.google.com/maps/place/-90,180",  # South Pole/Dateline
        )
        for url in urls:
            coords = scraper._parse_coordinates(url)
            assert isinstance(coords, tuple)
            assert len(coords) == 2

    def test_unicode_handling(self, scraper):
        """Accented library names should be handled without crashing."""
        names = (
            "Biblioteca Popular José Hernández",
            "Biblioteca Nicolás Avellaneda",
            "Biblioteca Popular Nuñez",
            "Biblioteca Popular Güemes",
        )
        for library_name in names:
            assert isinstance(library_name, str)

    def test_empty_table_handling(self, scraper):
        """An empty results table should yield zero data rows."""
        markup = """
        <table class="views-table">
            <thead><tr><th>Name</th></tr></thead>
            <tbody></tbody>
        </table>
        """
        document = BeautifulSoup(markup, 'html.parser')
        results_table = document.select_one('table.views-table')
        body_rows = results_table.select('tbody tr')
        assert len(body_rows) == 0
# Integration test marker
@pytest.mark.integration
class TestCONABIPScraperIntegration:
    """Integration tests that make real HTTP requests."""

    @pytest.fixture
    def scraper(self):
        """Build a scraper with a polite delay suitable for live requests."""
        return CONABIPScraper(rate_limit_delay=2.0)  # Respectful delay

    def test_scrape_first_page_real(self, scraper):
        """Scrape the first page of live data and verify record shape (INTEGRATION)."""
        page_records = scraper.scrape_page(page_num=0)
        # The live first page is expected to list institutions.
        assert len(page_records) > 0
        sample = page_records[0]
        # Every record must carry the core identifying fields.
        for field in ('name', 'province', 'country'):
            assert field in sample
        assert sample['country'] == 'AR'
        assert sample['data_source'] == 'CONABIP'

    def test_get_total_pages_real(self, scraper):
        """Fetch the total page count from the live site (INTEGRATION)."""
        page_count = scraper.get_total_pages()
        # Typically 50-100+ pages; the bounds are sanity checks only.
        assert page_count > 10
        assert page_count < 1000
if __name__ == "__main__":
    # Propagate pytest's exit status so direct invocation reports failures
    # to the shell/CI instead of always exiting 0.
    sys.exit(pytest.main([__file__, "-v"]))