# glam/tests/parsers/test_eu_isil.py

"""
Tests for EU ISIL Registry Parser
"""

import pytest
from pathlib import Path
import tempfile
import os

from glam_extractor.parsers.eu_isil import EUIsilParser, EUIsilRecord
from glam_extractor.models import DataTier, DataSource, InstitutionType


# Sample EU ISIL text data (extracted from PDF).
#
# Four records are included: EUR-COR0001, EUR-EP00001, EUR-CURIA0001 and
# EUR-EUI0001. The text is column-aligned with runs of spaces, and the fields
# of a single record are spread over several physical lines (for example the
# organisation name "European Committee of the" / "Regions").
# NOTE(review): presumably the parser depends on this layout, so the whitespace
# inside the literal should not be reformatted -- confirm against the parser.
#
# The third record is deliberately written "EUR- CURIA0001" (space after the
# hyphen); the parser tests in this file expect it to come back as
# "EUR-CURIA0001".
SAMPLE_EU_ISIL_TEXT = '''
    EUR-COR0001                                                                        European Committee of the                                                                              Belliard 99-101
                                                              06-Jan-17                                                                 /                                 CoR                                                                          Belgium                  Brussels
                                                                                          Regions

    EUR-EP00001                                                                                                                         Library                           EP                   Rue Wiertz,                  Belgium                 Brussels
                                                              21-May-18                   European Parliament

    EUR- CURIA0001                                                                        Court of Justice of the                                                                              rue du Fort Niedergrünewald Grand-Duché de Luxembourg
                                                              03-Aug-22                                                                 européenne                        CJUE                                                                         Luxembourg   L-2925
                                                                                          European Union                                                                                                                   (Grand Duchy of Luxembourg)
                                                                                                                                                                          Curia / CVRIA

    EUR-EUI0001                                               20-Jun-16                   European University Institute                 Archives                          HAEU                 156, Via Bolognese           Italy                    Florence       50014
                                                                                                                                        Historical Archives of the
                                                                                                                                        European Union
'''


@pytest.fixture
def sample_eu_text_file():
    """Yield the path of a temporary UTF-8 ``.txt`` file holding the sample text.

    The file is written and closed before the path is handed to the test, and
    it is deleted again once the requesting test has finished.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    )
    # delete=False keeps the file on disk after the handle is closed, so the
    # parser can reopen it by name.
    with handle:
        handle.write(SAMPLE_EU_ISIL_TEXT)
    temp_path = handle.name

    yield temp_path

    # Teardown: remove the file created above.
    os.unlink(temp_path)


class TestEUIsilRecord:
    """Checks for the EUIsilRecord Pydantic model."""

    def test_create_valid_record(self):
        """A fully specified record exposes the values it was built with."""
        fields = dict(
            isil="EUR-COR0001",
            approval_date="06-Jan-17",
            organisation_name="European Committee of the Regions",
            subunit=None,
            variants="CoR",
            address="Belliard 99-101",
            state="Belgium",
            city="Brussels",
            postcode=None,
        )
        record = EUIsilRecord(**fields)

        # Only the populated, identifying fields are compared.
        checked_attributes = (
            "isil",
            "approval_date",
            "organisation_name",
            "variants",
            "city",
            "state",
        )
        for attribute in checked_attributes:
            assert getattr(record, attribute) == fields[attribute]

    def test_isil_validation(self):
        """EUR-prefixed codes are accepted; other prefixes raise ValueError."""
        accepted = (
            "EUR-COR0001",
            "EUR-EP00001",
            "EUR-CURIA0001",
            "EUR-GSC0001",
            "EUR-EC00001",
        )
        for code in accepted:
            built = EUIsilRecord(isil=code, organisation_name="Test Institution")
            assert built.isil == code

        rejected = (
            "NL-AsdRM",  # Not EUR prefix
            "EURO-COR0001",  # Wrong prefix
        )
        for code in rejected:
            with pytest.raises(ValueError):
                EUIsilRecord(isil=code, organisation_name="Test")


class TestEUIsilParser:
    """Test EU ISIL Parser"""

    # Location of the real registry dump relative to some ancestor directory
    # of this test file (expected to be the repository root).
    _REAL_FILE_RELATIVE = Path("data") / "isil" / "EUR" / "isil-directory.txt"
    # Original developer-machine location, kept as a last-resort fallback so
    # the test behaves as before on that machine.
    _REAL_FILE_LEGACY = Path("/Users/kempersc/apps/glam/data/isil/EUR/isil-directory.txt")

    @classmethod
    def _find_real_file(cls):
        """Return the path of the real EU ISIL directory file, or None.

        Every ancestor directory of this test module is probed for
        ``data/isil/EUR/isil-directory.txt`` (nearest first), so the lookup
        does not depend on where the repository is checked out. The legacy
        absolute path is tried last.
        """
        candidates = [
            ancestor / cls._REAL_FILE_RELATIVE
            for ancestor in Path(__file__).resolve().parents
        ]
        candidates.append(cls._REAL_FILE_LEGACY)
        return next((path for path in candidates if path.exists()), None)

    @staticmethod
    def _parse_text(text):
        """Write *text* to a temporary UTF-8 file, parse it, return the records.

        The temporary file is always removed, even when parsing raises.
        """
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f:
            f.write(text)
            temp_path = f.name

        try:
            return list(EUIsilParser().parse_file(temp_path))
        finally:
            os.unlink(temp_path)

    def test_parse_file(self, sample_eu_text_file):
        """Test parsing EU ISIL text file"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        assert len(records) == 4

        # Check first record (EUR-COR0001)
        assert records[0].isil == "EUR-COR0001"
        assert records[0].organisation_name == "European Committee of the Regions"
        assert records[0].city == "Brussels"
        assert records[0].state == "Belgium"
        # Variants are optional: the check only applies when the parser
        # extracted something for that column.
        assert records[0].variants is None or "CoR" in records[0].variants

        # Check second record (EUR-EP00001)
        assert records[1].isil == "EUR-EP00001"
        assert records[1].organisation_name == "European Parliament"
        assert records[1].subunit == "Library"
        assert records[1].variants is None or "EP" in records[1].variants

        # Check CURIA record (with space in ISIL code)
        curia_record = next(r for r in records if r.isil == "EUR-CURIA0001")
        assert curia_record.organisation_name == "Court of Justice of the European Union"
        assert curia_record.city == "Luxembourg"
        assert curia_record.variants is None or "CJEU" in curia_record.variants or "CJUE" in curia_record.variants

        # Check EUI record
        eui_record = next(r for r in records if r.isil == "EUR-EUI0001")
        assert eui_record.organisation_name == "European University Institute"
        assert eui_record.subunit == "Archives"
        assert eui_record.city == "Florence"
        assert eui_record.state == "Italy"

    def test_parse_real_file(self):
        """Test parsing the real EU ISIL directory file

        Skipped when the data file cannot be located (see ``_find_real_file``).
        """
        real_file = self._find_real_file()

        if real_file is None:
            pytest.skip("Real EU ISIL file not found")

        parser = EUIsilParser()
        records = list(parser.parse_file(real_file))

        # Should parse all 10 institutions
        assert len(records) == 10

        # Verify specific institutions
        isil_codes = {r.isil for r in records}
        expected_codes = {
            "EUR-COR0001",
            "EUR-EP00001",
            "EUR-GSC0001",
            "EUR-GSC0002",
            "EUR-EUI0001",
            "EUR-EC00001",
            "EUR-EC00002",
            "EUR-EESC0001",
            "EUR-CURIA0001",
            "EUR-EUI0002",
        }
        assert isil_codes == expected_codes

    def test_to_heritage_custodian(self, sample_eu_text_file):
        """Test conversion to HeritageCustodian model"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        # Convert first record
        custodian = parser.to_heritage_custodian(records[0])

        assert custodian.name == "European Committee of the Regions"
        assert str(custodian.institution_type) == 'OFFICIAL_INSTITUTION'

        # Check identifiers
        assert len(custodian.identifiers) == 1
        assert custodian.identifiers[0].identifier_scheme == "ISIL"
        assert custodian.identifiers[0].identifier_value == "EUR-COR0001"

        # Check locations
        assert len(custodian.locations) == 1
        assert custodian.locations[0].city == "Brussels"
        assert custodian.locations[0].country == "BE"

        # Check provenance
        assert str(custodian.provenance.data_source) == 'ISIL_REGISTRY'
        assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE'

    def test_institution_type_mapping(self, sample_eu_text_file):
        """Test that EU institutions are classified correctly"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        for record in records:
            custodian = parser.to_heritage_custodian(record)

            # Type should match subunit:
            # - Library subunits → LIBRARY
            # - Archive subunits → ARCHIVE
            # - Other EU institutions → OFFICIAL_INSTITUTION
            if record.subunit and 'library' in record.subunit.lower():
                assert str(custodian.institution_type) == 'LIBRARY'
            elif record.subunit and 'archive' in record.subunit.lower():
                assert str(custodian.institution_type) == 'ARCHIVE'
            else:
                assert str(custodian.institution_type) == 'OFFICIAL_INSTITUTION'

    def test_ghcid_generation(self, sample_eu_text_file):
        """Test GHCID generation for EU institutions"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        # Convert and check GHCIDs
        for record in records:
            custodian = parser.to_heritage_custodian(record)

            # GHCID should be generated
            assert custodian.ghcid_current is not None
            assert custodian.ghcid_uuid is not None
            assert custodian.ghcid_numeric is not None

            # GHCID should follow format: CC-RR-CCC-T-XXX
            # Examples:
            # - Brussels, Belgium: BE-00-BRU-O-XXX (Official Institution)
            # - Florence, Italy: IT-00-FLO-A-XXX (Archives)
            # - Luxembourg: LU-00-LUX-O-XXX (Official Institution)
            # - Library subunits: XX-00-XXX-L-XXX (Library)
            ghcid_parts = custodian.ghcid_current.split('-')
            assert len(ghcid_parts) == 5
            assert ghcid_parts[0] in ['BE', 'IT', 'LU']  # Country codes
            assert ghcid_parts[1] == '00'  # Region code for EU institutions
            assert ghcid_parts[3] in ['O', 'L', 'A']  # Institution type codes

    def test_space_in_isil_code(self):
        """Test handling of ISIL codes with spaces (EUR- CURIA0001)"""
        text_with_space = '''
                                                                                                                                        Cour de justice de l'Union        CJEU
    EUR- CURIA0001                                                                        Court of Justice of the                                                                              rue du Fort Niedergrünewald Grand-Duché de Luxembourg
                                                              03-Aug-22                                                                 européenne                        CJUE                                                                         Luxembourg   L-2925
                                                                                          European Union                                                                                                                   (Grand Duchy of Luxembourg)
                                                                                                                                                                          Curia / CVRIA
        '''

        records = self._parse_text(text_with_space)

        assert len(records) == 1
        # Space should be normalized out
        assert records[0].isil == "EUR-CURIA0001"
        assert records[0].organisation_name == "Court of Justice of the European Union"

    def test_multi_line_organization_name(self):
        """Test reconstruction of organization names split across multiple lines"""
        text_multi_line = '''
    EUR-COR0001                                                                        European Committee of the                                                                              Belliard 99-101
                                                              06-Jan-17                                                                 /                                 CoR                                                                          Belgium                  Brussels
                                                                                          Regions
        '''

        records = self._parse_text(text_multi_line)

        assert len(records) == 1
        # Should reconstruct full name from lines 1 and 3
        assert records[0].organisation_name == "European Committee of the Regions"

    def test_empty_file(self):
        """Test parsing empty file"""
        records = self._parse_text("")
        assert len(records) == 0

    def test_country_code_mapping(self, sample_eu_text_file):
        """Test country code mapping for EU institutions"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        country_mapping = {
            "Belgium": "BE",
            "Italy": "IT",
            "Grand-Duché de Luxembourg": "LU",
            "Grand Duchy of Luxembourg": "LU",
        }

        checked = 0
        for record in records:
            custodian = parser.to_heritage_custodian(record)
            if not custodian.locations or not record.state:
                continue

            # States missing from the mapping are skipped rather than failed.
            expected_country = country_mapping.get(record.state)
            if expected_country:
                assert custodian.locations[0].country == expected_country
                checked += 1

        # Guard against a vacuous pass: without this, the test succeeds even
        # when no record produced a location with a mappable state.
        assert checked > 0, "no record had both a location and a mappable state"