# glam/tests/parsers/test_isil_registry.py

"""
Tests for ISIL Registry CSV Parser
"""

import pytest
from datetime import datetime
from pathlib import Path
import tempfile
import os

from glam_extractor.parsers.isil_registry import ISILRegistryParser, ISILRegistryRecord
from glam_extractor.models import DataTier, DataSource


# Sample ISIL CSV data (with unusual quote format).
#
# Columns use Dutch headers: Volgnr. (serial number), Plaats (place),
# Instelling (institution), ISIL code, Toegekend op (assigned on) and
# Opmerking (remark).
#
# Quirks deliberately kept in this fixture:
#   * fields are comma-separated and double-quoted, but every line ends with a
#     run of five semicolons (";;;;;");
#   * the header line and every row with an empty remark end in three double
#     quotes rather than a normally closed field;
#   * rows 3 and 5 carry a non-empty remark, and row 5 has an empty
#     "Toegekend op" value.
# NOTE(review): presumably this mirrors the layout of the real registry export
# that the parser has to cope with - confirm against the parser/source data.
SAMPLE_ISIL_CSV = '''"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;
"1","Aalten","Nationaal Onderduikmuseum","NL-AtNOM","2021-03-17",""";;;;;
"2","Alkmaar","Regionaal Archief Alkmaar","NL-AmrRAA","2009-08-18",""";;;;;
"3","Amsterdam","Rijksmuseum","NL-AsdRM","2008-01-15","National museum";;;;;
"4","Den Haag","Nationaal Archief","NL-HANA","2007-05-01",""";;;;;
"5","Rotterdam","Museum Boijmans Van Beuningen","NL-RtMBVB","","In process";;;;;
'''


@pytest.fixture
def sample_csv_file():
    """Yield the path of a temporary CSV file holding SAMPLE_ISIL_CSV.

    The file is written as UTF-8 and closed before the path is handed to the
    test; it is removed again once the requesting test has finished.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.csv', delete=False, encoding='utf-8'
    )
    with handle:
        handle.write(SAMPLE_ISIL_CSV)
        csv_path = handle.name

    yield csv_path

    # Teardown: delete=False means nothing removes the file automatically.
    os.unlink(csv_path)


class TestISILRegistryRecord:
    """Exercise the ISILRegistryRecord Pydantic model."""

    def test_create_valid_record(self):
        """A fully populated record exposes every value it was built from."""
        fields = {
            "volgnr": 1,
            "plaats": "Amsterdam",
            "instelling": "Rijksmuseum",
            "isil_code": "NL-AsdRM",
            "toegekend_op": "2008-01-15",
            "opmerking": "National museum",
        }
        record = ISILRegistryRecord(**fields)

        # The date is passed in as a string and must come back as a datetime;
        # every other field is expected unchanged.
        assert record.volgnr == fields["volgnr"]
        assert record.plaats == fields["plaats"]
        assert record.instelling == fields["instelling"]
        assert record.isil_code == fields["isil_code"]
        assert record.toegekend_op == datetime(2008, 1, 15)
        assert record.opmerking == fields["opmerking"]

    def test_date_parsing(self):
        """ISO strings become datetimes; an empty string or None becomes None."""

        def build(volgnr, isil_code, raw_date):
            # Only the serial number, code and date vary between the cases.
            return ISILRegistryRecord(
                volgnr=volgnr,
                plaats="Test",
                instelling="Test",
                isil_code=isil_code,
                toegekend_op=raw_date,
            )

        # ISO format
        iso_record = build(1, "NL-TEST", "2021-03-17")
        assert iso_record.toegekend_op == datetime(2021, 3, 17)

        # Empty date
        blank_record = build(2, "NL-TEST2", "")
        assert blank_record.toegekend_op is None

        # None date
        none_record = build(3, "NL-TEST3", None)
        assert none_record.toegekend_op is None

    def test_isil_validation(self):
        """Well-formed ISIL codes are kept; malformed or empty ones raise ValueError."""
        common = {"volgnr": 1, "plaats": "Test", "instelling": "Test"}

        # Codes that must be accepted and stored verbatim.
        for accepted in ("NL-AsdRM", "NL-HANA", "NL-AtNOM", "NL-Test123"):
            record = ISILRegistryRecord(isil_code=accepted, **common)
            assert record.isil_code == accepted

        # Codes that must be rejected, paired with the expected error message.
        rejected = (
            ("INVALID", "Invalid ISIL code format"),
            ("", "ISIL code cannot be empty"),
        )
        for bad_code, message in rejected:
            with pytest.raises(ValueError, match=message):
                ISILRegistryRecord(isil_code=bad_code, **common)


class TestISILRegistryParser:
    """Exercise ISILRegistryParser: file parsing and custodian conversion."""

    def test_parse_file(self, sample_csv_file):
        """Parsing the sample file yields all five rows with their values."""
        records = ISILRegistryParser().parse_file(sample_csv_file)

        assert len(records) == 5

        # First row: plain record with an assignment date and no remark.
        first = records[0]
        assert first.volgnr == 1
        assert first.plaats == "Aalten"
        assert first.instelling == "Nationaal Onderduikmuseum"
        assert first.isil_code == "NL-AtNOM"
        assert first.toegekend_op == datetime(2021, 3, 17)

        # Third row: carries a remark.
        with_remark = records[2]
        assert with_remark.volgnr == 3
        assert with_remark.instelling == "Rijksmuseum"
        assert with_remark.opmerking == "National museum"

        # Fifth row: empty date column but a remark.
        undated = records[4]
        assert undated.volgnr == 5
        assert undated.toegekend_op is None
        assert undated.opmerking == "In process"

    def test_parse_nonexistent_file(self):
        """A path that does not exist makes parse_file raise FileNotFoundError."""
        missing_path = "/nonexistent/file.csv"
        parser = ISILRegistryParser()

        with pytest.raises(FileNotFoundError):
            parser.parse_file(missing_path)

    def test_to_heritage_custodian(self):
        """One ISIL record converts into a fully populated HeritageCustodian."""
        source_record = ISILRegistryRecord(
            volgnr=1,
            plaats="Amsterdam",
            instelling="Rijksmuseum",
            isil_code="NL-AsdRM",
            toegekend_op="2008-01-15",
            opmerking="National museum",
        )

        custodian = ISILRegistryParser().to_heritage_custodian(
            source_record, "/path/to/csv"
        )

        # Name and single location
        assert custodian.name == "Rijksmuseum"
        assert len(custodian.locations) == 1
        location = custodian.locations[0]
        assert location.city == "Amsterdam"
        assert location.country == "NL"

        # Single ISIL identifier
        assert len(custodian.identifiers) == 1
        identifier = custodian.identifiers[0]
        assert identifier.identifier_scheme == "ISIL"
        assert identifier.identifier_value == "NL-AsdRM"
        assert str(identifier.identifier_url) == "https://isil.nl/NL-AsdRM"

        # Provenance (compared through str() for the enum/date-like fields)
        provenance = custodian.provenance
        assert str(provenance.data_source) == 'ISIL_REGISTRY'
        assert str(provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
        assert provenance.extraction_method == "ISILRegistryParser with GHCID generation"
        assert provenance.confidence_score == 1.0
        assert str(provenance.verified_date) == '2008-01-15T00:00:00'  # ISO 8601 string
        assert provenance.verified_by == "National Library of the Netherlands (KB)"

        # GHCID fields (Amsterdam is in lookup table)
        for ghcid_attr in ("ghcid_numeric", "ghcid_current", "ghcid_original"):
            assert getattr(custodian, ghcid_attr) is not None
        # First assignment: current and original identifiers coincide.
        assert custodian.ghcid_current == custodian.ghcid_original
        assert custodian.ghcid_history is not None
        assert len(custodian.ghcid_history) == 1

        # Verify GHCID format: NL-NH-AMS-M-R (Rijksmuseum → R)
        assert custodian.ghcid_current.startswith("NL-NH-AMS-M-")

        # The remark (opmerking) ends up in the description.
        assert custodian.description == "National museum"

    def test_parse_and_convert(self, sample_csv_file):
        """End to end: every row of the sample file becomes a custodian."""
        custodians = ISILRegistryParser().parse_and_convert(sample_csv_file)

        assert len(custodians) == 5

        # First custodian
        head = custodians[0]
        assert head.name == "Nationaal Onderduikmuseum"
        assert head.locations[0].city == "Aalten"
        assert head.identifiers[0].identifier_value == "NL-AtNOM"

        # Rijksmuseum, looked up by name rather than by position
        named_rijksmuseum = [
            entry for entry in custodians if entry.name == "Rijksmuseum"
        ]
        rijksmuseum = named_rijksmuseum[0]
        assert rijksmuseum.locations[0].city == "Amsterdam"
        assert rijksmuseum.identifiers[0].identifier_value == "NL-AsdRM"
        assert rijksmuseum.description == "National museum"

        # Every custodian must carry TIER_1 provenance with full confidence.
        for entry in custodians:
            provenance = entry.provenance
            assert str(provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
            assert provenance.confidence_score == 1.0


class TestISILParserEdgeCases:
    """Test edge cases and error handling"""

    @staticmethod
    def _write_temp_csv(content):
        """Write *content* to a new temporary ``.csv`` file and return its path.

        The file is written as UTF-8 (the same encoding the module-level
        sample fixture uses) so the bytes on disk do not depend on the
        platform's locale. The caller is responsible for deleting the file.
        """
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.csv', delete=False, encoding='utf-8'
        ) as f:
            f.write(content)
            return f.name

    def test_empty_csv(self):
        """Test parsing a CSV file that contains only the header row"""
        temp_path = self._write_temp_csv(
            '"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;'
        )

        try:
            parser = ISILRegistryParser()
            records = parser.parse_file(temp_path)
            # A header without data rows must produce no records.
            assert len(records) == 0
        finally:
            os.unlink(temp_path)

    def test_malformed_row_skipped(self):
        """Test that a malformed row is skipped and the valid rows are kept

        Only the skipping is asserted here; whether the parser also reports
        the bad row (e.g. via a warning) is not checked by this test.
        """
        # The middle row is semicolon-separated instead of comma-separated
        # and has a non-numeric first column.
        csv_data = '''"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;
"1","Amsterdam","Rijksmuseum","NL-AsdRM","2008-01-15",""";;;;;
"invalid";"row";"without";"proper";"format";;;;;
"3","Rotterdam","Test Museum","NL-TEST","2020-01-01",""";;;;;
'''

        temp_path = self._write_temp_csv(csv_data)

        try:
            parser = ISILRegistryParser()
            records = parser.parse_file(temp_path)

            # Should parse 2 valid records, skip the malformed one
            assert len(records) == 2
            assert records[0].instelling == "Rijksmuseum"
            assert records[1].instelling == "Test Museum"
        finally:
            os.unlink(temp_path)

    def test_path_as_string_or_pathlib(self, sample_csv_file):
        """Test that parser accepts both string and Path objects"""
        parser = ISILRegistryParser()

        # Test with string
        records_str = parser.parse_file(sample_csv_file)

        # Test with Path
        records_path = parser.parse_file(Path(sample_csv_file))

        # Both path types must lead to the same parse result.
        assert len(records_str) == len(records_path)
        assert records_str[0].instelling == records_path[0].instelling