""" Tests for ISIL Registry CSV Parser """ import pytest from datetime import datetime from pathlib import Path import tempfile import os from glam_extractor.parsers.isil_registry import ISILRegistryParser, ISILRegistryRecord from glam_extractor.models import DataTier, DataSource # Sample ISIL CSV data (with unusual quote format) SAMPLE_ISIL_CSV = '''"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;; "1","Aalten","Nationaal Onderduikmuseum","NL-AtNOM","2021-03-17",""";;;;; "2","Alkmaar","Regionaal Archief Alkmaar","NL-AmrRAA","2009-08-18",""";;;;; "3","Amsterdam","Rijksmuseum","NL-AsdRM","2008-01-15","National museum";;;;; "4","Den Haag","Nationaal Archief","NL-HANA","2007-05-01",""";;;;; "5","Rotterdam","Museum Boijmans Van Beuningen","NL-RtMBVB","","In process";;;;; ''' @pytest.fixture def sample_csv_file(): """Create a temporary CSV file with sample data""" with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8') as f: f.write(SAMPLE_ISIL_CSV) temp_path = f.name yield temp_path # Cleanup os.unlink(temp_path) class TestISILRegistryRecord: """Test ISILRegistryRecord Pydantic model""" def test_create_valid_record(self): """Test creating a valid ISIL registry record""" record = ISILRegistryRecord( volgnr=1, plaats="Amsterdam", instelling="Rijksmuseum", isil_code="NL-AsdRM", toegekend_op="2008-01-15", opmerking="National museum" ) assert record.volgnr == 1 assert record.plaats == "Amsterdam" assert record.instelling == "Rijksmuseum" assert record.isil_code == "NL-AsdRM" assert record.toegekend_op == datetime(2008, 1, 15) assert record.opmerking == "National museum" def test_date_parsing(self): """Test various date formats""" # ISO format record1 = ISILRegistryRecord( volgnr=1, plaats="Test", instelling="Test", isil_code="NL-TEST", toegekend_op="2021-03-17" ) assert record1.toegekend_op == datetime(2021, 3, 17) # Empty date record2 = ISILRegistryRecord( volgnr=2, plaats="Test", instelling="Test", isil_code="NL-TEST2", toegekend_op="" ) assert record2.toegekend_op is None # None date record3 = ISILRegistryRecord( volgnr=3, plaats="Test", instelling="Test", isil_code="NL-TEST3", toegekend_op=None ) assert record3.toegekend_op is None def test_isil_validation(self): """Test ISIL code validation""" # Valid ISIL codes valid_codes = ["NL-AsdRM", "NL-HANA", "NL-AtNOM", "NL-Test123"] for code in valid_codes: record = ISILRegistryRecord( volgnr=1, plaats="Test", instelling="Test", isil_code=code ) assert record.isil_code == code # Invalid ISIL codes with pytest.raises(ValueError, match="Invalid ISIL code format"): ISILRegistryRecord( volgnr=1, plaats="Test", instelling="Test", isil_code="INVALID" ) with pytest.raises(ValueError, match="ISIL code cannot be empty"): ISILRegistryRecord( volgnr=1, plaats="Test", instelling="Test", isil_code="" ) class TestISILRegistryParser: """Test ISILRegistryParser class""" def test_parse_file(self, sample_csv_file): """Test parsing a complete CSV file""" parser = ISILRegistryParser() records = parser.parse_file(sample_csv_file) assert len(records) == 5 # Check first record assert records[0].volgnr == 1 assert records[0].plaats == "Aalten" assert records[0].instelling == "Nationaal Onderduikmuseum" assert records[0].isil_code == "NL-AtNOM" assert records[0].toegekend_op == datetime(2021, 3, 17) # Check record with remark assert records[2].volgnr == 3 assert records[2].instelling == "Rijksmuseum" assert records[2].opmerking == "National museum" # Check record with empty date assert records[4].volgnr == 5 assert records[4].toegekend_op is None assert records[4].opmerking == "In process" def test_parse_nonexistent_file(self): """Test parsing a file that doesn't exist""" parser = ISILRegistryParser() with pytest.raises(FileNotFoundError): parser.parse_file("/nonexistent/file.csv") def test_to_heritage_custodian(self): """Test converting ISIL record to HeritageCustodian""" parser = ISILRegistryParser() record = ISILRegistryRecord( volgnr=1, plaats="Amsterdam", instelling="Rijksmuseum", isil_code="NL-AsdRM", toegekend_op="2008-01-15", opmerking="National museum" ) custodian = parser.to_heritage_custodian(record, "/path/to/csv") # Check basic fields assert custodian.name == "Rijksmuseum" assert len(custodian.locations) == 1 assert custodian.locations[0].city == "Amsterdam" assert custodian.locations[0].country == "NL" # Check identifier assert len(custodian.identifiers) == 1 assert custodian.identifiers[0].identifier_scheme == "ISIL" assert custodian.identifiers[0].identifier_value == "NL-AsdRM" assert str(custodian.identifiers[0].identifier_url) == "https://isil.nl/NL-AsdRM" # Check provenance assert str(custodian.provenance.data_source) == 'ISIL_REGISTRY' assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE' assert custodian.provenance.extraction_method == "ISILRegistryParser with GHCID generation" assert custodian.provenance.confidence_score == 1.0 assert str(custodian.provenance.verified_date) == '2008-01-15T00:00:00' # ISO 8601 string assert custodian.provenance.verified_by == "National Library of the Netherlands (KB)" # Check GHCID fields (Amsterdam is in lookup table) assert custodian.ghcid_numeric is not None assert custodian.ghcid_current is not None assert custodian.ghcid_original is not None assert custodian.ghcid_current == custodian.ghcid_original # First assignment assert custodian.ghcid_history is not None assert len(custodian.ghcid_history) == 1 # Verify GHCID format: NL-NH-AMS-M-R (Rijksmuseum → R) assert custodian.ghcid_current.startswith("NL-NH-AMS-M-") # Check description (opmerking is stored here) assert custodian.description == "National museum" def test_parse_and_convert(self, sample_csv_file): """Test end-to-end parsing and conversion""" parser = ISILRegistryParser() custodians = parser.parse_and_convert(sample_csv_file) assert len(custodians) == 5 # Check first custodian assert custodians[0].name == "Nationaal Onderduikmuseum" assert custodians[0].locations[0].city == "Aalten" assert custodians[0].identifiers[0].identifier_value == "NL-AtNOM" # Check Rijksmuseum rijksmuseum = [c for c in custodians if c.name == "Rijksmuseum"][0] assert rijksmuseum.locations[0].city == "Amsterdam" assert rijksmuseum.identifiers[0].identifier_value == "NL-AsdRM" assert rijksmuseum.description == "National museum" # All should have TIER_1 provenance for custodian in custodians: assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE' assert custodian.provenance.confidence_score == 1.0 class TestISILParserEdgeCases: """Test edge cases and error handling""" def test_empty_csv(self): """Test parsing an empty CSV file""" with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: f.write('"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;') temp_path = f.name try: parser = ISILRegistryParser() records = parser.parse_file(temp_path) assert len(records) == 0 finally: os.unlink(temp_path) def test_malformed_row_skipped(self): """Test that malformed rows are skipped with warning""" csv_data = '''"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;; "1","Amsterdam","Rijksmuseum","NL-AsdRM","2008-01-15",""";;;;; "invalid";"row";"without";"proper";"format";;;;; "3","Rotterdam","Test Museum","NL-TEST","2020-01-01",""";;;;; ''' with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: f.write(csv_data) temp_path = f.name try: parser = ISILRegistryParser() records = parser.parse_file(temp_path) # Should parse 2 valid records, skip the malformed one assert len(records) == 2 assert records[0].instelling == "Rijksmuseum" assert records[1].instelling == "Test Museum" finally: os.unlink(temp_path) def test_path_as_string_or_pathlib(self, sample_csv_file): """Test that parser accepts both string and Path objects""" parser = ISILRegistryParser() # Test with string records_str = parser.parse_file(sample_csv_file) # Test with Path records_path = parser.parse_file(Path(sample_csv_file)) assert len(records_str) == len(records_path) assert records_str[0].instelling == records_path[0].instelling