261 lines
9.8 KiB
Python
261 lines
9.8 KiB
Python
"""
Tests for ISIL Registry CSV Parser
"""
|
||
|
||
import pytest
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
import tempfile
|
||
import os
|
||
|
||
from glam_extractor.parsers.isil_registry import ISILRegistryParser, ISILRegistryRecord
|
||
from glam_extractor.models import DataTier, DataSource
|
||
|
||
|
||
# Sample ISIL CSV data (with unusual quote format): fields are comma-separated
# and double-quoted, and every row ends in a stray quote sequence followed by
# five semicolons. Each row below becomes one newline-terminated line.
SAMPLE_ISIL_CSV = "".join(
    row + "\n"
    for row in (
        '"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;',
        '"1","Aalten","Nationaal Onderduikmuseum","NL-AtNOM","2021-03-17",""";;;;;',
        '"2","Alkmaar","Regionaal Archief Alkmaar","NL-AmrRAA","2009-08-18",""";;;;;',
        '"3","Amsterdam","Rijksmuseum","NL-AsdRM","2008-01-15","National museum";;;;;',
        '"4","Den Haag","Nationaal Archief","NL-HANA","2007-05-01",""";;;;;',
        '"5","Rotterdam","Museum Boijmans Van Beuningen","NL-RtMBVB","","In process";;;;;',
    )
)
|
||
|
||
|
||
@pytest.fixture
def sample_csv_file():
    """Yield the path of a temporary CSV file containing SAMPLE_ISIL_CSV.

    The file is written as UTF-8 and removed again after the requesting
    test has finished with it.
    """
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.csv', delete=False, encoding='utf-8'
    ) as csv_handle:
        csv_handle.write(SAMPLE_ISIL_CSV)

    # delete=False keeps the file on disk after the handle is closed.
    csv_path = csv_handle.name

    yield csv_path

    # Teardown: the file was created with delete=False, so remove it by hand.
    os.unlink(csv_path)
|
||
|
||
|
||
class TestISILRegistryRecord:
    """Exercise the ISILRegistryRecord Pydantic model."""

    def test_create_valid_record(self):
        """A fully populated record exposes each field that was supplied."""
        supplied = dict(
            volgnr=1,
            plaats="Amsterdam",
            instelling="Rijksmuseum",
            isil_code="NL-AsdRM",
            toegekend_op="2008-01-15",
            opmerking="National museum",
        )
        record = ISILRegistryRecord(**supplied)

        # The date string is expected back as a datetime; every other field
        # is expected back unchanged.
        expected = {**supplied, "toegekend_op": datetime(2008, 1, 15)}
        for field_name, value in expected.items():
            assert getattr(record, field_name) == value

    def test_date_parsing(self):
        """toegekend_op accepts an ISO date string, an empty string, or None."""
        shared = {"plaats": "Test", "instelling": "Test"}

        # ISO format
        iso_dated = ISILRegistryRecord(
            volgnr=1, isil_code="NL-TEST", toegekend_op="2021-03-17", **shared
        )
        assert iso_dated.toegekend_op == datetime(2021, 3, 17)

        # Empty date
        blank_dated = ISILRegistryRecord(
            volgnr=2, isil_code="NL-TEST2", toegekend_op="", **shared
        )
        assert blank_dated.toegekend_op is None

        # None date
        undated = ISILRegistryRecord(
            volgnr=3, isil_code="NL-TEST3", toegekend_op=None, **shared
        )
        assert undated.toegekend_op is None

    def test_isil_validation(self):
        """Well-formed ISIL codes are kept as-is; invalid or empty ones raise."""

        def build(code):
            return ISILRegistryRecord(
                volgnr=1, plaats="Test", instelling="Test", isil_code=code
            )

        # Valid ISIL codes
        for accepted in ("NL-AsdRM", "NL-HANA", "NL-AtNOM", "NL-Test123"):
            assert build(accepted).isil_code == accepted

        # Invalid ISIL codes, each paired with the error text it must produce
        rejected = (
            ("INVALID", "Invalid ISIL code format"),
            ("", "ISIL code cannot be empty"),
        )
        for code, message in rejected:
            with pytest.raises(ValueError, match=message):
                build(code)
|
||
|
||
|
||
class TestISILRegistryParser:
    """Exercise the ISILRegistryParser class."""

    def test_parse_file(self, sample_csv_file):
        """Parsing the sample file returns all five rows with their fields."""
        parsed = ISILRegistryParser().parse_file(sample_csv_file)

        assert len(parsed) == 5
        first, with_remark, undated = parsed[0], parsed[2], parsed[4]

        # Check first record
        assert first.volgnr == 1
        assert first.plaats == "Aalten"
        assert first.instelling == "Nationaal Onderduikmuseum"
        assert first.isil_code == "NL-AtNOM"
        assert first.toegekend_op == datetime(2021, 3, 17)

        # Check record with remark
        assert with_remark.volgnr == 3
        assert with_remark.instelling == "Rijksmuseum"
        assert with_remark.opmerking == "National museum"

        # Check record with empty date
        assert undated.volgnr == 5
        assert undated.toegekend_op is None
        assert undated.opmerking == "In process"

    def test_parse_nonexistent_file(self):
        """A path that does not exist raises FileNotFoundError."""
        registry_parser = ISILRegistryParser()

        with pytest.raises(FileNotFoundError):
            registry_parser.parse_file("/nonexistent/file.csv")

    def test_to_heritage_custodian(self):
        """Converting a record fills name, location, identifier, provenance and GHCID."""
        source_record = ISILRegistryRecord(
            volgnr=1,
            plaats="Amsterdam",
            instelling="Rijksmuseum",
            isil_code="NL-AsdRM",
            toegekend_op="2008-01-15",
            opmerking="National museum",
        )

        custodian = ISILRegistryParser().to_heritage_custodian(
            source_record, "/path/to/csv"
        )

        # Check basic fields
        assert custodian.name == "Rijksmuseum"
        assert len(custodian.locations) == 1
        location = custodian.locations[0]
        assert location.city == "Amsterdam"
        assert location.country == "NL"

        # Check identifier
        assert len(custodian.identifiers) == 1
        identifier = custodian.identifiers[0]
        assert identifier.identifier_scheme == "ISIL"
        assert identifier.identifier_value == "NL-AsdRM"
        assert str(identifier.identifier_url) == "https://isil.nl/NL-AsdRM"

        # Check provenance
        provenance = custodian.provenance
        assert str(provenance.data_source) == 'ISIL_REGISTRY'
        assert str(provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
        assert provenance.extraction_method == "ISILRegistryParser with GHCID generation"
        assert provenance.confidence_score == 1.0
        assert str(provenance.verified_date) == '2008-01-15T00:00:00'  # ISO 8601 string
        assert provenance.verified_by == "National Library of the Netherlands (KB)"

        # Check GHCID fields (Amsterdam is in lookup table)
        assert custodian.ghcid_numeric is not None
        assert custodian.ghcid_current is not None
        assert custodian.ghcid_original is not None
        assert custodian.ghcid_current == custodian.ghcid_original  # First assignment
        assert custodian.ghcid_history is not None
        assert len(custodian.ghcid_history) == 1

        # Verify GHCID format: NL-NH-AMS-M-R (Rijksmuseum → R)
        assert custodian.ghcid_current.startswith("NL-NH-AMS-M-")

        # Check description (opmerking is stored here)
        assert custodian.description == "National museum"

    def test_parse_and_convert(self, sample_csv_file):
        """End-to-end: the sample file converts into five custodians."""
        converted = ISILRegistryParser().parse_and_convert(sample_csv_file)

        assert len(converted) == 5

        # Check first custodian
        leading = converted[0]
        assert leading.name == "Nationaal Onderduikmuseum"
        assert leading.locations[0].city == "Aalten"
        assert leading.identifiers[0].identifier_value == "NL-AtNOM"

        # Check Rijksmuseum
        named_rijksmuseum = [
            entry for entry in converted if entry.name == "Rijksmuseum"
        ]
        rijksmuseum = named_rijksmuseum[0]
        assert rijksmuseum.locations[0].city == "Amsterdam"
        assert rijksmuseum.identifiers[0].identifier_value == "NL-AsdRM"
        assert rijksmuseum.description == "National museum"

        # All should have TIER_1 provenance
        for entry in converted:
            provenance = entry.provenance
            assert str(provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
            assert provenance.confidence_score == 1.0
|
||
|
||
|
||
class TestISILParserEdgeCases:
    """Test edge cases and error handling"""

    @staticmethod
    def _write_temp_csv(content):
        """Write *content* to a new temporary ``.csv`` file and return its path.

        The file is created with ``delete=False``, so it stays on disk after
        being closed and the caller must remove it. UTF-8 is requested
        explicitly, as the ``sample_csv_file`` fixture does, so the bytes
        written do not depend on the platform's default encoding.

        Args:
            content: Text to write to the file.

        Returns:
            The path of the written file, as a string.
        """
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.csv', delete=False, encoding='utf-8'
        ) as f:
            f.write(content)
        return f.name

    def test_empty_csv(self):
        """Test parsing a CSV file that contains only the header row"""
        temp_path = self._write_temp_csv(
            '"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;'
        )

        try:
            parser = ISILRegistryParser()
            records = parser.parse_file(temp_path)
            assert len(records) == 0
        finally:
            os.unlink(temp_path)

    def test_malformed_row_skipped(self):
        """Test that malformed rows are skipped and valid rows are kept.

        Only the resulting records are checked; any warning emitted for the
        skipped row is not asserted here.
        """
        # The third line uses ';' between fields instead of ',', unlike the
        # valid rows around it.
        csv_data = (
            '"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;\n'
            '"1","Amsterdam","Rijksmuseum","NL-AsdRM","2008-01-15",""";;;;;\n'
            '"invalid";"row";"without";"proper";"format";;;;;\n'
            '"3","Rotterdam","Test Museum","NL-TEST","2020-01-01",""";;;;;\n'
        )
        temp_path = self._write_temp_csv(csv_data)

        try:
            parser = ISILRegistryParser()
            records = parser.parse_file(temp_path)

            # Should parse 2 valid records, skip the malformed one
            assert len(records) == 2
            assert records[0].instelling == "Rijksmuseum"
            assert records[1].instelling == "Test Museum"
        finally:
            os.unlink(temp_path)

    def test_path_as_string_or_pathlib(self, sample_csv_file):
        """Test that parser accepts both string and Path objects"""
        parser = ISILRegistryParser()

        # Test with string
        records_str = parser.parse_file(sample_csv_file)

        # Test with Path
        records_path = parser.parse_file(Path(sample_csv_file))

        assert len(records_str) == len(records_path)
        assert records_str[0].instelling == records_path[0].instelling