glam/tests/parsers/test_isil_registry.py
2025-12-05 15:30:23 +01:00

261 lines
9.8 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Tests for ISIL Registry CSV Parser
"""
import pytest
from datetime import datetime
from pathlib import Path
import tempfile
import os
from glam_extractor.parsers.isil_registry import ISILRegistryParser, ISILRegistryRecord
from glam_extractor.models import DataTier, DataSource
# Sample ISIL CSV data (with unusual quote format).
# Each line holds six comma-separated, double-quoted fields followed by five
# trailing semicolons. The header line and every data row whose remark is empty
# end in a run of three double quotes before the semicolons; rows with a remark
# (rows 3 and 5) close the last field normally. Row 5 also has an empty
# "Toegekend op" value.
# NOTE(review): the literal must stay byte-identical -- the parser tests below
# depend on this exact quoting.
SAMPLE_ISIL_CSV = '''"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;
"1","Aalten","Nationaal Onderduikmuseum","NL-AtNOM","2021-03-17",""";;;;;
"2","Alkmaar","Regionaal Archief Alkmaar","NL-AmrRAA","2009-08-18",""";;;;;
"3","Amsterdam","Rijksmuseum","NL-AsdRM","2008-01-15","National museum";;;;;
"4","Den Haag","Nationaal Archief","NL-HANA","2007-05-01",""";;;;;
"5","Rotterdam","Museum Boijmans Van Beuningen","NL-RtMBVB","","In process";;;;;
'''
@pytest.fixture
def sample_csv_file():
    """Yield the path of a temporary UTF-8 ``.csv`` file holding SAMPLE_ISIL_CSV.

    The file is created with ``delete=False`` so it survives being closed,
    and is removed explicitly once the consuming test has finished.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.csv', delete=False, encoding='utf-8'
    )
    with handle:
        handle.write(SAMPLE_ISIL_CSV)
    csv_path = handle.name

    yield csv_path

    # Teardown: the temp file is not auto-deleted, so remove it by hand.
    os.unlink(csv_path)
class TestISILRegistryRecord:
    """Exercise the ISILRegistryRecord Pydantic model."""

    def test_create_valid_record(self):
        """A fully populated record exposes each field, with the date parsed."""
        fields = dict(
            volgnr=1,
            plaats="Amsterdam",
            instelling="Rijksmuseum",
            isil_code="NL-AsdRM",
            toegekend_op="2008-01-15",
            opmerking="National museum",
        )
        rec = ISILRegistryRecord(**fields)

        assert rec.volgnr == 1
        assert rec.plaats == "Amsterdam"
        assert rec.instelling == "Rijksmuseum"
        assert rec.isil_code == "NL-AsdRM"
        # The ISO date string is expected to come back as a datetime.
        assert rec.toegekend_op == datetime(2008, 1, 15)
        assert rec.opmerking == "National museum"

    def test_date_parsing(self):
        """ISO dates are parsed; empty-string and None dates both yield None."""
        # (volgnr, isil_code, raw date, expected parsed value)
        cases = (
            (1, "NL-TEST", "2021-03-17", datetime(2021, 3, 17)),  # ISO format
            (2, "NL-TEST2", "", None),                            # empty date
            (3, "NL-TEST3", None, None),                          # None date
        )
        for number, code, raw_date, expected in cases:
            rec = ISILRegistryRecord(
                volgnr=number,
                plaats="Test",
                instelling="Test",
                isil_code=code,
                toegekend_op=raw_date,
            )
            if expected is None:
                assert rec.toegekend_op is None
            else:
                assert rec.toegekend_op == expected

    def test_isil_validation(self):
        """Well-formed ISIL codes are kept as-is; bad or empty ones raise."""
        # Valid ISIL codes
        for good_code in ("NL-AsdRM", "NL-HANA", "NL-AtNOM", "NL-Test123"):
            rec = ISILRegistryRecord(
                volgnr=1, plaats="Test", instelling="Test", isil_code=good_code
            )
            assert rec.isil_code == good_code

        # Invalid ISIL codes, each paired with the expected error message
        rejected = (
            ("INVALID", "Invalid ISIL code format"),
            ("", "ISIL code cannot be empty"),
        )
        for bad_code, message in rejected:
            with pytest.raises(ValueError, match=message):
                ISILRegistryRecord(
                    volgnr=1, plaats="Test", instelling="Test", isil_code=bad_code
                )
class TestISILRegistryParser:
    """Exercise the ISILRegistryParser class."""

    def test_parse_file(self, sample_csv_file):
        """The sample CSV yields five records with the expected field values."""
        records = ISILRegistryParser().parse_file(sample_csv_file)

        assert len(records) == 5

        # First record
        first = records[0]
        assert first.volgnr == 1
        assert first.plaats == "Aalten"
        assert first.instelling == "Nationaal Onderduikmuseum"
        assert first.isil_code == "NL-AtNOM"
        assert first.toegekend_op == datetime(2021, 3, 17)

        # Record that carries a remark
        with_remark = records[2]
        assert with_remark.volgnr == 3
        assert with_remark.instelling == "Rijksmuseum"
        assert with_remark.opmerking == "National museum"

        # Record whose date column is empty
        undated = records[4]
        assert undated.volgnr == 5
        assert undated.toegekend_op is None
        assert undated.opmerking == "In process"

    def test_parse_nonexistent_file(self):
        """Parsing a path that does not exist raises FileNotFoundError."""
        parser = ISILRegistryParser()

        with pytest.raises(FileNotFoundError):
            parser.parse_file("/nonexistent/file.csv")

    def test_to_heritage_custodian(self):
        """A single record converts to a custodian with the expected fields."""
        parser = ISILRegistryParser()
        source_record = ISILRegistryRecord(
            volgnr=1,
            plaats="Amsterdam",
            instelling="Rijksmuseum",
            isil_code="NL-AsdRM",
            toegekend_op="2008-01-15",
            opmerking="National museum",
        )

        custodian = parser.to_heritage_custodian(source_record, "/path/to/csv")

        # Basic fields
        assert custodian.name == "Rijksmuseum"
        assert len(custodian.locations) == 1
        location = custodian.locations[0]
        assert location.city == "Amsterdam"
        assert location.country == "NL"

        # Identifier
        assert len(custodian.identifiers) == 1
        identifier = custodian.identifiers[0]
        assert identifier.identifier_scheme == "ISIL"
        assert identifier.identifier_value == "NL-AsdRM"
        assert str(identifier.identifier_url) == "https://isil.nl/NL-AsdRM"

        # Provenance
        provenance = custodian.provenance
        assert str(provenance.data_source) == 'ISIL_REGISTRY'
        assert str(provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
        assert provenance.extraction_method == "ISILRegistryParser with GHCID generation"
        assert provenance.confidence_score == 1.0
        # verified_date is compared via its string form (ISO 8601)
        assert str(provenance.verified_date) == '2008-01-15T00:00:00'
        assert provenance.verified_by == "National Library of the Netherlands (KB)"

        # GHCID fields are expected to be populated for Amsterdam
        assert custodian.ghcid_numeric is not None
        assert custodian.ghcid_current is not None
        assert custodian.ghcid_original is not None
        # On first assignment the current and original GHCID coincide.
        assert custodian.ghcid_current == custodian.ghcid_original
        assert custodian.ghcid_history is not None
        assert len(custodian.ghcid_history) == 1

        # Only the GHCID prefix is checked (expected shape: NL-NH-AMS-M-R,
        # where the trailing R stands for Rijksmuseum).
        assert custodian.ghcid_current.startswith("NL-NH-AMS-M-")

        # The remark (opmerking) ends up in the description.
        assert custodian.description == "National museum"

    def test_parse_and_convert(self, sample_csv_file):
        """End-to-end: parse the sample CSV and convert every row."""
        custodians = ISILRegistryParser().parse_and_convert(sample_csv_file)

        assert len(custodians) == 5

        # First custodian
        head = custodians[0]
        assert head.name == "Nationaal Onderduikmuseum"
        assert head.locations[0].city == "Aalten"
        assert head.identifiers[0].identifier_value == "NL-AtNOM"

        # Rijksmuseum, looked up by name
        matches = list(filter(lambda c: c.name == "Rijksmuseum", custodians))
        rijksmuseum = matches[0]
        assert rijksmuseum.locations[0].city == "Amsterdam"
        assert rijksmuseum.identifiers[0].identifier_value == "NL-AsdRM"
        assert rijksmuseum.description == "National museum"

        # Every custodian should carry TIER_1 provenance at full confidence.
        for entry in custodians:
            assert str(entry.provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
            assert entry.provenance.confidence_score == 1.0
class TestISILParserEdgeCases:
    """Test edge cases and error handling of ISILRegistryParser.parse_file."""

    @staticmethod
    def _write_temp_csv(content):
        """Write *content* to a new UTF-8 temporary ``.csv`` file; return its path.

        The encoding is given explicitly so the bytes on disk do not depend on
        the platform's locale, matching the ``sample_csv_file`` fixture. The
        file is created with ``delete=False``; the caller must remove it.
        """
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.csv', delete=False, encoding='utf-8'
        ) as f:
            f.write(content)
        return f.name

    def test_empty_csv(self):
        """Test parsing a CSV file that contains only the header row."""
        temp_path = self._write_temp_csv(
            '"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;'
        )
        try:
            parser = ISILRegistryParser()
            records = parser.parse_file(temp_path)
            assert len(records) == 0
        finally:
            # Always remove the temp file, even when an assertion fails.
            os.unlink(temp_path)

    def test_malformed_row_skipped(self):
        """Test that a malformed row is skipped while valid rows are kept.

        The middle data row uses semicolons between its fields instead of
        commas. Only the resulting records are checked; no assertion is made
        here about any warning being emitted.
        """
        csv_data = '''"Volgnr.","Plaats","Instelling","ISIL code","Toegekend op","Opmerking""";;;;;
"1","Amsterdam","Rijksmuseum","NL-AsdRM","2008-01-15",""";;;;;
"invalid";"row";"without";"proper";"format";;;;;
"3","Rotterdam","Test Museum","NL-TEST","2020-01-01",""";;;;;
'''
        temp_path = self._write_temp_csv(csv_data)
        try:
            parser = ISILRegistryParser()
            records = parser.parse_file(temp_path)
            # Should parse 2 valid records, skip the malformed one
            assert len(records) == 2
            assert records[0].instelling == "Rijksmuseum"
            assert records[1].instelling == "Test Museum"
        finally:
            # Always remove the temp file, even when an assertion fails.
            os.unlink(temp_path)

    def test_path_as_string_or_pathlib(self, sample_csv_file):
        """Test that parse_file accepts both ``str`` and ``Path`` arguments."""
        parser = ISILRegistryParser()
        # Test with string
        records_str = parser.parse_file(sample_csv_file)
        # Test with Path
        records_path = parser.parse_file(Path(sample_csv_file))
        # Both call styles must produce the same result.
        assert len(records_str) == len(records_path)
        assert records_str[0].instelling == records_path[0].instelling