329 lines
11 KiB
Python
329 lines
11 KiB
Python
"""
Test suite for Belgian ISIL registry parser.

Tests parsing, validation, and conversion of Belgian heritage institutions
from the KBR (Royal Library of Belgium) ISIL registry.
"""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
from glam_extractor.parsers.belgian_isil import (
|
|
BelgianISILParser,
|
|
BelgianISILRecord
|
|
)
|
|
from glam_extractor.models import (
|
|
HeritageCustodian,
|
|
InstitutionTypeEnum,
|
|
DataSourceEnum,
|
|
DataTierEnum,
|
|
)
|
|
|
|
|
|
@pytest.fixture
def sample_csv_path():
    """Return the path to the real Belgian ISIL CSV file.

    The path is relative, so it resolves against the current working
    directory of the test run. Tests that need the file check
    ``exists()`` and skip when it is missing.
    """
    return Path("data/isil/belgian_isil_detailed.csv")
|
|
|
|
|
|
@pytest.fixture
def parser():
    """Return a new ``BelgianISILParser`` constructed with no arguments."""
    return BelgianISILParser()
|
|
|
|
|
|
class TestBelgianISILRecord:
    """Validation tests for the ``BelgianISILRecord`` model."""

    def test_valid_library_isil_code(self):
        """A library-style code is accepted and stored unchanged."""
        record = BelgianISILRecord(
            isil_code="BE-OSE00",
            institution_name="Test Library",
            country="BE",
        )
        assert record.isil_code == "BE-OSE00"
        assert record.country == "BE"

    def test_valid_archive_isil_code(self):
        """An archive-style code (``BE-A`` prefix) is accepted unchanged."""
        record = BelgianISILRecord(
            isil_code="BE-A0001",
            institution_name="Test Archive",
            country="BE",
        )
        assert record.isil_code == "BE-A0001"

    def test_invalid_isil_code(self):
        """A code without the Belgian prefix is rejected with ``ValueError``."""
        with pytest.raises(ValueError, match="Invalid Belgian ISIL code format"):
            BelgianISILRecord(
                isil_code="NL-AmRMA",  # Dutch code, not Belgian
                institution_name="Test",
                country="BE",
            )

    def test_empty_isil_code(self):
        """An empty code is rejected with ``ValueError``."""
        with pytest.raises(ValueError, match="ISIL code cannot be empty"):
            BelgianISILRecord(
                isil_code="",
                institution_name="Test",
                country="BE",
            )

    def test_country_always_be(self):
        """The country is expected to be ``BE`` whatever the caller passes."""
        record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test",
            country="NL",  # Try to set to Netherlands
        )
        assert record.country == "BE"  # Should be forced to BE
|
|
|
|
|
|
class TestBelgianISILParser:
    """Tests for CSV parsing and institution-type inference."""

    def test_parse_real_file(self, parser, sample_csv_path):
        """Parse the real CSV and sanity-check the resulting records.

        Skipped when the CSV file is not present.
        """
        if not sample_csv_path.exists():
            pytest.skip("Belgian ISIL CSV file not found")

        records = parser.parse_file(sample_csv_path)

        assert len(records) > 400  # Should have 421 institutions
        assert all(isinstance(r, BelgianISILRecord) for r in records)
        assert all(r.isil_code.startswith("BE-") for r in records)

    def test_archive_detection_by_code(self, parser):
        """A ``BE-A`` record is inferred to be an archive."""
        # NOTE(review): the type field ("Archive") and the name
        # ("Stadsarchief") also hint at an archive, so this test does not
        # isolate the code-prefix rule on its own.
        record = BelgianISILRecord(
            isil_code="BE-A0001",
            institution_name="Stadsarchief Test",
            institution_type="Archive",
            country="BE",
        )

        # The second element of the returned pair is not checked here.
        inst_type, _ = parser._infer_institution_type(
            record.institution_type,
            record.institution_name,
            record.isil_code,
        )

        assert inst_type == InstitutionTypeEnum.ARCHIVE

    def test_library_detection(self, parser):
        """A record with a library type field and name is inferred as a library."""
        record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Bibliotheek Test",
            institution_type="Public library",
            country="BE",
        )

        # The second element of the returned pair is not checked here.
        inst_type, _ = parser._infer_institution_type(
            record.institution_type,
            record.institution_name,
            record.isil_code,
        )

        assert inst_type == InstitutionTypeEnum.LIBRARY

    def test_museum_detection(self, parser):
        """With no type field, a museum is inferred from the name alone."""
        record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Museum voor Schone Kunsten",
            institution_type=None,
            country="BE",
        )

        # The second element of the returned pair is not checked here.
        inst_type, _ = parser._infer_institution_type(
            record.institution_type,
            record.institution_name,
            record.isil_code,
        )

        assert inst_type == InstitutionTypeEnum.MUSEUM
|
|
|
|
|
|
class TestBelgianISILConversion:
    """Tests for converting a record into a ``HeritageCustodian``."""

    def test_convert_to_heritage_custodian(self, parser):
        """Core fields carry over from the record to the custodian."""
        record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test Bibliotheek",
            alternative_names="Test Library",
            institution_type="Public library",
            parent_organization="Municipality",
            country="BE",
        )

        custodian = parser.to_heritage_custodian(record)

        assert isinstance(custodian, HeritageCustodian)
        assert custodian.id == "BE-TEST00"
        assert custodian.name == "Test Bibliotheek"
        assert str(custodian.institution_type) == "LIBRARY"
        assert "Test Library" in custodian.alternative_names

    def test_provenance_metadata(self, parser):
        """Provenance carries the expected source, tier, score and verifier."""
        record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test",
            country="BE",
        )

        custodian = parser.to_heritage_custodian(record)
        provenance = custodian.provenance

        assert provenance.data_source == DataSourceEnum.CSV_REGISTRY
        assert provenance.data_tier == DataTierEnum.TIER_1_AUTHORITATIVE
        assert provenance.confidence_score == 1.0
        assert provenance.verified_by == "Royal Library of Belgium (KBR)"

    def test_identifiers_creation(self, parser):
        """An ISIL identifier is created with the code and a KBR URL."""
        record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test",
            website="https://test.be",
            country="BE",
        )

        custodian = parser.to_heritage_custodian(record)

        assert len(custodian.identifiers) >= 1

        # Check ISIL identifier; next() without a default raises
        # StopIteration (failing the test) if none is present.
        isil_id = next(
            i for i in custodian.identifiers if i.identifier_scheme == "ISIL"
        )
        assert isil_id.identifier_value == "BE-TEST00"
        assert "isil.kbr.be" in isil_id.identifier_url

        # Check website identifier.
        # NOTE(review): this check only runs when a "Website" identifier
        # exists, so the test still passes if the parser emits none —
        # confirm whether that is intended.
        web_id = next(
            (i for i in custodian.identifiers if i.identifier_scheme == "Website"),
            None,
        )
        if web_id:
            assert web_id.identifier_value == "https://test.be"

    def test_alternative_names_collection(self, parser):
        """English name, alternative name and acronym are all collected."""
        record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Bibliotheek Test",
            institution_name_en="Test Library",
            alternative_names="Bib Test",
            acronym="BT",
            country="BE",
        )

        custodian = parser.to_heritage_custodian(record)

        assert len(custodian.alternative_names) == 3
        assert "Bib Test" in custodian.alternative_names
        assert "BT" in custodian.alternative_names
        assert "Test Library" in custodian.alternative_names

    def test_description_building(self, parser):
        """The description combines parent, collection, notes and access."""
        record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test",
            parent_organization="Municipality",
            collection_description="Historic collection",
            notes="Important archive",
            publicly_accessible="Yes",
            country="BE",
        )

        custodian = parser.to_heritage_custodian(record)

        assert custodian.description is not None
        assert "Municipality" in custodian.description
        assert "Historic collection" in custodian.description
        assert "Important archive" in custodian.description
        assert "Publicly accessible: Yes" in custodian.description
|
|
|
|
|
|
class TestBelgianISILIntegration:
    """Integration tests that run the full pipeline on the real CSV file.

    Every test is skipped when the CSV file is not present.
    """

    @staticmethod
    def _convert_or_skip(parser, csv_path):
        """Return the converted custodians, or skip the test if the CSV is missing."""
        if not csv_path.exists():
            pytest.skip("Belgian ISIL CSV file not found")
        return parser.parse_and_convert(csv_path)

    def test_parse_and_convert_real_data(self, parser, sample_csv_path):
        """Every converted custodian has its required fields and an ISIL identifier."""
        custodians = self._convert_or_skip(parser, sample_csv_path)

        # Check count
        assert len(custodians) > 400

        # Check all are HeritageCustodian instances
        assert all(isinstance(c, HeritageCustodian) for c in custodians)

        # Check all have required fields
        assert all(c.id for c in custodians)
        assert all(c.name for c in custodians)
        assert all(c.institution_type for c in custodians)
        assert all(c.provenance for c in custodians)

        # Check all have ISIL identifiers
        assert all(c.identifiers for c in custodians)
        assert all(
            any(i.identifier_scheme == "ISIL" for i in c.identifiers)
            for c in custodians
        )

    def test_institution_type_distribution(self, parser, sample_csv_path):
        """The mix of institution types looks plausible for this registry."""
        custodians = self._convert_or_skip(parser, sample_csv_path)

        # Imported locally so this class does not depend on a module-level
        # import of Counter.
        from collections import Counter

        type_counts = Counter(str(c.institution_type) for c in custodians)

        # Belgian registry is predominantly libraries
        assert type_counts["LIBRARY"] > 300

        # Should have some archives (BE-A codes)
        assert type_counts["ARCHIVE"] > 0

        # Might have a few museums
        # (no assertion - could be 0)

    def test_batch_extraction_date(self, parser, sample_csv_path):
        """All custodians from one conversion share a single extraction date."""
        custodians = self._convert_or_skip(parser, sample_csv_path)

        # All records should have the same extraction date (batch import)
        extraction_dates = {c.provenance.extraction_date for c in custodians}
        assert len(extraction_dates) == 1  # Only one unique date

    def test_archive_code_prefix(self, parser, sample_csv_path):
        """More than 80% of ``BE-A`` custodians are classified as archives."""
        custodians = self._convert_or_skip(parser, sample_csv_path)

        # Get all BE-A codes
        a_codes = [c for c in custodians if c.id.startswith("BE-A")]

        # Count the BE-A custodians classified as archives
        archives = [
            c for c in a_codes
            if c.institution_type == InstitutionTypeEnum.ARCHIVE
        ]

        # Should have high match rate (allowing for some museums/libraries
        # with A codes). The strict ">" also fails when there are no BE-A
        # codes at all, because 0 > 0 is false.
        assert len(archives) > len(a_codes) * 0.8, (
            f"only {len(archives)} of {len(a_codes)} BE-A custodians "
            "were classified as archives"
        )
|
|
|
|
|
|
if __name__ == "__main__":
    # pytest.main() returns the exit code of the run; pass it to the
    # interpreter so that running this file as a script reports failures
    # through its exit status instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|