glam/tests/parsers/test_belgian_isil.py.bak
2025-11-19 23:25:22 +01:00

329 lines
11 KiB
Python

"""
Test suite for Belgian ISIL registry parser.
Tests parsing, validation, and conversion of Belgian heritage institutions
from the KBR (Royal Library of Belgium) ISIL registry.
"""
import pytest
from pathlib import Path
from datetime import datetime, timezone
from glam_extractor.parsers.belgian_isil import (
BelgianISILParser,
BelgianISILRecord
)
from glam_extractor.models import (
HeritageCustodian,
InstitutionTypeEnum,
DataSourceEnum,
DataTierEnum,
)
@pytest.fixture
def sample_csv_path():
    """Relative path to the real Belgian ISIL CSV file used by the data-backed tests."""
    csv_location = Path("data/isil/belgian_isil_detailed.csv")
    return csv_location
@pytest.fixture
def parser():
    """Provide a ``BelgianISILParser`` constructed with no arguments."""
    isil_parser = BelgianISILParser()
    return isil_parser
class TestBelgianISILRecord:
    """Validation behaviour of the ``BelgianISILRecord`` model."""

    def test_valid_library_isil_code(self):
        """A valid library ISIL code is accepted and stored unchanged."""
        fields = {
            "isil_code": "BE-OSE00",
            "institution_name": "Test Library",
            "country": "BE",
        }
        library = BelgianISILRecord(**fields)

        assert library.isil_code == "BE-OSE00"
        assert library.country == "BE"

    def test_valid_archive_isil_code(self):
        """A valid archive ISIL code (``BE-A`` prefix) is accepted and stored unchanged."""
        fields = {
            "isil_code": "BE-A0001",
            "institution_name": "Test Archive",
            "country": "BE",
        }
        archive = BelgianISILRecord(**fields)

        assert archive.isil_code == "BE-A0001"

    def test_invalid_isil_code(self):
        """A code in a non-Belgian format must be rejected with ``ValueError``."""
        foreign_fields = {
            "isil_code": "NL-AmRMA",  # Dutch code, not Belgian
            "institution_name": "Test",
            "country": "BE",
        }
        with pytest.raises(ValueError, match="Invalid Belgian ISIL code format"):
            BelgianISILRecord(**foreign_fields)

    def test_empty_isil_code(self):
        """An empty ISIL code must be rejected with ``ValueError``."""
        blank_fields = {
            "isil_code": "",
            "institution_name": "Test",
            "country": "BE",
        }
        with pytest.raises(ValueError, match="ISIL code cannot be empty"):
            BelgianISILRecord(**blank_fields)

    def test_country_always_be(self):
        """The country is expected to come back as ``BE`` whatever value is supplied."""
        fields = {
            "isil_code": "BE-TEST00",
            "institution_name": "Test",
            "country": "NL",  # Deliberately pass the Netherlands
        }
        overridden = BelgianISILRecord(**fields)

        # The model should have replaced the supplied country with BE.
        assert overridden.country == "BE"
class TestBelgianISILParser:
    """CSV parsing and institution-type inference of ``BelgianISILParser``."""

    def test_parse_real_file(self, parser, sample_csv_path):
        """Parse the real registry CSV; skipped when the file is not present."""
        csv_available = sample_csv_path.exists()
        if not csv_available:
            pytest.skip("Belgian ISIL CSV file not found")

        parsed = parser.parse_file(sample_csv_path)

        # The original author expects 421 institutions in the export.
        assert len(parsed) > 400
        for entry in parsed:
            assert isinstance(entry, BelgianISILRecord)
        for entry in parsed:
            assert entry.isil_code.startswith("BE-")

    def test_archive_detection_by_code(self, parser):
        """A record with a ``BE-A`` code and type "Archive" is inferred as an archive."""
        # Hand-built archive record; no CSV involved.
        archive_record = BelgianISILRecord(
            isil_code="BE-A0001",
            institution_name="Stadsarchief Test",
            institution_type="Archive",
            country="BE",
        )
        type_field = archive_record.institution_type
        name_field = archive_record.institution_name
        code_field = archive_record.isil_code

        # The helper returns a pair; only the first element is checked here.
        detected, _ghcid_type = parser._infer_institution_type(
            type_field, name_field, code_field
        )

        assert detected == InstitutionTypeEnum.ARCHIVE

    def test_library_detection(self, parser):
        """A record typed "Public library" with a library-like name is inferred as a library."""
        library_record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Bibliotheek Test",
            institution_type="Public library",
            country="BE",
        )
        type_field = library_record.institution_type
        name_field = library_record.institution_name
        code_field = library_record.isil_code

        detected, _ghcid_type = parser._infer_institution_type(
            type_field, name_field, code_field
        )

        assert detected == InstitutionTypeEnum.LIBRARY

    def test_museum_detection(self, parser):
        """With no type field, a name containing "Museum" is inferred as a museum."""
        museum_record = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Museum voor Schone Kunsten",
            institution_type=None,
            country="BE",
        )
        type_field = museum_record.institution_type
        name_field = museum_record.institution_name
        code_field = museum_record.isil_code

        detected, _ghcid_type = parser._infer_institution_type(
            type_field, name_field, code_field
        )

        assert detected == InstitutionTypeEnum.MUSEUM
class TestBelgianISILConversion:
    """Conversion of ``BelgianISILRecord`` objects into ``HeritageCustodian``."""

    def test_convert_to_heritage_custodian(self, parser):
        """Core fields of the record are carried over to the custodian."""
        source = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test Bibliotheek",
            alternative_names="Test Library",
            institution_type="Public library",
            parent_organization="Municipality",
            country="BE",
        )

        converted = parser.to_heritage_custodian(source)

        assert isinstance(converted, HeritageCustodian)
        assert converted.id == "BE-TEST00"
        assert converted.name == "Test Bibliotheek"
        assert str(converted.institution_type) == "LIBRARY"
        assert "Test Library" in converted.alternative_names

    def test_provenance_metadata(self, parser):
        """Provenance carries the expected source, tier, confidence and verifier."""
        source = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test",
            country="BE",
        )

        provenance = parser.to_heritage_custodian(source).provenance

        assert provenance.data_source == DataSourceEnum.CSV_REGISTRY
        assert provenance.data_tier == DataTierEnum.TIER_1_AUTHORITATIVE
        assert provenance.confidence_score == 1.0
        assert provenance.verified_by == "Royal Library of Belgium (KBR)"

    def test_identifiers_creation(self, parser):
        """An ISIL identifier is produced; a website identifier is checked if present."""
        source = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test",
            website="https://test.be",
            country="BE",
        )

        converted = parser.to_heritage_custodian(source)
        assert len(converted.identifiers) >= 1

        # ISIL identifier: the lookup itself raises if none exists.
        isil_entry = next(
            ident
            for ident in converted.identifiers
            if ident.identifier_scheme == "ISIL"
        )
        assert isil_entry.identifier_value == "BE-TEST00"
        assert "isil.kbr.be" in isil_entry.identifier_url

        # Website identifier: optional, so only verified when one was created.
        website_entry = next(
            (
                ident
                for ident in converted.identifiers
                if ident.identifier_scheme == "Website"
            ),
            None,
        )
        if website_entry:
            assert website_entry.identifier_value == "https://test.be"

    def test_alternative_names_collection(self, parser):
        """English name, alternative name and acronym all become alternative names."""
        source = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Bibliotheek Test",
            institution_name_en="Test Library",
            alternative_names="Bib Test",
            acronym="BT",
            country="BE",
        )

        collected = parser.to_heritage_custodian(source).alternative_names

        assert len(collected) == 3
        for expected_name in ("Bib Test", "BT", "Test Library"):
            assert expected_name in collected

    def test_description_building(self, parser):
        """The description includes parent, collection, notes and accessibility text."""
        source = BelgianISILRecord(
            isil_code="BE-TEST00",
            institution_name="Test",
            parent_organization="Municipality",
            collection_description="Historic collection",
            notes="Important archive",
            publicly_accessible="Yes",
            country="BE",
        )

        converted = parser.to_heritage_custodian(source)

        assert converted.description is not None
        expected_fragments = (
            "Municipality",
            "Historic collection",
            "Important archive",
            "Publicly accessible: Yes",
        )
        for fragment in expected_fragments:
            assert fragment in converted.description
class TestBelgianISILIntegration:
    """End-to-end checks that run against the real registry CSV."""

    @staticmethod
    def _skip_if_csv_missing(csv_path):
        """Skip the calling test when the registry CSV does not exist."""
        if csv_path.exists():
            return
        pytest.skip("Belgian ISIL CSV file not found")

    def test_parse_and_convert_real_data(self, parser, sample_csv_path):
        """Run the parse-and-convert pipeline and check every resulting custodian."""
        self._skip_if_csv_missing(sample_csv_path)

        converted = parser.parse_and_convert(sample_csv_path)

        # Size of the result
        assert len(converted) > 400

        # Every element is a HeritageCustodian
        assert all(isinstance(item, HeritageCustodian) for item in converted)

        # Required fields are truthy on every custodian
        for required in ("id", "name", "institution_type", "provenance"):
            assert all(getattr(item, required) for item in converted)

        # Every custodian has identifiers, one of which uses the ISIL scheme
        assert all(item.identifiers for item in converted)
        for item in converted:
            assert any(
                ident.identifier_scheme == "ISIL" for ident in item.identifiers
            )

    def test_institution_type_distribution(self, parser, sample_csv_path):
        """The mix of institution types matches what the registry is expected to hold."""
        self._skip_if_csv_missing(sample_csv_path)

        converted = parser.parse_and_convert(sample_csv_path)

        from collections import Counter

        per_type = Counter(str(item.institution_type) for item in converted)

        # The Belgian registry is predominantly libraries
        library_total = per_type["LIBRARY"]
        assert library_total > 300

        # Some archives (BE-A codes) are expected
        archive_total = per_type["ARCHIVE"]
        assert archive_total > 0

        # Museums may or may not be present, so nothing is asserted for them.

    def test_batch_extraction_date(self, parser, sample_csv_path):
        """Every custodian from one import carries the same extraction date."""
        self._skip_if_csv_missing(sample_csv_path)

        converted = parser.parse_and_convert(sample_csv_path)

        # A batch import should yield exactly one distinct extraction date.
        distinct_dates = set(item.provenance.extraction_date for item in converted)
        assert len(distinct_dates) == 1

    def test_archive_code_prefix(self, parser, sample_csv_path):
        """Most custodians whose id starts with ``BE-A`` are classified as archives."""
        self._skip_if_csv_missing(sample_csv_path)

        converted = parser.parse_and_convert(sample_csv_path)

        # Custodians carrying a BE-A code
        a_prefixed = [item for item in converted if item.id.startswith("BE-A")]

        # How many of those were classified as archives
        archive_hits = sum(
            1
            for item in a_prefixed
            if item.institution_type == InstitutionTypeEnum.ARCHIVE
        )

        # Strictly more than 80% must be archives (some museums/libraries may
        # hold A codes). With no BE-A codes at all this comparison fails.
        threshold = len(a_prefixed) * 0.8
        assert archive_hits > threshold
if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)