446 lines
19 KiB
Python
446 lines
19 KiB
Python
"""
|
|
Tests for Dutch Organizations CSV Parser
|
|
"""
|
|
|
|
import pytest
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import tempfile
|
|
import os
|
|
|
|
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser, DutchOrgRecord
|
|
from glam_extractor.models import (
|
|
DataTier, DataSource, InstitutionType, DigitalPlatformType
|
|
)
|
|
|
|
|
|
# Sample Dutch Organizations CSV data.
#
# One header row followed by five organisation rows (three of type "museum",
# one "archief", one "bibliotheek"). The header has an unnamed first and last
# column and trailing spaces in some column names; keep those bytes intact
# (presumably they mirror the real export the parser has to cope with).
# The literal must contain only CSV lines: stray separator lines such as a
# bare "|" would be read by a parser as extra data rows.
SAMPLE_DUTCH_ORGS_CSV = ''',Plaatsnaam bezoekadres ,Straat en huisnummer bezoekadres ,Organisatie,Koepelorganisatie,Webadres organisatie,Type organisatie,Opmerkingen Inez ,ISIL-code (NA),Samenwerkingsverband / Platform,Systeem,Versnellen,Collectie Nederland,Museum register,Rijkscollectie,Bibliotheek collectie,in scope voor DC4EU,DC4EU aansluit route,Archieven.nl,Archives Portal Europe,WO2Net,Modemuze,Maritiem Digitaal,Delfts aardewerk,Stichting Academisch Erfgoed,Coleccion Aruba,Van Gogh Worldwide,OODE24 (Mondriaan),Linked Data,Datasetregister,Versnellen project,Opmerkingen,
Drenthe,Hooghalen,Oosthalen 8,Stichting Herinneringscentrum Kamp Westerbork,,https://kampwesterbork.nl/,museum,,NL-HhlHCKW,,Atlantis,ja,,ja,,,,,,,ja,,,,,,,ja,ja,,,
,Amsterdam,Museumplein 1,Rijksmuseum,,https://www.rijksmuseum.nl,museum,,NL-AsdRM,Museumvereniging,Axiell Collections,ja,ja,ja,ja,,ja,,,,,,,,,,ja,,ja,ja,Data enrichment,National museum
,Rotterdam,Museumpark 18-20,Museum Boijmans Van Beuningen,Stichting MBVB,https://www.boijmans.nl,museum,Major collection,NL-RtMBVB,,TMS,ja,ja,ja,ja,,,,,,,ja,,,,,,ja,ja,,Under renovation
,Den Haag,Prins Willem-Alexanderhof 20,Nationaal Archief,,https://www.nationaalarchief.nl,archief,,NL-HANA,,MAIS,,,,,,,,ja,ja,,,,,,,,,,ja,,
,Leiden,,Universiteitsbibliotheek Leiden,Universiteit Leiden,https://www.library.universiteitleiden.nl,bibliotheek,,NL-LdnUB,,Alma,,,,,ja,ja,,,,,,,,,ja,,,,,OCLC member
'''
|
|
|
|
|
|
@pytest.fixture
def sample_csv_file():
    """Yield the path of a temporary CSV file holding the sample data.

    The file is written as UTF-8 from ``SAMPLE_DUTCH_ORGS_CSV`` and is
    deleted again by the code that runs after the ``yield``.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.csv', delete=False, encoding='utf-8'
    )
    with handle:
        handle.write(SAMPLE_DUTCH_ORGS_CSV)
        csv_path = handle.name

    yield csv_path

    # delete=False above means nothing else removes the file for us.
    os.unlink(csv_path)
|
|
|
|
|
|
class TestDutchOrgRecord:
    """Unit tests for the DutchOrgRecord Pydantic model."""

    def test_create_valid_record(self):
        """A populated record exposes the values it was constructed with."""
        field_values = dict(
            plaatsnaam="Amsterdam",
            straat_huisnummer="Museumplein 1",
            organisatie="Rijksmuseum",
            webadres="https://www.rijksmuseum.nl",
            type_organisatie="museum",
            isil_code="NL-AsdRM",
            systeem="Axiell Collections",
            collectie_nederland="ja",
            museum_register="ja",
        )
        rijksmuseum = DutchOrgRecord(**field_values)

        assert rijksmuseum.plaatsnaam == "Amsterdam"
        assert rijksmuseum.organisatie == "Rijksmuseum"
        assert rijksmuseum.type_organisatie == "museum"
        assert rijksmuseum.isil_code == "NL-AsdRM"

    def test_type_normalization(self):
        """The organisation type is stripped and lower-cased; empty becomes None."""
        padded = DutchOrgRecord(organisatie="Test", type_organisatie="  MUSEUM  ")
        assert padded.type_organisatie == "museum"

        # An empty type string is normalised to None.
        blank = DutchOrgRecord(organisatie="Test", type_organisatie="")
        assert blank.type_organisatie is None

    def test_isil_normalization(self):
        """The ISIL code is stripped of surrounding whitespace; empty becomes None."""
        padded = DutchOrgRecord(organisatie="Test", isil_code="  NL-TEST  ")
        assert padded.isil_code == "NL-TEST"

        # An empty ISIL string is normalised to None.
        blank = DutchOrgRecord(organisatie="Test", isil_code="")
        assert blank.isil_code is None

    def test_get_platforms(self):
        """get_platforms() lists exactly the platforms flagged affirmatively."""
        museum = DutchOrgRecord(
            organisatie="Test Museum",
            collectie_nederland="ja",
            museum_register="ja",
            rijkscollectie="x",
            archieven_nl="",
            wo2net="nee",
        )

        found = museum.get_platforms()

        for expected in ("Collectie Nederland", "Museum Register", "Rijkscollectie"):
            assert expected in found
        for unexpected in ("Archieven.nl", "WO2Net"):
            assert unexpected not in found
        assert len(found) == 3

    def test_is_yes_method(self):
        """_is_yes() accepts the affirmative markers and rejects everything else."""
        probe = DutchOrgRecord(organisatie="Test")

        # Affirmative markers; the last one checks case-insensitivity.
        for affirmative in ("ja", "yes", "x", "✓", "JA"):
            assert probe._is_yes(affirmative) is True

        # Negative, empty and missing values.
        for negative in ("nee", "no", "", None):
            assert probe._is_yes(negative) is False
|
|
|
|
|
|
class TestDutchOrgsParser:
    """Tests for DutchOrgsParser: CSV parsing and HeritageCustodian conversion."""

    @staticmethod
    def _record_named(records, name):
        """Return the first record whose ``organisatie`` equals ``name``."""
        matching = [rec for rec in records if rec.organisatie == name]
        return matching[0]

    @staticmethod
    def _platforms_of_type(custodian, kind):
        """Return the digital platforms whose type stringifies to ``kind``."""
        return [
            platform
            for platform in custodian.digital_platforms
            if str(platform.platform_type) == kind
        ]

    def test_parse_file(self, sample_csv_file):
        """Parsing the sample file yields all five rows with their fields."""
        parser = DutchOrgsParser()
        parsed = parser.parse_file(sample_csv_file)

        assert len(parsed) == 5

        # First row: Kamp Westerbork
        westerbork = parsed[0]
        assert westerbork.organisatie == "Stichting Herinneringscentrum Kamp Westerbork"
        assert westerbork.plaatsnaam == "Hooghalen"
        assert westerbork.straat_huisnummer == "Oosthalen 8"
        assert westerbork.type_organisatie == "museum"
        assert westerbork.isil_code == "NL-HhlHCKW"
        assert westerbork.systeem == "Atlantis"

        # Rijksmuseum
        rijks = self._record_named(parsed, "Rijksmuseum")
        assert rijks.plaatsnaam == "Amsterdam"
        assert rijks.isil_code == "NL-AsdRM"
        assert rijks.systeem == "Axiell Collections"
        assert rijks.samenwerkingsverband == "Museumvereniging"
        assert rijks.opmerkingen == "National museum"

        # Nationaal Archief
        national_archive = self._record_named(parsed, "Nationaal Archief")
        assert national_archive.type_organisatie == "archief"
        assert national_archive.isil_code == "NL-HANA"
        assert national_archive.systeem == "MAIS"

    def test_parse_nonexistent_file(self):
        """parse_file() raises FileNotFoundError for a missing path."""
        parser = DutchOrgsParser()

        with pytest.raises(FileNotFoundError):
            parser.parse_file("/nonexistent/file.csv")

    def test_to_heritage_custodian_museum(self, sample_csv_file):
        """The Rijksmuseum row converts to a fully populated HeritageCustodian."""
        parser = DutchOrgsParser()
        parsed = parser.parse_file(sample_csv_file)

        rijks_record = self._record_named(parsed, "Rijksmuseum")
        custodian = parser.to_heritage_custodian(rijks_record, "/path/to/csv")

        # Basic fields. The type is compared as a string since
        # InstitutionTypeEnum != PermissibleValue.
        assert custodian.name == "Rijksmuseum"
        assert str(custodian.institution_type) == 'MUSEUM'

        # Location
        location = custodian.locations[0]
        assert location.city == "Amsterdam"
        assert location.street_address == "Museumplein 1"
        assert location.country == "NL"

        # Identifiers: exactly one ISIL and one URL
        isil_ids = [
            ident for ident in custodian.identifiers
            if ident.identifier_scheme == "ISIL"
        ]
        assert len(isil_ids) == 1
        assert isil_ids[0].identifier_value == "NL-AsdRM"

        url_ids = [
            ident for ident in custodian.identifiers
            if ident.identifier_scheme == "URL"
        ]
        assert len(url_ids) == 1
        assert "rijksmuseum.nl" in url_ids[0].identifier_value

        # Digital platforms
        assert custodian.digital_platforms is not None
        assert len(custodian.digital_platforms) > 0

        # The collection management system comes from the "Systeem" column
        cms_platforms = self._platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms_platforms) == 1
        assert cms_platforms[0].platform_name == "Axiell Collections"

        # Aggregator platforms
        aggregators = self._platforms_of_type(custodian, 'AGGREGATOR')
        assert len(aggregators) > 0
        assert any(agg.platform_name == "Collectie Nederland" for agg in aggregators)
        assert any(agg.platform_name == "Museum Register" for agg in aggregators)

        # Provenance
        provenance = custodian.provenance
        assert str(provenance.data_source) == 'DUTCH_ORG_CSV'
        assert str(provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
        assert provenance.extraction_method == "DutchOrgsParser"
        assert provenance.confidence_score == 1.0
        # Notes is not a Provenance field - the remark ends up in description
        assert "National museum" in custodian.description
        # Samenwerkingsverband is in the DutchHeritageCustodian model, not provenance

    def test_to_heritage_custodian_archive(self, sample_csv_file):
        """The Nationaal Archief row converts to an ARCHIVE custodian."""
        parser = DutchOrgsParser()
        parsed = parser.parse_file(sample_csv_file)

        archive_record = self._record_named(parsed, "Nationaal Archief")
        custodian = parser.to_heritage_custodian(archive_record)

        assert custodian.name == "Nationaal Archief"
        # Compared as a string since InstitutionTypeEnum != PermissibleValue
        assert str(custodian.institution_type) == 'ARCHIVE'
        assert custodian.locations[0].city == "Den Haag"

        # MAIS is the collection management system
        cms_platforms = self._platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms_platforms) == 1
        assert cms_platforms[0].platform_name == "MAIS"

    def test_to_heritage_custodian_library(self, sample_csv_file):
        """The UB Leiden row converts to a LIBRARY custodian."""
        parser = DutchOrgsParser()
        parsed = parser.parse_file(sample_csv_file)

        # Matched on a substring of the name rather than the full name.
        library_matches = [
            rec for rec in parsed if "Universiteitsbibliotheek" in rec.organisatie
        ]
        custodian = parser.to_heritage_custodian(library_matches[0])

        assert custodian.name == "Universiteitsbibliotheek Leiden"
        assert str(custodian.institution_type) == 'LIBRARY'
        assert custodian.locations[0].city == "Leiden"

        # Alma is the collection management system
        cms_platforms = self._platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms_platforms) == 1
        assert cms_platforms[0].platform_name == "Alma"

        # The parent organisation shows up in the description
        assert "Universiteit Leiden" in custodian.description

    def test_partnerships_creation(self, sample_csv_file):
        """Network memberships in the CSV become Partnership objects."""
        parser = DutchOrgsParser()
        parsed = parser.parse_file(sample_csv_file)

        # The Rijksmuseum row has many memberships flagged
        rijks_record = self._record_named(parsed, "Rijksmuseum")
        custodian = parser.to_heritage_custodian(rijks_record)

        assert custodian.partnerships is not None
        assert len(custodian.partnerships) > 0

        partner_names = [link.partner_name for link in custodian.partnerships]

        # National platforms, digitization programs, international networks
        # and EU platforms (DC4EU is "ja" in the CSV), in that order.
        expected_partners = (
            "Museum Register",
            "Rijkscollectie",
            "Collectie Nederland",
            "Versnellen",
            "Van Gogh Worldwide",
            "DC4EU",
        )
        for partner in expected_partners:
            assert partner in partner_names

        # Each partner's first Partnership carries the expected type
        expected_types = {
            "Museum Register": "national_museum_certification",
            "Rijkscollectie": "national_collection_designation",
            "Collectie Nederland": "aggregator_participation",
            "Versnellen": "digitization_program",
            "Van Gogh Worldwide": "international_thematic_network",
        }
        for partner, partnership_type in expected_types.items():
            first_match = [
                link for link in custodian.partnerships
                if link.partner_name == partner
            ][0]
            assert first_match.partnership_type == partnership_type

    def test_parse_and_convert(self, sample_csv_file):
        """parse_and_convert() turns every sample row into a custodian."""
        parser = DutchOrgsParser()
        converted = parser.parse_and_convert(sample_csv_file)

        assert len(converted) == 5

        # Every custodian carries TIER_1 provenance with full confidence
        for custodian in converted:
            assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
            assert custodian.provenance.confidence_score == 1.0

        # All three institution types from the sample are represented
        types_found = {str(custodian.institution_type) for custodian in converted}
        for expected_type in ('MUSEUM', 'ARCHIVE', 'LIBRARY'):
            assert expected_type in types_found

    def test_url_normalization(self):
        """A web address without a scheme is stored as an https:// URL."""
        parser = DutchOrgsParser()

        schemeless = DutchOrgRecord(
            organisatie="Test Museum",
            webadres="www.example.com",
        )

        custodian = parser.to_heritage_custodian(schemeless)

        url_ids = [
            ident for ident in custodian.identifiers
            if ident.identifier_scheme == "URL"
        ]
        assert len(url_ids) == 1
        assert url_ids[0].identifier_value.startswith("https://")
|
|
|
|
|
|
class TestDutchOrgsParserEdgeCases:
    """Edge cases and error handling for DutchOrgsParser."""

    # Header row shared by the hand-written CSV snippets in this class.
    # It has an unnamed first and last column and trailing spaces in some
    # column names; keep those bytes intact.
    _CSV_HEADER = ',Plaatsnaam bezoekadres ,Straat en huisnummer bezoekadres ,Organisatie,Koepelorganisatie,Webadres organisatie,Type organisatie,Opmerkingen Inez ,ISIL-code (NA),Samenwerkingsverband / Platform,Systeem,Versnellen,Collectie Nederland,Museum register,Rijkscollectie,Bibliotheek collectie,in scope voor DC4EU,DC4EU aansluit route,Archieven.nl,Archives Portal Europe,WO2Net,Modemuze,Maritiem Digitaal,Delfts aardewerk,Stichting Academisch Erfgoed,Coleccion Aruba,Van Gogh Worldwide,OODE24 (Mondriaan),Linked Data,Datasetregister,Versnellen project,Opmerkingen,'

    @staticmethod
    def _write_temp_csv(csv_data):
        """Write ``csv_data`` to a new temporary ``.csv`` file and return its path.

        The encoding is pinned to UTF-8, like the module's ``sample_csv_file``
        fixture, so the bytes on disk do not depend on the machine's locale.
        The file is created with ``delete=False``; the caller must remove it.
        """
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.csv', delete=False, encoding='utf-8'
        ) as handle:
            handle.write(csv_data)
            return handle.name

    def test_empty_csv(self):
        """A CSV that holds only the header row yields no records."""
        temp_path = self._write_temp_csv(self._CSV_HEADER + "\n")

        try:
            parser = DutchOrgsParser()
            records = parser.parse_file(temp_path)
            assert len(records) == 0
        finally:
            os.unlink(temp_path)

    def test_row_without_organization_name_skipped(self):
        """Rows with an empty organisation name are left out of the result."""
        csv_rows = (
            self._CSV_HEADER,
            # No value in the "Organisatie" column: expected to be skipped.
            ",Amsterdam,Test Street 1,,,,museum,,,,,,,,,,,,,,,,,,,,,,,,",
            ",Rotterdam,,Valid Museum,,https://example.com,museum,,NL-TEST,,,,,,,,,,,,,,,,,,,,,,",
        )
        # Every line, including the last one, ends with a newline.
        csv_data = "".join(row + "\n" for row in csv_rows)

        temp_path = self._write_temp_csv(csv_data)

        try:
            parser = DutchOrgsParser()
            records = parser.parse_file(temp_path)

            # Only the row that has an organisation name is parsed
            assert len(records) == 1
            assert records[0].organisatie == "Valid Museum"
        finally:
            os.unlink(temp_path)

    def test_minimal_record(self):
        """A record with only a name converts using the fallback values."""
        record = DutchOrgRecord(organisatie="Minimal Museum")
        custodian = DutchOrgsParser().to_heritage_custodian(record)

        assert custodian.name == "Minimal Museum"
        assert str(custodian.institution_type) == 'MIXED'  # No type specified, defaults to MIXED
        assert custodian.identifiers == []  # No identifiers
        assert custodian.locations[0].country == "NL"
        assert custodian.locations[0].city is None
        assert custodian.digital_platforms is None or custodian.digital_platforms == []

    def test_path_as_string_or_pathlib(self, sample_csv_file):
        """parse_file() accepts the path as a ``str`` or a ``pathlib.Path``."""
        parser = DutchOrgsParser()

        # Same file, once as a plain string and once as a Path object
        records_str = parser.parse_file(sample_csv_file)
        records_path = parser.parse_file(Path(sample_csv_file))

        assert len(records_str) == len(records_path)
        assert records_str[0].organisatie == records_path[0].organisatie
|
|
|
|
|
|
class TestDutchOrgsTypeMapping:
    """Checks how organisation type strings map to institution types."""

    def test_type_mapping(self):
        """Known Dutch and English type names map to the expected type."""
        parser = DutchOrgsParser()

        expected_by_type = {
            "museum": 'MUSEUM',
            "archief": 'ARCHIVE',
            "bibliotheek": 'LIBRARY',
            "library": 'LIBRARY',
            "archive": 'ARCHIVE',
        }

        for raw_type, expected in expected_by_type.items():
            typed_record = DutchOrgRecord(
                organisatie="Test",
                type_organisatie=raw_type,
            )
            converted = parser.to_heritage_custodian(typed_record)
            assert str(converted.institution_type) == expected

    def test_unknown_type_results_in_mixed(self):
        """An unrecognised organisation type converts to MIXED."""
        parser = DutchOrgsParser()

        unknown = DutchOrgRecord(
            organisatie="Test",
            type_organisatie="unknown_type",
        )
        converted = parser.to_heritage_custodian(unknown)

        assert str(converted.institution_type) == 'MIXED'
|