"""
Tests for Dutch Organizations CSV Parser
"""

import os
import tempfile
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
from typing import Iterator

import pytest

from glam_extractor.parsers.dutch_orgs import DutchOrgsParser, DutchOrgRecord
from glam_extractor.models import (
    DataTier,
    DataSource,
    InstitutionType,
    DigitalPlatformType
)


# Header row shared by every CSV used in this module.  The column names are
# kept verbatim, including the trailing space some of them carry and the
# unnamed first and last columns.
DUTCH_ORGS_CSV_HEADER = (
    ",Plaatsnaam bezoekadres ,Straat en huisnummer bezoekadres ,"
    "Organisatie,Koepelorganisatie,Webadres organisatie,Type organisatie,"
    "Opmerkingen Inez ,ISIL-code (NA),Samenwerkingsverband / Platform,Systeem,"
    "Versnellen,Collectie Nederland,Museum register,Rijkscollectie,"
    "Bibliotheek collectie,in scope voor DC4EU,DC4EU aansluit route,"
    "Archieven.nl,Archives Portal Europe,WO2Net,Modemuze,Maritiem Digitaal,"
    "Delfts aardewerk,Stichting Academisch Erfgoed,Coleccion Aruba,"
    "Van Gogh Worldwide,OODE24 (Mondriaan),Linked Data,Datasetregister,"
    "Versnellen project,Opmerkingen,"
)

# Sample Dutch Organizations CSV data: the header followed by five records
# (three museums, one archive, one library).
SAMPLE_DUTCH_ORGS_CSV = DUTCH_ORGS_CSV_HEADER + '''
Drenthe,Hooghalen,Oosthalen 8,Stichting Herinneringscentrum Kamp Westerbork,,https://kampwesterbork.nl/,museum,,NL-HhlHCKW,,Atlantis,ja,,ja,,,,,,,ja,,,,,,,ja,ja,,,
,Amsterdam,Museumplein 1,Rijksmuseum,,https://www.rijksmuseum.nl,museum,,NL-AsdRM,Museumvereniging,Axiell Collections,ja,ja,ja,ja,,ja,,,,,,,,,,ja,,ja,ja,Data enrichment,National museum
,Rotterdam,Museumpark 18-20,Museum Boijmans Van Beuningen,Stichting MBVB,https://www.boijmans.nl,museum,Major collection,NL-RtMBVB,,TMS,ja,ja,ja,ja,,,,,,,ja,,,,,,ja,ja,,Under renovation
,Den Haag,Prins Willem-Alexanderhof 20,Nationaal Archief,,https://www.nationaalarchief.nl,archief,,NL-HANA,,MAIS,,,,,,,,ja,ja,,,,,,,,,,ja,,
,Leiden,,Universiteitsbibliotheek Leiden,Universiteit Leiden,https://www.library.universiteitleiden.nl,bibliotheek,,NL-LdnUB,,Alma,,,,,ja,ja,,,,,,,,,ja,,,,,OCLC member
'''


@contextmanager
def _temporary_csv(content: str) -> Iterator[str]:
    """Write *content* to a temporary ``.csv`` file and yield its path.

    The file is always written as UTF-8 so that every test feeds the parser
    the same bytes regardless of the platform's default encoding, and it is
    removed on exit even when the write or the test body raises.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.csv', delete=False, encoding='utf-8'
    )
    try:
        # Close the handle before yielding so the parser can reopen the path.
        with handle:
            handle.write(content)
        yield handle.name
    finally:
        os.unlink(handle.name)


def _first_match(items, predicate):
    """Return the first element of *items* for which *predicate* is true.

    Raises:
        AssertionError: if no element matches, so the calling test fails
            with a readable message instead of an ``IndexError``.
    """
    for item in items:
        if predicate(item):
            return item
    raise AssertionError("no item matched the given predicate")


def _record_named(records, name):
    """Return the first record whose ``organisatie`` equals *name*."""
    return _first_match(records, lambda record: record.organisatie == name)


def _platforms_of_type(custodian, type_name):
    """Return the custodian's digital platforms whose type stringifies to *type_name*."""
    return [
        platform for platform in custodian.digital_platforms
        if str(platform.platform_type) == type_name
    ]


def _identifiers_with_scheme(custodian, scheme):
    """Return the custodian's identifiers whose ``identifier_scheme`` is *scheme*."""
    return [
        identifier for identifier in custodian.identifiers
        if identifier.identifier_scheme == scheme
    ]


@pytest.fixture
def sample_csv_file():
    """Create a temporary CSV file with sample data and yield its path"""
    with _temporary_csv(SAMPLE_DUTCH_ORGS_CSV) as temp_path:
        yield temp_path


class TestDutchOrgRecord:
    """Test DutchOrgRecord Pydantic model"""

    def test_create_valid_record(self):
        """Test creating a valid Dutch org record"""
        record = DutchOrgRecord(
            plaatsnaam="Amsterdam",
            straat_huisnummer="Museumplein 1",
            organisatie="Rijksmuseum",
            webadres="https://www.rijksmuseum.nl",
            type_organisatie="museum",
            isil_code="NL-AsdRM",
            systeem="Axiell Collections",
            collectie_nederland="ja",
            museum_register="ja"
        )

        assert record.plaatsnaam == "Amsterdam"
        assert record.organisatie == "Rijksmuseum"
        assert record.type_organisatie == "museum"
        assert record.isil_code == "NL-AsdRM"

    def test_type_normalization(self):
        """Test organization type normalization"""
        record = DutchOrgRecord(
            organisatie="Test",
            type_organisatie="  MUSEUM  "
        )
        assert record.type_organisatie == "museum"

        # Empty type
        record2 = DutchOrgRecord(
            organisatie="Test",
            type_organisatie=""
        )
        assert record2.type_organisatie is None

    def test_isil_normalization(self):
        """Test ISIL code normalization"""
        record = DutchOrgRecord(
            organisatie="Test",
            isil_code="  NL-TEST  "
        )
        assert record.isil_code == "NL-TEST"

        # Empty ISIL
        record2 = DutchOrgRecord(
            organisatie="Test",
            isil_code=""
        )
        assert record2.isil_code is None

    def test_get_platforms(self):
        """Test extracting platforms from yes/no fields"""
        record = DutchOrgRecord(
            organisatie="Test Museum",
            collectie_nederland="ja",
            museum_register="ja",
            rijkscollectie="x",
            archieven_nl="",
            wo2net="nee"
        )

        platforms = record.get_platforms()
        assert "Collectie Nederland" in platforms
        assert "Museum Register" in platforms
        assert "Rijkscollectie" in platforms
        assert "Archieven.nl" not in platforms
        assert "WO2Net" not in platforms
        assert len(platforms) == 3

    def test_is_yes_method(self):
        """Test the _is_yes helper method"""
        record = DutchOrgRecord(organisatie="Test")

        # Various affirmative values
        assert record._is_yes("ja") is True
        assert record._is_yes("yes") is True
        assert record._is_yes("x") is True
        assert record._is_yes("✓") is True
        assert record._is_yes("JA") is True  # Case insensitive

        # Negative values
        assert record._is_yes("nee") is False
        assert record._is_yes("no") is False
        assert record._is_yes("") is False
        assert record._is_yes(None) is False


class TestDutchOrgsParser:
    """Test DutchOrgsParser class"""

    def test_parse_file(self, sample_csv_file):
        """Test parsing a complete CSV file"""
        parser = DutchOrgsParser()
        records = parser.parse_file(sample_csv_file)

        assert len(records) == 5

        # Check Kamp Westerbork
        assert records[0].organisatie == "Stichting Herinneringscentrum Kamp Westerbork"
        assert records[0].plaatsnaam == "Hooghalen"
        assert records[0].straat_huisnummer == "Oosthalen 8"
        assert records[0].type_organisatie == "museum"
        assert records[0].isil_code == "NL-HhlHCKW"
        assert records[0].systeem == "Atlantis"

        # Check Rijksmuseum
        rijks = _record_named(records, "Rijksmuseum")
        assert rijks.plaatsnaam == "Amsterdam"
        assert rijks.isil_code == "NL-AsdRM"
        assert rijks.systeem == "Axiell Collections"
        assert rijks.samenwerkingsverband == "Museumvereniging"
        assert rijks.opmerkingen == "National museum"

        # Check Nationaal Archief
        na = _record_named(records, "Nationaal Archief")
        assert na.type_organisatie == "archief"
        assert na.isil_code == "NL-HANA"
        assert na.systeem == "MAIS"

    def test_parse_nonexistent_file(self):
        """Test parsing a file that doesn't exist"""
        parser = DutchOrgsParser()

        with pytest.raises(FileNotFoundError):
            parser.parse_file("/nonexistent/file.csv")

    def test_to_heritage_custodian_museum(self, sample_csv_file):
        """Test converting museum record to HeritageCustodian"""
        parser = DutchOrgsParser()
        records = parser.parse_file(sample_csv_file)

        # Get Rijksmuseum record
        rijks_record = _record_named(records, "Rijksmuseum")
        custodian = parser.to_heritage_custodian(rijks_record, "/path/to/csv")

        # Check basic fields
        assert custodian.name == "Rijksmuseum"
        # Compare as string since InstitutionTypeEnum != PermissibleValue
        assert str(custodian.institution_type) == 'MUSEUM'

        # Check location
        assert custodian.locations[0].city == "Amsterdam"
        assert custodian.locations[0].street_address == "Museumplein 1"
        assert custodian.locations[0].country == "NL"

        # Check identifiers
        isil_ids = _identifiers_with_scheme(custodian, "ISIL")
        assert len(isil_ids) == 1
        assert isil_ids[0].identifier_value == "NL-AsdRM"

        url_ids = _identifiers_with_scheme(custodian, "URL")
        assert len(url_ids) == 1
        assert "rijksmuseum.nl" in url_ids[0].identifier_value

        # Check digital platforms
        assert custodian.digital_platforms is not None
        assert len(custodian.digital_platforms) > 0

        # Should have collection management system
        cms_platforms = _platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms_platforms) == 1
        assert cms_platforms[0].platform_name == "Axiell Collections"

        # Should have aggregator platforms
        aggregators = _platforms_of_type(custodian, 'AGGREGATOR')
        assert len(aggregators) > 0
        assert any(p.platform_name == "Collectie Nederland" for p in aggregators)
        assert any(p.platform_name == "Museum Register" for p in aggregators)

        # Check provenance
        assert str(custodian.provenance.data_source) == 'DUTCH_ORG_CSV'
        assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
        assert custodian.provenance.extraction_method == "DutchOrgsParser"
        assert custodian.provenance.confidence_score == 1.0
        # Notes is not a Provenance field - opmerking should be in description
        assert "National museum" in custodian.description
        # Samenwerkingsverband is in the DutchHeritageCustodian model, not provenance

    def test_to_heritage_custodian_archive(self, sample_csv_file):
        """Test converting archive record to HeritageCustodian"""
        parser = DutchOrgsParser()
        records = parser.parse_file(sample_csv_file)

        # Get Nationaal Archief record
        na_record = _record_named(records, "Nationaal Archief")
        custodian = parser.to_heritage_custodian(na_record)

        assert custodian.name == "Nationaal Archief"
        # Compare as string since InstitutionTypeEnum != PermissibleValue
        assert str(custodian.institution_type) == 'ARCHIVE'
        assert custodian.locations[0].city == "Den Haag"

        # Check for MAIS system
        cms = _platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms) == 1
        assert cms[0].platform_name == "MAIS"

    def test_to_heritage_custodian_library(self, sample_csv_file):
        """Test converting library record to HeritageCustodian"""
        parser = DutchOrgsParser()
        records = parser.parse_file(sample_csv_file)

        # Get UB Leiden record (matched on a substring of its name)
        ub_record = _first_match(
            records, lambda r: "Universiteitsbibliotheek" in r.organisatie
        )
        custodian = parser.to_heritage_custodian(ub_record)

        assert custodian.name == "Universiteitsbibliotheek Leiden"
        assert str(custodian.institution_type) == 'LIBRARY'
        assert custodian.locations[0].city == "Leiden"

        # Check for Alma system
        cms = _platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms) == 1
        assert cms[0].platform_name == "Alma"

        # Check parent organization in description
        assert "Universiteit Leiden" in custodian.description

    def test_partnerships_creation(self, sample_csv_file):
        """Test that Partnership objects are created from Dutch network memberships"""
        parser = DutchOrgsParser()
        records = parser.parse_file(sample_csv_file)

        # Get Rijksmuseum record (has many partnerships)
        rijks_record = _record_named(records, "Rijksmuseum")
        custodian = parser.to_heritage_custodian(rijks_record)

        # Should have multiple partnerships
        assert custodian.partnerships is not None
        assert len(custodian.partnerships) > 0

        # Check for specific partnerships based on CSV data
        partnership_names = [p.partner_name for p in custodian.partnerships]

        # National platforms
        assert "Museum Register" in partnership_names
        assert "Rijkscollectie" in partnership_names
        assert "Collectie Nederland" in partnership_names

        # Digitization programs
        assert "Versnellen" in partnership_names

        # International networks
        assert "Van Gogh Worldwide" in partnership_names

        # EU platforms (DC4EU is "ja" in CSV)
        assert "DC4EU" in partnership_names

        # Check partnership types are assigned correctly.  setdefault keeps
        # the type of the first partnership seen for each partner name.
        type_by_partner = {}
        for partnership in custodian.partnerships:
            type_by_partner.setdefault(
                partnership.partner_name, partnership.partnership_type
            )

        assert type_by_partner["Museum Register"] == "national_museum_certification"
        assert type_by_partner["Rijkscollectie"] == "national_collection_designation"
        assert type_by_partner["Collectie Nederland"] == "aggregator_participation"
        assert type_by_partner["Versnellen"] == "digitization_program"
        assert type_by_partner["Van Gogh Worldwide"] == "international_thematic_network"

    def test_parse_and_convert(self, sample_csv_file):
        """Test end-to-end parsing and conversion"""
        parser = DutchOrgsParser()
        custodians = parser.parse_and_convert(sample_csv_file)

        assert len(custodians) == 5

        # All should have TIER_1 provenance
        for custodian in custodians:
            assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
            assert custodian.provenance.confidence_score == 1.0

        # Check institution types are mapped
        types_found = {str(custodian.institution_type) for custodian in custodians}

        assert 'MUSEUM' in types_found
        assert 'ARCHIVE' in types_found
        assert 'LIBRARY' in types_found

    def test_url_normalization(self):
        """Test that URLs without http:// are normalized"""
        parser = DutchOrgsParser()

        record = DutchOrgRecord(
            organisatie="Test Museum",
            webadres="www.example.com"
        )

        custodian = parser.to_heritage_custodian(record)
        url_ids = _identifiers_with_scheme(custodian, "URL")

        assert len(url_ids) == 1
        assert url_ids[0].identifier_value.startswith("https://")


class TestDutchOrgsParserEdgeCases:
    """Test edge cases and error handling"""

    def test_empty_csv(self):
        """Test parsing a CSV file that contains only the header row"""
        csv_data = DUTCH_ORGS_CSV_HEADER + "\n"

        with _temporary_csv(csv_data) as temp_path:
            parser = DutchOrgsParser()
            records = parser.parse_file(temp_path)
            assert len(records) == 0

    def test_row_without_organization_name_skipped(self):
        """Test that rows without organization name are skipped"""
        csv_data = DUTCH_ORGS_CSV_HEADER + '''
,Amsterdam,Test Street 1,,,,museum,,,,,,,,,,,,,,,,,,,,,,,,
,Rotterdam,,Valid Museum,,https://example.com,museum,,NL-TEST,,,,,,,,,,,,,,,,,,,,,,
'''

        with _temporary_csv(csv_data) as temp_path:
            parser = DutchOrgsParser()
            records = parser.parse_file(temp_path)

            # Should only parse the row with organization name
            assert len(records) == 1
            assert records[0].organisatie == "Valid Museum"

    def test_minimal_record(self):
        """Test parsing record with minimal data"""
        record = DutchOrgRecord(organisatie="Minimal Museum")
        custodian = DutchOrgsParser().to_heritage_custodian(record)

        assert custodian.name == "Minimal Museum"
        assert str(custodian.institution_type) == 'MIXED'  # No type specified, defaults to MIXED
        assert custodian.identifiers == []  # No identifiers
        assert custodian.locations[0].country == "NL"
        assert custodian.locations[0].city is None
        assert custodian.digital_platforms is None or custodian.digital_platforms == []

    def test_path_as_string_or_pathlib(self, sample_csv_file):
        """Test that parser accepts both string and Path objects"""
        parser = DutchOrgsParser()

        # Test with string
        records_str = parser.parse_file(sample_csv_file)

        # Test with Path
        records_path = parser.parse_file(Path(sample_csv_file))

        assert len(records_str) == len(records_path)
        assert records_str[0].organisatie == records_path[0].organisatie


class TestDutchOrgsTypeMapping:
    """Test institution type mapping"""

    # Parametrized so each mapping is reported (and can fail) independently.
    @pytest.mark.parametrize(
        ("dutch_type", "expected_type"),
        [
            ("museum", 'MUSEUM'),
            ("archief", 'ARCHIVE'),
            ("bibliotheek", 'LIBRARY'),
            ("library", 'LIBRARY'),
            ("archive", 'ARCHIVE'),
        ],
    )
    def test_type_mapping(self, dutch_type, expected_type):
        """Test that Dutch types are correctly mapped to InstitutionType enum"""
        parser = DutchOrgsParser()

        record = DutchOrgRecord(
            organisatie="Test",
            type_organisatie=dutch_type
        )
        custodian = parser.to_heritage_custodian(record)
        assert str(custodian.institution_type) == expected_type

    def test_unknown_type_results_in_mixed(self):
        """Test that unknown organization types result in MIXED type"""
        parser = DutchOrgsParser()

        record = DutchOrgRecord(
            organisatie="Test",
            type_organisatie="unknown_type"
        )
        custodian = parser.to_heritage_custodian(record)
        assert str(custodian.institution_type) == 'MIXED'