# glam/tests/parsers/test_dutch_orgs.py

"""
Tests for Dutch Organizations CSV Parser
"""

import pytest
from datetime import datetime
from pathlib import Path
import tempfile
import os

from glam_extractor.parsers.dutch_orgs import DutchOrgsParser, DutchOrgRecord
from glam_extractor.models import (
    DataTier, DataSource, InstitutionType, DigitalPlatformType
)


# Sample Dutch Organizations CSV data.
#
# Layout, as visible in the literal below:
#   * The header row starts with an unnamed column and ends with a trailing
#     comma; several header names carry trailing whitespace
#     (e.g. "Plaatsnaam bezoekadres  ").
#   * Five data rows follow: three with type "museum", one "archief" and one
#     "bibliotheek".  Only the first row fills the unnamed leading column.
#   * Cells in the per-platform columns contain "ja" where a platform applies.
# NOTE(review): the data rows are not all the same width as the header; verify
# the column alignment of the trailing free-text remarks in the Boijmans and
# Leiden rows before asserting on "Opmerkingen" for them.
SAMPLE_DUTCH_ORGS_CSV = ''',Plaatsnaam bezoekadres  ,Straat en huisnummer bezoekadres ,Organisatie,Koepelorganisatie,Webadres organisatie,Type organisatie,Opmerkingen Inez ,ISIL-code (NA),Samenwerkingsverband / Platform,Systeem,Versnellen,Collectie Nederland,Museum register,Rijkscollectie,Bibliotheek collectie,in scope voor DC4EU,DC4EU aansluit route,Archieven.nl,Archives Portal Europe,WO2Net,Modemuze,Maritiem Digitaal,Delfts aardewerk,Stichting Academisch Erfgoed,Coleccion Aruba,Van Gogh Worldwide,OODE24 (Mondriaan),Linked Data,Datasetregister,Versnellen project,Opmerkingen,
Drenthe,Hooghalen,Oosthalen 8,Stichting Herinneringscentrum Kamp Westerbork,,https://kampwesterbork.nl/,museum,,NL-HhlHCKW,,Atlantis,ja,,ja,,,,,,,ja,,,,,,,ja,ja,,,
,Amsterdam,Museumplein 1,Rijksmuseum,,https://www.rijksmuseum.nl,museum,,NL-AsdRM,Museumvereniging,Axiell Collections,ja,ja,ja,ja,,ja,,,,,,,,,,ja,,ja,ja,Data enrichment,National museum
,Rotterdam,Museumpark 18-20,Museum Boijmans Van Beuningen,Stichting MBVB,https://www.boijmans.nl,museum,Major collection,NL-RtMBVB,,TMS,ja,ja,ja,ja,,,,,,,ja,,,,,,ja,ja,,Under renovation
,Den Haag,Prins Willem-Alexanderhof 20,Nationaal Archief,,https://www.nationaalarchief.nl,archief,,NL-HANA,,MAIS,,,,,,,,ja,ja,,,,,,,,,,ja,,
,Leiden,,Universiteitsbibliotheek Leiden,Universiteit Leiden,https://www.library.universiteitleiden.nl,bibliotheek,,NL-LdnUB,,Alma,,,,,ja,ja,,,,,,,,,ja,,,,,OCLC member
'''


@pytest.fixture
def sample_csv_file():
    """Yield the path of a throwaway CSV file holding the sample data.

    The file is written as UTF-8 before the test body runs and is removed
    again once the test has finished with it.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.csv', delete=False, encoding='utf-8'
    )
    with handle:
        handle.write(SAMPLE_DUTCH_ORGS_CSV)
    csv_path = handle.name

    yield csv_path

    # Teardown: delete=False above means the file has to be removed by hand.
    os.unlink(csv_path)


class TestDutchOrgRecord:
    """Exercise the DutchOrgRecord Pydantic model."""

    def test_create_valid_record(self):
        """A populated record exposes the values it was constructed with."""
        fields = dict(
            plaatsnaam="Amsterdam",
            straat_huisnummer="Museumplein 1",
            organisatie="Rijksmuseum",
            webadres="https://www.rijksmuseum.nl",
            type_organisatie="museum",
            isil_code="NL-AsdRM",
            systeem="Axiell Collections",
            collectie_nederland="ja",
            museum_register="ja",
        )
        rijksmuseum = DutchOrgRecord(**fields)

        assert rijksmuseum.plaatsnaam == "Amsterdam"
        assert rijksmuseum.organisatie == "Rijksmuseum"
        assert rijksmuseum.type_organisatie == "museum"
        assert rijksmuseum.isil_code == "NL-AsdRM"

    def test_type_normalization(self):
        """The type is expected stripped and lower-cased; an empty type becomes None."""
        padded = DutchOrgRecord(organisatie="Test", type_organisatie="  MUSEUM  ")
        assert padded.type_organisatie == "museum"

        # An empty string counts as "no type given".
        blank = DutchOrgRecord(organisatie="Test", type_organisatie="")
        assert blank.type_organisatie is None

    def test_isil_normalization(self):
        """The ISIL code is expected stripped; an empty code becomes None."""
        padded = DutchOrgRecord(organisatie="Test", isil_code="  NL-TEST  ")
        assert padded.isil_code == "NL-TEST"

        # An empty string counts as "no ISIL code given".
        blank = DutchOrgRecord(organisatie="Test", isil_code="")
        assert blank.isil_code is None

    def test_get_platforms(self):
        """Only affirmative yes/no columns are reported as platforms."""
        flags = {
            "collectie_nederland": "ja",
            "museum_register": "ja",
            "rijkscollectie": "x",
            "archieven_nl": "",
            "wo2net": "nee",
        }
        platforms = DutchOrgRecord(organisatie="Test Museum", **flags).get_platforms()

        for expected in ("Collectie Nederland", "Museum Register", "Rijkscollectie"):
            assert expected in platforms
        for unexpected in ("Archieven.nl", "WO2Net"):
            assert unexpected not in platforms
        assert len(platforms) == 3

    def test_is_yes_method(self):
        """The _is_yes helper returns exactly True or False for known markers."""
        record = DutchOrgRecord(organisatie="Test")

        # Affirmative markers; "JA" checks that matching ignores case.
        for affirmative in ("ja", "yes", "x", "✓", "JA"):
            assert record._is_yes(affirmative) is True

        # Negative, empty and missing values.
        for negative in ("nee", "no", "", None):
            assert record._is_yes(negative) is False


class TestDutchOrgsParser:
    """Exercise DutchOrgsParser against the sample CSV fixture."""

    @staticmethod
    def _record_named(records, name):
        """Return the first record whose ``organisatie`` equals *name*."""
        return [rec for rec in records if rec.organisatie == name][0]

    @staticmethod
    def _platforms_of_type(custodian, type_name):
        """Return the digital platforms whose type stringifies to *type_name*."""
        return [
            platform for platform in custodian.digital_platforms
            if str(platform.platform_type) == type_name
        ]

    @staticmethod
    def _identifiers_with_scheme(custodian, scheme):
        """Return the custodian's identifiers registered under *scheme*."""
        return [
            ident for ident in custodian.identifiers
            if ident.identifier_scheme == scheme
        ]

    def test_parse_file(self, sample_csv_file):
        """All five sample rows are parsed with their key columns intact."""
        records = DutchOrgsParser().parse_file(sample_csv_file)

        assert len(records) == 5

        # Kamp Westerbork is the first data row.
        westerbork = records[0]
        assert westerbork.organisatie == "Stichting Herinneringscentrum Kamp Westerbork"
        assert westerbork.plaatsnaam == "Hooghalen"
        assert westerbork.straat_huisnummer == "Oosthalen 8"
        assert westerbork.type_organisatie == "museum"
        assert westerbork.isil_code == "NL-HhlHCKW"
        assert westerbork.systeem == "Atlantis"

        # Rijksmuseum
        rijks = self._record_named(records, "Rijksmuseum")
        assert rijks.plaatsnaam == "Amsterdam"
        assert rijks.isil_code == "NL-AsdRM"
        assert rijks.systeem == "Axiell Collections"
        assert rijks.samenwerkingsverband == "Museumvereniging"
        assert rijks.opmerkingen == "National museum"

        # Nationaal Archief
        na = self._record_named(records, "Nationaal Archief")
        assert na.type_organisatie == "archief"
        assert na.isil_code == "NL-HANA"
        assert na.systeem == "MAIS"

    def test_parse_nonexistent_file(self):
        """A missing input file is reported as FileNotFoundError."""
        missing_path = "/nonexistent/file.csv"
        parser = DutchOrgsParser()

        with pytest.raises(FileNotFoundError):
            parser.parse_file(missing_path)

    def test_to_heritage_custodian_museum(self, sample_csv_file):
        """The Rijksmuseum row converts into a fully populated custodian."""
        parser = DutchOrgsParser()
        rijks_record = self._record_named(
            parser.parse_file(sample_csv_file), "Rijksmuseum"
        )
        custodian = parser.to_heritage_custodian(rijks_record, "/path/to/csv")

        # Basic fields; the type is compared as a string since
        # InstitutionTypeEnum != PermissibleValue.
        assert custodian.name == "Rijksmuseum"
        assert str(custodian.institution_type) == 'MUSEUM'

        # Location
        location = custodian.locations[0]
        assert location.city == "Amsterdam"
        assert location.street_address == "Museumplein 1"
        assert location.country == "NL"

        # Identifiers: exactly one ISIL code and one URL.
        isil_ids = self._identifiers_with_scheme(custodian, "ISIL")
        assert len(isil_ids) == 1
        assert isil_ids[0].identifier_value == "NL-AsdRM"

        url_ids = self._identifiers_with_scheme(custodian, "URL")
        assert len(url_ids) == 1
        assert "rijksmuseum.nl" in url_ids[0].identifier_value

        # Digital platforms must be present at all ...
        assert custodian.digital_platforms is not None
        assert len(custodian.digital_platforms) > 0

        # ... include the collection management system ...
        cms_platforms = self._platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms_platforms) == 1
        assert cms_platforms[0].platform_name == "Axiell Collections"

        # ... and the aggregators the row is flagged for.
        aggregators = self._platforms_of_type(custodian, 'AGGREGATOR')
        assert len(aggregators) > 0
        aggregator_names = [platform.platform_name for platform in aggregators]
        assert "Collectie Nederland" in aggregator_names
        assert "Museum Register" in aggregator_names

        # Provenance
        provenance = custodian.provenance
        assert str(provenance.data_source) == 'DUTCH_ORG_CSV'
        assert str(provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
        assert provenance.extraction_method == "DutchOrgsParser"
        assert provenance.confidence_score == 1.0
        # Notes is not a Provenance field, so the remark is looked for in the
        # description. Samenwerkingsverband is in the DutchHeritageCustodian
        # model, not provenance.
        assert "National museum" in custodian.description

    def test_to_heritage_custodian_archive(self, sample_csv_file):
        """The Nationaal Archief row converts into an ARCHIVE custodian."""
        parser = DutchOrgsParser()
        na_record = self._record_named(
            parser.parse_file(sample_csv_file), "Nationaal Archief"
        )
        custodian = parser.to_heritage_custodian(na_record)

        assert custodian.name == "Nationaal Archief"
        # Compared as a string since InstitutionTypeEnum != PermissibleValue.
        assert str(custodian.institution_type) == 'ARCHIVE'
        assert custodian.locations[0].city == "Den Haag"

        # MAIS is the only collection management system on this row.
        cms = self._platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms) == 1
        assert cms[0].platform_name == "MAIS"

    def test_to_heritage_custodian_library(self, sample_csv_file):
        """The UB Leiden row converts into a LIBRARY custodian."""
        parser = DutchOrgsParser()
        records = parser.parse_file(sample_csv_file)

        # UB Leiden is matched on a substring of its name.
        ub_record = [
            rec for rec in records if "Universiteitsbibliotheek" in rec.organisatie
        ][0]
        custodian = parser.to_heritage_custodian(ub_record)

        assert custodian.name == "Universiteitsbibliotheek Leiden"
        assert str(custodian.institution_type) == 'LIBRARY'
        assert custodian.locations[0].city == "Leiden"

        # Alma is the only collection management system on this row.
        cms = self._platforms_of_type(custodian, 'COLLECTION_MANAGEMENT')
        assert len(cms) == 1
        assert cms[0].platform_name == "Alma"

        # The parent organization is expected in the description.
        assert "Universiteit Leiden" in custodian.description

    def test_partnerships_creation(self, sample_csv_file):
        """Network memberships in the CSV become Partnership objects."""
        parser = DutchOrgsParser()
        # The Rijksmuseum row has many memberships flagged.
        rijks_record = self._record_named(
            parser.parse_file(sample_csv_file), "Rijksmuseum"
        )
        custodian = parser.to_heritage_custodian(rijks_record)

        assert custodian.partnerships is not None
        assert len(custodian.partnerships) > 0

        partnership_names = [p.partner_name for p in custodian.partnerships]
        expected_partners = (
            # National platforms
            "Museum Register",
            "Rijkscollectie",
            "Collectie Nederland",
            # Digitization programs
            "Versnellen",
            # International networks
            "Van Gogh Worldwide",
            # EU platforms (DC4EU is "ja" in CSV)
            "DC4EU",
        )
        for partner in expected_partners:
            assert partner in partnership_names

        # Each partner must carry the partnership type assigned to it.
        expected_types = {
            "Museum Register": "national_museum_certification",
            "Rijkscollectie": "national_collection_designation",
            "Collectie Nederland": "aggregator_participation",
            "Versnellen": "digitization_program",
            "Van Gogh Worldwide": "international_thematic_network",
        }
        for partner, partnership_type in expected_types.items():
            partnership = [
                p for p in custodian.partnerships if p.partner_name == partner
            ][0]
            assert partnership.partnership_type == partnership_type

    def test_parse_and_convert(self, sample_csv_file):
        """parse_and_convert yields one TIER_1 custodian per sample row."""
        custodians = DutchOrgsParser().parse_and_convert(sample_csv_file)

        assert len(custodians) == 5

        # Every custodian carries TIER_1 provenance with full confidence.
        for custodian in custodians:
            provenance = custodian.provenance
            assert str(provenance.data_tier) == 'TIER_1_AUTHORITATIVE'
            assert provenance.confidence_score == 1.0

        # All three institution types in the sample must have been mapped.
        types_found = {str(custodian.institution_type) for custodian in custodians}
        assert {'MUSEUM', 'ARCHIVE', 'LIBRARY'} <= types_found

    def test_url_normalization(self):
        """A web address without a scheme becomes an https:// URL identifier."""
        record = DutchOrgRecord(organisatie="Test Museum", webadres="www.example.com")
        custodian = DutchOrgsParser().to_heritage_custodian(record)

        url_ids = self._identifiers_with_scheme(custodian, "URL")
        assert len(url_ids) == 1
        assert url_ids[0].identifier_value.startswith("https://")


class TestDutchOrgsParserEdgeCases:
    """Test edge cases and error handling.

    The hand-written CSV snippets are written into pytest's per-test
    ``tmp_path`` directory, so no manual cleanup is needed, and they are
    written explicitly as UTF-8 to match the ``sample_csv_file`` fixture
    instead of relying on the platform's default encoding.
    """

    # Header row shared by the CSV snippets below (same columns as the sample data).
    _HEADER = ',Plaatsnaam bezoekadres  ,Straat en huisnummer bezoekadres ,Organisatie,Koepelorganisatie,Webadres organisatie,Type organisatie,Opmerkingen Inez ,ISIL-code (NA),Samenwerkingsverband / Platform,Systeem,Versnellen,Collectie Nederland,Museum register,Rijkscollectie,Bibliotheek collectie,in scope voor DC4EU,DC4EU aansluit route,Archieven.nl,Archives Portal Europe,WO2Net,Modemuze,Maritiem Digitaal,Delfts aardewerk,Stichting Academisch Erfgoed,Coleccion Aruba,Van Gogh Worldwide,OODE24 (Mondriaan),Linked Data,Datasetregister,Versnellen project,Opmerkingen,\n'

    @staticmethod
    def _write_csv(directory, text):
        """Write *text* as UTF-8 to a CSV file in *directory*.

        Returns the file's path as a string, which is the form the original
        tests handed to ``parse_file``.
        """
        csv_path = directory / "dutch_orgs.csv"
        csv_path.write_text(text, encoding='utf-8')
        return str(csv_path)

    def test_empty_csv(self, tmp_path):
        """Test parsing a CSV file that contains only the header row"""
        csv_file = self._write_csv(tmp_path, self._HEADER)

        records = DutchOrgsParser().parse_file(csv_file)

        assert len(records) == 0

    def test_row_without_organization_name_skipped(self, tmp_path):
        """Test that rows without organization name are skipped"""
        csv_data = (
            self._HEADER
            # First row leaves the "Organisatie" column empty.
            + ',Amsterdam,Test Street 1,,,,museum,,,,,,,,,,,,,,,,,,,,,,,,\n'
            + ',Rotterdam,,Valid Museum,,https://example.com,museum,,NL-TEST,,,,,,,,,,,,,,,,,,,,,,\n'
        )
        csv_file = self._write_csv(tmp_path, csv_data)

        records = DutchOrgsParser().parse_file(csv_file)

        # Should only parse the row with organization name
        assert len(records) == 1
        assert records[0].organisatie == "Valid Museum"

    def test_minimal_record(self):
        """Test converting a record that only has an organization name"""
        record = DutchOrgRecord(organisatie="Minimal Museum")
        custodian = DutchOrgsParser().to_heritage_custodian(record)

        assert custodian.name == "Minimal Museum"
        assert str(custodian.institution_type) == 'MIXED'  # No type specified, defaults to MIXED
        assert custodian.identifiers == []  # No identifiers
        assert custodian.locations[0].country == "NL"
        assert custodian.locations[0].city is None
        # Either "absent" representation is accepted for the platforms.
        assert custodian.digital_platforms is None or custodian.digital_platforms == []

    def test_path_as_string_or_pathlib(self, sample_csv_file):
        """Test that parser accepts both string and Path objects"""
        parser = DutchOrgsParser()

        # Test with string
        records_str = parser.parse_file(sample_csv_file)

        # Test with Path
        records_path = parser.parse_file(Path(sample_csv_file))

        assert len(records_str) == len(records_path)
        assert records_str[0].organisatie == records_path[0].organisatie


class TestDutchOrgsTypeMapping:
    """Exercise the mapping from organization type strings to institution types."""

    def test_type_mapping(self):
        """Known Dutch and English type names map to the expected institution type."""
        expected_by_type = {
            "museum": 'MUSEUM',
            "archief": 'ARCHIVE',
            "bibliotheek": 'LIBRARY',
            "library": 'LIBRARY',
            "archive": 'ARCHIVE',
        }
        parser = DutchOrgsParser()

        for dutch_type, expected_type in expected_by_type.items():
            custodian = parser.to_heritage_custodian(
                DutchOrgRecord(organisatie="Test", type_organisatie=dutch_type)
            )
            assert str(custodian.institution_type) == expected_type

    def test_unknown_type_results_in_mixed(self):
        """An unrecognised organization type is expected to map to MIXED."""
        unknown = DutchOrgRecord(organisatie="Test", type_organisatie="unknown_type")

        custodian = DutchOrgsParser().to_heritage_custodian(unknown)

        assert str(custodian.institution_type) == 'MIXED'