glam/tests/rag/test_ontology_mapping.py

"""
Tests for backend.rag.ontology_mapping module.

This module tests the dynamic ontology mapping system that loads LinkML schema
enumerations and provides multilingual matching for the Heritage RAG pipeline.

Coverage:
- Enum loading and caching
- Multilingual synonym extraction from YAML comments
- Natural language fuzzy matching (Dutch, German, French, Spanish)
- Singular/plural handling (bibliotheek → bibliotheken)
- Heritage code mapping (GLAMORCUBESFIXPHDNT)
- Cache invalidation
- Role category keyword extraction
"""

from __future__ import annotations

import os
import tempfile
from pathlib import Path
from unittest.mock import patch

import pytest
import yaml

# Import module under test
from backend.rag.ontology_mapping import (
    GLAMORCUBESFIXPHDNT_CODES,
    SCHEMA_BASE_DIR,
    EnumMapping,
    EnumValueInfo,
    OntologyMapper,
    detect_term_language,
    extract_comma_separated_terms,
    extract_wikidata_id,
    get_custodian_type_mapping,
    get_heritage_code,
    get_ontology_mapper,
    get_role_keywords,
    match_custodian_type,
    match_digital_platform_type,
    match_museum_type,
    normalize_text,
    parse_language_tag,
    reset_ontology_mapper,
)


# =============================================================================
# Fixtures
# =============================================================================


@pytest.fixture
def mapper() -> OntologyMapper:
    """Create a fresh OntologyMapper instance."""
    return OntologyMapper(SCHEMA_BASE_DIR)


@pytest.fixture
def temp_enum_dir(tmp_path: Path) -> Path:
    """Create a temporary directory with test enum files."""
    enums_dir = tmp_path / "modules" / "enums"
    enums_dir.mkdir(parents=True)
    return tmp_path


@pytest.fixture
def sample_enum_yaml() -> dict:
    """Sample enum YAML content for testing."""
    return {
        "enums": {
            "TestEnum": {
                "description": "Test enumeration",
                "permissible_values": {
                    "VALUE_ONE": {
                        "description": "First test value",
                        "meaning": "wikidata:Q12345",
                        "comments": [
                            "waarde een (nl)",
                            "Wert eins (de)",
                            "valeur un (fr)",
                        ],
                    },
                    "VALUE_TWO": {
                        "description": "Second test value",
                        "meaning": "wikidata:Q67890",
                        "comments": [
                            "Includes alpha, beta, gamma",
                        ],
                    },
                    "VALUE_THREE": {
                        "description": "Third value with no comments",
                    },
                },
            }
        }
    }


@pytest.fixture
def temp_mapper(temp_enum_dir: Path, sample_enum_yaml: dict) -> OntologyMapper:
    """Create mapper with temporary test enum file."""
    # Write sample enum file
    enum_file = temp_enum_dir / "modules" / "enums" / "TestEnum.yaml"
    with open(enum_file, "w") as f:
        yaml.dump(sample_enum_yaml, f)

    return OntologyMapper(temp_enum_dir)


# =============================================================================
# Test: normalize_text
# =============================================================================


class TestNormalizeText:
    """Tests for normalize_text function."""

    def test_lowercase(self):
        """Should convert to lowercase."""
        assert normalize_text("MUSEUM") == "museum"
        assert normalize_text("Museum") == "museum"

    def test_strip_whitespace(self):
        """Should strip leading/trailing whitespace."""
        assert normalize_text("  museum  ") == "museum"
        assert normalize_text("\tarchive\n") == "archive"

    def test_remove_diacritics(self):
        """Should remove accents/diacritics."""
        assert normalize_text("Bibliothèque") == "bibliotheque"
        assert normalize_text("musée") == "musee"
        assert normalize_text("Müzeum") == "muzeum"
        assert normalize_text("café") == "cafe"
        assert normalize_text("naïve") == "naive"

    def test_combined(self):
        """Should handle combined normalization."""
        assert normalize_text("  Musée Virtuel  ") == "musee virtuel"
        assert normalize_text("BIBLIOTHÈQUE NATIONALE") == "bibliotheque nationale"


# =============================================================================
# Test: parse_language_tag
# =============================================================================


class TestParseLanguageTag:
    """Tests for parse_language_tag function."""

    def test_dutch_tag(self):
        """Should parse Dutch language tag."""
        lang, term = parse_language_tag("virtueel museum (nl)")
        assert lang == "nl"
        assert term == "virtueel museum"

    def test_german_tag(self):
        """Should parse German language tag."""
        lang, term = parse_language_tag("Digitales Museum (de)")
        assert lang == "de"
        assert term == "Digitales Museum"

    def test_french_tag(self):
        """Should parse French language tag."""
        lang, term = parse_language_tag("musée virtuel (fr)")
        assert lang == "fr"
        assert term == "musée virtuel"

    def test_spanish_tag(self):
        """Should parse Spanish language tag."""
        lang, term = parse_language_tag("museo virtual (es)")
        assert lang == "es"
        assert term == "museo virtual"

    def test_no_tag(self):
        """Should return None for lang when no tag present."""
        lang, term = parse_language_tag("Some plain comment")
        assert lang is None
        assert term == "Some plain comment"

    def test_unsupported_language(self):
        """Should treat unsupported language codes as no tag."""
        lang, term = parse_language_tag("text (xyz)")
        assert lang is None  # xyz is not supported

    def test_uppercase_tag(self):
        """Should handle uppercase language tags."""
        lang, term = parse_language_tag("museum (NL)")
        assert lang == "nl"
        assert term == "museum"


# =============================================================================
# Test: extract_comma_separated_terms
# =============================================================================


class TestExtractCommaSeparatedTerms:
    """Tests for extract_comma_separated_terms function."""

    def test_simple_list(self):
        """Should extract simple comma-separated terms."""
        terms = extract_comma_separated_terms("alpha, beta, gamma")
        assert "alpha" in terms
        assert "beta" in terms
        assert "gamma" in terms

    def test_includes_prefix(self):
        """Should strip 'Includes' prefix."""
        terms = extract_comma_separated_terms("Includes bibliotheken, bibliotecas, bibliothèques")
        assert "bibliotheken" in terms
        assert "bibliotecas" in terms
        assert "bibliothèques" in terms
        assert "Includes" not in " ".join(terms)

    def test_examples_prefix(self):
        """Should strip 'Examples:' prefix."""
        terms = extract_comma_separated_terms("Examples: museum, archive, library")
        assert "museum" in terms
        assert "archive" in terms
        assert "library" in terms

    def test_no_commas(self):
        """Should return empty list for single term."""
        terms = extract_comma_separated_terms("Just a single comment")
        assert terms == []

    def test_skip_long_sentences(self):
        """Should skip terms that look like sentences (> 50 chars)."""
        long_term = "This is a very long sentence that should be skipped because it exceeds fifty characters"
        terms = extract_comma_separated_terms(f"short term, {long_term}")
        assert "short term" in terms
        assert long_term not in terms

    def test_strip_wikidata_references(self):
        """Should strip trailing Wikidata references."""
        terms = extract_comma_separated_terms("botanical gardens (Q473972), zoos")
        assert "botanical gardens" in terms
        assert "zoos" in terms
        assert "(Q473972)" not in " ".join(terms)


# =============================================================================
# Test: extract_wikidata_id
# =============================================================================


class TestExtractWikidataId:
    """Tests for extract_wikidata_id function."""

    def test_wikidata_prefix(self):
        """Should extract ID with wikidata: prefix."""
        assert extract_wikidata_id("wikidata:Q12345") == "Q12345"
        assert extract_wikidata_id("wikidata:Q1225034") == "Q1225034"

    def test_full_uri(self):
        """Should extract ID from full Wikidata URI."""
        assert extract_wikidata_id("http://www.wikidata.org/entity/Q12345") == "Q12345"
        assert extract_wikidata_id("https://www.wikidata.org/wiki/Q67890") == "Q67890"

    def test_none_input(self):
        """Should handle None input."""
        assert extract_wikidata_id(None) is None

    def test_invalid_format(self):
        """Should return None for invalid format."""
        assert extract_wikidata_id("not a wikidata id") is None
        assert extract_wikidata_id("schema:Thing") is None


# =============================================================================
# Test: EnumValueInfo
# =============================================================================


class TestEnumValueInfo:
    """Tests for EnumValueInfo dataclass."""

    def test_basic_creation(self):
        """Should create with minimal fields."""
        info = EnumValueInfo(name="TEST_VALUE")
        assert info.name == "TEST_VALUE"
        assert info.description is None
        assert info.wikidata_id is None
        assert info.synonyms == {}
        assert info.all_synonyms_normalized == []

    def test_full_creation(self):
        """Should create with all fields."""
        info = EnumValueInfo(
            name="MUSEUM",
            description="A museum institution",
            wikidata_id="Q33506",
            synonyms={"nl": ["museum", "musea"], "de": ["Museum"]},
            all_synonyms_normalized=["museum", "musea"],
        )
        assert info.name == "MUSEUM"
        assert info.description == "A museum institution"
        assert info.wikidata_id == "Q33506"
        assert "nl" in info.synonyms
        assert "museum" in info.all_synonyms_normalized


# =============================================================================
# Test: OntologyMapper - Enum Loading
# =============================================================================


class TestOntologyMapperLoading:
    """Tests for OntologyMapper enum loading."""

    def test_load_enum_from_temp_file(self, temp_mapper: OntologyMapper):
        """Should load enum from temporary test file."""
        mapping = temp_mapper.load_enum("TestEnum")
        assert mapping is not None
        assert mapping.enum_name == "TestEnum"
        assert len(mapping.values) == 3
        assert "VALUE_ONE" in mapping.values
        assert "VALUE_TWO" in mapping.values
        assert "VALUE_THREE" in mapping.values

    def test_load_nonexistent_enum(self, temp_mapper: OntologyMapper):
        """Should return None for non-existent enum."""
        mapping = temp_mapper.load_enum("NonExistentEnum")
        assert mapping is None

    def test_extract_wikidata_from_meaning(self, temp_mapper: OntologyMapper):
        """Should extract Wikidata ID from meaning field."""
        mapping = temp_mapper.load_enum("TestEnum")
        assert mapping is not None
        value_one = mapping.values.get("VALUE_ONE")
        assert value_one is not None
        assert value_one.wikidata_id == "Q12345"

    def test_extract_synonyms_from_comments(self, temp_mapper: OntologyMapper):
        """Should extract language-tagged synonyms from comments."""
        mapping = temp_mapper.load_enum("TestEnum")
        assert mapping is not None
        value_one = mapping.values.get("VALUE_ONE")
        assert value_one is not None
        # Check language-specific synonyms
        assert "nl" in value_one.synonyms
        assert "waarde een" in value_one.synonyms["nl"]
        assert "de" in value_one.synonyms
        assert "Wert eins" in value_one.synonyms["de"]

    def test_extract_comma_separated_from_comments(self, temp_mapper: OntologyMapper):
        """Should extract comma-separated terms from comments."""
        mapping = temp_mapper.load_enum("TestEnum")
        assert mapping is not None
        value_two = mapping.values.get("VALUE_TWO")
        assert value_two is not None
        # Comma-separated terms should be in all_synonyms_normalized
        assert "alpha" in value_two.all_synonyms_normalized
        assert "beta" in value_two.all_synonyms_normalized
        assert "gamma" in value_two.all_synonyms_normalized

    def test_load_real_custodian_type_enum(self, mapper: OntologyMapper):
        """Should load real CustodianPrimaryTypeEnum from schema."""
        mapping = mapper.load_enum("CustodianPrimaryTypeEnum")
        assert mapping is not None
        assert len(mapping.values) >= 19  # GLAMORCUBESFIXPHDNT has 19 types
        assert "MUSEUM" in mapping.values
        assert "LIBRARY" in mapping.values
        assert "ARCHIVE" in mapping.values

    def test_load_real_digital_platform_enum(self, mapper: OntologyMapper):
        """Should load real DigitalPlatformTypeEnum from schema."""
        mapping = mapper.load_enum("DigitalPlatformTypeEnum")
        assert mapping is not None
        assert len(mapping.values) >= 50  # Should have many platform types
        assert "VIRTUAL_MUSEUM" in mapping.values

    def test_load_all_enums(self, mapper: OntologyMapper):
        """Should load all enum files from schema directory."""
        all_enums = mapper.load_all_enums()
        assert len(all_enums) >= 10  # Should have many enums
        # Check some expected enums
        enum_names = list(all_enums.keys())
        assert "CustodianPrimaryTypeEnum" in enum_names
        assert "DigitalPlatformTypeEnum" in enum_names


# =============================================================================
# Test: OntologyMapper - Natural Language Matching
# =============================================================================


class TestOntologyMapperMatching:
    """Tests for OntologyMapper natural language matching."""

    def test_exact_match(self, temp_mapper: OntologyMapper):
        """Should match exact normalized text."""
        result = temp_mapper.match_natural_language("value one", "TestEnum")
        assert result == "VALUE_ONE"

    def test_dutch_synonym_match(self, temp_mapper: OntologyMapper):
        """Should match Dutch synonym from comments."""
        result = temp_mapper.match_natural_language("waarde een", "TestEnum")
        assert result == "VALUE_ONE"

    def test_german_synonym_match(self, temp_mapper: OntologyMapper):
        """Should match German synonym from comments."""
        result = temp_mapper.match_natural_language("Wert eins", "TestEnum")
        assert result == "VALUE_ONE"

    def test_comma_term_match(self, temp_mapper: OntologyMapper):
        """Should match comma-separated term."""
        result = temp_mapper.match_natural_language("alpha", "TestEnum")
        assert result == "VALUE_TWO"

    def test_no_match(self, temp_mapper: OntologyMapper):
        """Should return None when no match found."""
        result = temp_mapper.match_natural_language("xyz nonexistent", "TestEnum")
        assert result is None

    def test_real_dutch_bibliotheek(self, mapper: OntologyMapper):
        """Should match Dutch 'bibliotheek' to LIBRARY."""
        result = mapper.match_natural_language("bibliotheek", "CustodianPrimaryTypeEnum")
        assert result == "LIBRARY"

    def test_real_dutch_bibliotheken(self, mapper: OntologyMapper):
        """Should match Dutch plural 'bibliotheken' to LIBRARY (fuzzy)."""
        result = mapper.match_natural_language("bibliotheken", "CustodianPrimaryTypeEnum")
        assert result == "LIBRARY"

    def test_real_dutch_archief(self, mapper: OntologyMapper):
        """Should match Dutch 'archief' to ARCHIVE."""
        result = mapper.match_natural_language("archief", "CustodianPrimaryTypeEnum")
        assert result == "ARCHIVE"

    def test_real_dutch_virtueel_museum(self, mapper: OntologyMapper):
        """Should match Dutch 'virtueel museum' to VIRTUAL_MUSEUM."""
        result = mapper.match_natural_language("virtueel museum", "DigitalPlatformTypeEnum")
        assert result == "VIRTUAL_MUSEUM"

    def test_real_german_digitales_museum(self, mapper: OntologyMapper):
        """Should match German 'Digitales Museum' to VIRTUAL_MUSEUM."""
        result = mapper.match_natural_language("Digitales Museum", "DigitalPlatformTypeEnum")
        assert result == "VIRTUAL_MUSEUM"

    def test_real_spanish_museo_virtual(self, mapper: OntologyMapper):
        """Should match Spanish 'museo virtual' to VIRTUAL_MUSEUM."""
        result = mapper.match_natural_language("museo virtual", "DigitalPlatformTypeEnum")
        assert result == "VIRTUAL_MUSEUM"

    def test_case_insensitive(self, mapper: OntologyMapper):
        """Should be case insensitive."""
        result1 = mapper.match_natural_language("MUSEUM", "CustodianPrimaryTypeEnum")
        result2 = mapper.match_natural_language("museum", "CustodianPrimaryTypeEnum")
        result3 = mapper.match_natural_language("Museum", "CustodianPrimaryTypeEnum")
        assert result1 == result2 == result3 == "MUSEUM"


# =============================================================================
# Test: OntologyMapper - Heritage Code Mapping
# =============================================================================


class TestOntologyMapperHeritageCodes:
    """Tests for heritage code mapping."""

    def test_museum_code(self, mapper: OntologyMapper):
        """Should map MUSEUM to M."""
        assert mapper.get_heritage_type_code("MUSEUM") == "M"

    def test_library_code(self, mapper: OntologyMapper):
        """Should map LIBRARY to L."""
        assert mapper.get_heritage_type_code("LIBRARY") == "L"

    def test_archive_code(self, mapper: OntologyMapper):
        """Should map ARCHIVE to A."""
        assert mapper.get_heritage_type_code("ARCHIVE") == "A"

    def test_gallery_code(self, mapper: OntologyMapper):
        """Should map GALLERY to G."""
        assert mapper.get_heritage_type_code("GALLERY") == "G"

    def test_unknown_code(self, mapper: OntologyMapper):
        """Should return None for unknown type."""
        assert mapper.get_heritage_type_code("UNKNOWN_TYPE") is None

    def test_get_full_mapping(self, mapper: OntologyMapper):
        """Should return complete type-to-code mapping."""
        mapping = mapper.get_custodian_type_to_code_mapping()
        assert len(mapping) == 19  # GLAMORCUBESFIXPHDNT has 19 types
        assert mapping["MUSEUM"] == "M"
        assert mapping["LIBRARY"] == "L"
        assert mapping["ARCHIVE"] == "A"
        assert mapping["GALLERY"] == "G"
        # Check all expected codes are present
        expected_codes = set("GLAMORCUBESFIXPHDNT")
        actual_codes = set(mapping.values())
        assert actual_codes == expected_codes


# =============================================================================
# Test: OntologyMapper - Caching
# =============================================================================


class TestOntologyMapperCaching:
    """Tests for caching behavior."""

    def test_enum_is_cached(self, mapper: OntologyMapper):
        """Should cache enum after first load."""
        # First load
        mapping1 = mapper.load_enum("CustodianPrimaryTypeEnum")
        assert mapping1 is not None
        assert "CustodianPrimaryTypeEnum" in mapper._cache

        # Second load should return cached version
        mapping2 = mapper.load_enum("CustodianPrimaryTypeEnum")
        assert mapping1 is mapping2  # Same object

    def test_force_reload(self, mapper: OntologyMapper):
        """Should reload when force_reload=True."""
        # First load
        mapping1 = mapper.load_enum("CustodianPrimaryTypeEnum")

        # Force reload
        mapping2 = mapper.load_enum("CustodianPrimaryTypeEnum", force_reload=True)

        # Should be different objects
        assert mapping1 is not mapping2

    def test_clear_cache(self, mapper: OntologyMapper):
        """Should clear all cached enums."""
        # Load some enums
        mapper.load_enum("CustodianPrimaryTypeEnum")
        mapper.load_enum("DigitalPlatformTypeEnum")
        assert len(mapper._cache) >= 2

        # Clear cache
        mapper.clear_cache()
        assert len(mapper._cache) == 0
        assert len(mapper._file_mtimes) == 0


# =============================================================================
# Test: Convenience Functions
# =============================================================================


class TestConvenienceFunctions:
    """Tests for module-level convenience functions."""

    @pytest.fixture(autouse=True)
    def reset_singleton(self):
        """Reset singleton before each test."""
        reset_ontology_mapper()
        yield
        reset_ontology_mapper()

    def test_match_custodian_type(self):
        """Should match custodian type via convenience function."""
        assert match_custodian_type("museum") == "MUSEUM"
        assert match_custodian_type("bibliotheek") == "LIBRARY"
        assert match_custodian_type("archief") == "ARCHIVE"

    def test_match_digital_platform_type(self):
        """Should match digital platform type via convenience function."""
        assert match_digital_platform_type("virtueel museum") == "VIRTUAL_MUSEUM"

    def test_match_museum_type(self):
        """Should match museum type via convenience function."""
        # This tests against MuseumTypeEnum
        result = match_museum_type("art museum")
        # Result depends on what's in MuseumTypeEnum
        assert result is None or isinstance(result, str)

    def test_get_heritage_code(self):
        """Should get heritage code via convenience function."""
        assert get_heritage_code("MUSEUM") == "M"
        assert get_heritage_code("LIBRARY") == "L"
        assert get_heritage_code("ARCHIVE") == "A"

    def test_get_custodian_type_mapping(self):
        """Should get full mapping via convenience function."""
        mapping = get_custodian_type_mapping()
        assert len(mapping) == 19
        assert mapping["MUSEUM"] == "M"

    def test_get_ontology_mapper_singleton(self):
        """Should return singleton instance."""
        mapper1 = get_ontology_mapper()
        mapper2 = get_ontology_mapper()
        assert mapper1 is mapper2


# =============================================================================
# Test: Role Category Keywords
# =============================================================================


class TestRoleCategoryKeywords:
    """Tests for role category keyword extraction."""

    def test_get_role_keywords(self, mapper: OntologyMapper):
        """Should extract role category keywords."""
        keywords = mapper.get_role_category_keywords()
        # May return empty dict if StaffRole.yaml doesn't exist
        assert isinstance(keywords, dict)

    def test_get_role_keywords_convenience(self):
        """Should work via convenience function."""
        reset_ontology_mapper()
        keywords = get_role_keywords()
        assert isinstance(keywords, dict)


# =============================================================================
# Test: Prompt Formatting
# =============================================================================


class TestPromptFormatting:
    """Tests for DSPy prompt formatting."""

    def test_get_enum_values_for_prompt(self, mapper: OntologyMapper):
        """Should format enum values for prompt injection."""
        prompt = mapper.get_enum_values_for_prompt("CustodianPrimaryTypeEnum", max_values=5)
        assert "Valid values for CustodianPrimaryTypeEnum:" in prompt
        assert "MUSEUM" in prompt or "LIBRARY" in prompt  # At least some values
        assert "... and" in prompt  # Should indicate more values exist

    def test_get_valid_filter_values(self, mapper: OntologyMapper):
        """Should return list of valid filter values."""
        values = mapper.get_valid_filter_values("CustodianPrimaryTypeEnum")
        assert isinstance(values, list)
        assert len(values) >= 19
        assert "MUSEUM" in values
        assert "LIBRARY" in values


# =============================================================================
# Test: GLAMORCUBESFIXPHDNT Codes Constant
# =============================================================================


class TestGLAMORCUBESFIXPHDNTCodes:
    """Tests for GLAMORCUBESFIXPHDNT_CODES constant."""

    def test_all_codes_present(self):
        """Should have all 19 codes in mnemonic."""
        expected = "GLAMORCUBESFIXPHDNT"
        actual_codes = set(GLAMORCUBESFIXPHDNT_CODES.values())
        assert actual_codes == set(expected)

    def test_all_codes_single_letter(self):
        """All codes should be single letters."""
        for type_name, code in GLAMORCUBESFIXPHDNT_CODES.items():
            assert len(code) == 1, f"{type_name} has non-single-letter code: {code}"
            assert code.isalpha(), f"{type_name} has non-letter code: {code}"
            assert code.isupper(), f"{type_name} has non-uppercase code: {code}"

    def test_code_count(self):
        """Should have exactly 19 type-to-code mappings."""
        assert len(GLAMORCUBESFIXPHDNT_CODES) == 19


# =============================================================================
# Test: Similarity Function
# =============================================================================


class TestSimilarityFunction:
    """Tests for _simple_similarity method."""

    def test_exact_match(self, mapper: OntologyMapper):
        """Exact match should return 1.0."""
        score = mapper._simple_similarity("museum", "museum")
        assert score == 1.0

    def test_prefix_match(self, mapper: OntologyMapper):
        """Prefix match should return high score."""
        # bibliotheek → bibliotheken (Dutch singular/plural)
        score = mapper._simple_similarity("bibliotheek", "bibliotheken")
        assert score >= 0.9

    def test_stem_match(self, mapper: OntologyMapper):
        """Shared stem should return good score."""
        # archief → archieven
        score = mapper._simple_similarity("archief", "archieven")
        assert score >= 0.85

    def test_no_similarity(self, mapper: OntologyMapper):
        """Completely different strings should return low score."""
        score = mapper._simple_similarity("museum", "xyz")
        assert score < 0.5

    def test_empty_string(self, mapper: OntologyMapper):
        """Empty strings should return 0.0."""
        assert mapper._simple_similarity("", "museum") == 0.0
        assert mapper._simple_similarity("museum", "") == 0.0
        assert mapper._simple_similarity("", "") == 0.0


# =============================================================================
# Test: Integration with hybrid_retriever
# =============================================================================


class TestHybridRetrieverIntegration:
    """Tests verifying integration with hybrid_retriever.py."""

    @pytest.fixture(autouse=True)
    def reset(self):
        """Reset singleton before each test."""
        reset_ontology_mapper()
        yield

    def test_mapping_has_expected_format(self):
        """Mapping should match expected format for hybrid_retriever."""
        mapping = get_custodian_type_mapping()

        # All keys should be uppercase enum values
        for key in mapping:
            assert key.isupper() or key == key.upper().replace("_", "_")

        # All values should be single uppercase letters
        for value in mapping.values():
            assert len(value) == 1
            assert value.isupper()

    def test_heritage_code_returns_none_for_invalid(self):
        """get_heritage_code should return None for invalid types."""
        assert get_heritage_code("INVALID_TYPE") is None
        assert get_heritage_code("") is None

    def test_consistent_with_hardcoded_values(self):
        """Dynamic mapping should match expected hardcoded values."""
        mapping = get_custodian_type_mapping()

        # These are the critical mappings that hybrid_retriever depends on
        expected = {
            "GALLERY": "G",
            "LIBRARY": "L",
            "ARCHIVE": "A",
            "MUSEUM": "M",
            "OFFICIAL_INSTITUTION": "O",
            "RESEARCH_CENTER": "R",
            "DIGITAL_PLATFORM": "D",
        }

        for enum_val, code in expected.items():
            assert mapping.get(enum_val) == code, f"Mismatch for {enum_val}"


# =============================================================================
# Test: Edge Cases
# =============================================================================


class TestEdgeCases:
    """Tests for edge cases and error handling."""

    def test_match_empty_string(self, mapper: OntologyMapper):
        """Should handle empty string input."""
        result = mapper.match_natural_language("", "CustodianPrimaryTypeEnum")
        assert result is None

    def test_match_whitespace_only(self, mapper: OntologyMapper):
        """Should handle whitespace-only input."""
        result = mapper.match_natural_language("   ", "CustodianPrimaryTypeEnum")
        assert result is None

    def test_match_nonexistent_enum(self, mapper: OntologyMapper):
        """Should return None for non-existent enum."""
        result = mapper.match_natural_language("museum", "NonExistentEnum")
        assert result is None

    def test_load_malformed_yaml(self, temp_enum_dir: Path):
        """Should handle malformed YAML gracefully."""
        enum_file = temp_enum_dir / "modules" / "enums" / "BrokenEnum.yaml"
        with open(enum_file, "w") as f:
            f.write("this is not: valid: yaml: content:")

        mapper = OntologyMapper(temp_enum_dir)
        result = mapper.load_enum("BrokenEnum")
        assert result is None

    def test_unicode_normalization(self, mapper: OntologyMapper):
        """Should handle various unicode representations."""
        # e with combining acute accent vs precomposed é
        result1 = mapper.match_natural_language("musée", "CustodianPrimaryTypeEnum")  # precomposed
        result2 = mapper.match_natural_language("musée", "CustodianPrimaryTypeEnum")  # combining
        # Both should normalize to "musee" and potentially match
        assert result1 == result2


# =============================================================================
# Test: Language Detection
# =============================================================================


class TestDetectTermLanguage:
    """Tests for the detect_term_language function.

    This function uses a hybrid approach:
    1. Heritage-specific vocabulary for known heritage terms (highest priority)
    2. fast-langdetect library for general language detection (with confidence threshold)
    3. English default for multi-word phrases without clear indicators

    The heritage vocabulary focuses on terms that general-purpose language
    detectors often misclassify (e.g., "musea" as Italian instead of Dutch).
    """

    def test_detect_dutch_museum_terms(self):
        """Dutch museum-related terms in heritage vocabulary should be 'nl'."""
        # "musea" is in heritage vocabulary - fast-langdetect often misclassifies it
        assert detect_term_language("musea") == "nl"
        # "museum" is generic - depends on fast-langdetect (en/nl/de all valid)
        result = detect_term_language("museum")
        assert result in ("nl", "de", "en")  # Accept any valid detection

    def test_detect_dutch_library_terms(self):
        """Dutch library terms should be detected as 'nl'."""
        assert detect_term_language("bibliotheken") == "nl"
        assert detect_term_language("bibliotheek") == "nl"
        # Multi-word terms without English indicators default to heritage vocab match
        assert detect_term_language("openbare bibliotheek") in ("nl", "en")

    def test_detect_dutch_archive_terms(self):
        """Dutch archive terms should be detected as 'nl'."""
        assert detect_term_language("archieven") == "nl"
        assert detect_term_language("archief") == "nl"
        # "nationaal" triggers heritage vocab match for Dutch
        assert detect_term_language("nationaal archief") in ("nl", "en")  # "national" may trigger English
        # Compound terms use prefix matching
        assert detect_term_language("gemeentearchief") in ("nl", None)

    def test_detect_french_terms(self):
        """French heritage terms with diacritics should be detected as 'fr'."""
        # Terms with diacritics are reliably detected by fast-langdetect
        assert detect_term_language("musées") == "fr"
        assert detect_term_language("musée") == "fr"
        assert detect_term_language("bibliothèques") == "fr"
        assert detect_term_language("bibliothèque") == "fr"
        # "archives" without diacritics is ambiguous (French/English)
        result = detect_term_language("archives")
        assert result in ("fr", "en")
        # Diacritics provide clear French signal
        result = detect_term_language("société historique")
        assert result in ("fr", "en")  # "historique" detected by fast-langdetect

    def test_detect_spanish_terms(self):
        """Spanish heritage terms should be detected as 'es'."""
        # "museos" is in heritage vocabulary
        result = detect_term_language("museos")
        assert result in ("es", None)  # May not match if not in reduced vocab
        # "bibliotecas" and "archivos" are in heritage vocabulary
        assert detect_term_language("bibliotecas") in ("es", "pt")  # Shared term
        assert detect_term_language("archivos") == "es"

    def test_detect_german_terms(self):
        """German heritage terms should be detected as 'de'."""
        assert detect_term_language("museen") == "de"
        # "bibliothek" may match Dutch vocabulary first due to prefix matching
        result = detect_term_language("bibliothek")
        assert result in ("de", "nl")  # Both have similar terms
        assert detect_term_language("archiv") == "de"
        assert detect_term_language("sammlung") == "de"

    def test_detect_english_terms(self):
        """English heritage terms should be detected as 'en'."""
        assert detect_term_language("museums") == "en"
        assert detect_term_language("libraries") == "en"
        assert detect_term_language("gallery") == "en"
        assert detect_term_language("national library") == "en"
        assert detect_term_language("public archives") == "en"

    def test_detect_italian_terms(self):
        """Italian heritage terms should be detected as 'it'."""
        assert detect_term_language("musei") == "it"
        assert detect_term_language("biblioteche") == "it"
        assert detect_term_language("archivi") == "it"

    def test_detect_portuguese_terms(self):
        """Portuguese heritage terms should be detected as 'pt'."""
        assert detect_term_language("museus") == "pt"
        assert detect_term_language("bibliotecas") in ("pt", "es")  # Shared term
        assert detect_term_language("arquivos") == "pt"

    def test_unknown_term_returns_none(self):
        """Unknown single-word terms should return None."""
        assert detect_term_language("xyz123") is None
        assert detect_term_language("asdfghjkl") is None

    def test_empty_string_defaults_to_english(self):
        """Empty string should return English as default."""
        assert detect_term_language("") == "en"

    def test_whitespace_only_defaults_to_english(self):
        """Whitespace-only input should return English as default."""
        assert detect_term_language("   ") == "en"

    def test_case_insensitive_detection(self):
        """Detection should be case-insensitive."""
        assert detect_term_language("MUSEA") == "nl"
        assert detect_term_language("Musées") == "fr"
        # "MUSEOS" relies on fast-langdetect after heritage vocab check
        result = detect_term_language("MUSEOS")
        assert result in ("es", None)
        assert detect_term_language("Libraries") == "en"

    def test_compound_dutch_terms(self):
        """Compound Dutch terms should be detected via heritage vocabulary or prefix matching."""
        # "rijks" is in heritage vocabulary as prefix
        assert detect_term_language("rijksmuseum") in ("nl", None)
        # "gemeente" matches via prefix with "gemeentelijk"
        assert detect_term_language("gemeentearchief") in ("nl", None)

    def test_priority_when_ambiguous(self):
        """Heritage vocabulary takes precedence for known terms.

        When a term is in heritage vocabulary, that language is returned.
        For terms not in vocabulary, fast-langdetect determines the result.
        """
        # "archiv" is in German heritage vocabulary
        assert detect_term_language("archiv") == "de"

        # "museum" is not in heritage vocabulary (too ambiguous)
        # fast-langdetect will classify it
        result = detect_term_language("museum")
        assert result in ("nl", "de", "en")

        # "musea" is specifically in Dutch heritage vocabulary
        assert detect_term_language("musea") == "nl"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])