"""Tests for backend.rag.ontology_mapping module.

This module tests the dynamic ontology mapping system that loads LinkML
schema enumerations and provides multilingual matching for the Heritage
RAG pipeline.

Coverage:
- Enum loading and caching
- Multilingual synonym extraction from YAML comments
- Natural language fuzzy matching (Dutch, German, French, Spanish)
- Singular/plural handling (bibliotheek → bibliotheken)
- Heritage code mapping (GLAMORCUBESFIXPHDNT)
- Cache invalidation
- Role category keyword extraction
"""

from __future__ import annotations

# NOTE(review): `os`, `tempfile`, and `patch` appear unused in this module —
# candidates for removal once confirmed nothing else relies on them.
import os
import tempfile
from pathlib import Path
from unittest.mock import patch

import pytest
import yaml

# Import module under test
from backend.rag.ontology_mapping import (
    GLAMORCUBESFIXPHDNT_CODES,
    SCHEMA_BASE_DIR,
    EnumMapping,
    EnumValueInfo,
    OntologyMapper,
    detect_term_language,
    extract_comma_separated_terms,
    extract_wikidata_id,
    get_custodian_type_mapping,
    get_heritage_code,
    get_ontology_mapper,
    get_role_keywords,
    match_custodian_type,
    match_digital_platform_type,
    match_museum_type,
    normalize_text,
    parse_language_tag,
    reset_ontology_mapper,
)


# =============================================================================
# Fixtures
# =============================================================================


@pytest.fixture
def mapper() -> OntologyMapper:
    """Create a fresh OntologyMapper instance."""
    return OntologyMapper(SCHEMA_BASE_DIR)


@pytest.fixture
def temp_enum_dir(tmp_path: Path) -> Path:
    """Create a temporary directory with test enum files.

    Returns the *root* tmp_path (not the enums subdir), mirroring the
    layout OntologyMapper expects: <root>/modules/enums/<EnumName>.yaml.
    """
    enums_dir = tmp_path / "modules" / "enums"
    enums_dir.mkdir(parents=True)
    return tmp_path


@pytest.fixture
def sample_enum_yaml() -> dict:
    """Sample enum YAML content for testing.

    Exercises the three shapes the loader must handle: language-tagged
    comments, comma-separated "Includes ..." comments, and no comments.
    """
    return {
        "enums": {
            "TestEnum": {
                "description": "Test enumeration",
                "permissible_values": {
                    "VALUE_ONE": {
                        "description": "First test value",
                        "meaning": "wikidata:Q12345",
                        "comments": [
                            "waarde een (nl)",
                            "Wert eins (de)",
                            "valeur un (fr)",
                        ],
                    },
                    "VALUE_TWO": {
                        "description": "Second test value",
                        "meaning": "wikidata:Q67890",
                        "comments": [
                            "Includes alpha, beta, gamma",
                        ],
                    },
                    "VALUE_THREE": {
                        "description": "Third value with no comments",
                    },
                },
            }
        }
    }


@pytest.fixture
def temp_mapper(temp_enum_dir: Path, sample_enum_yaml: dict) -> OntologyMapper:
    """Create mapper with temporary test enum file."""
    # Write sample enum file
    enum_file = temp_enum_dir / "modules" / "enums" / "TestEnum.yaml"
    with open(enum_file, "w") as f:
        yaml.dump(sample_enum_yaml, f)
    return OntologyMapper(temp_enum_dir)


# =============================================================================
# Test: normalize_text
# =============================================================================


class TestNormalizeText:
    """Tests for normalize_text function."""

    def test_lowercase(self):
        """Should convert to lowercase."""
        assert normalize_text("MUSEUM") == "museum"
        assert normalize_text("Museum") == "museum"

    def test_strip_whitespace(self):
        """Should strip leading/trailing whitespace."""
        assert normalize_text(" museum ") == "museum"
        assert normalize_text("\tarchive\n") == "archive"

    def test_remove_diacritics(self):
        """Should remove accents/diacritics."""
        assert normalize_text("Bibliothèque") == "bibliotheque"
        assert normalize_text("musée") == "musee"
        assert normalize_text("Müzeum") == "muzeum"
        assert normalize_text("café") == "cafe"
        assert normalize_text("naïve") == "naive"

    def test_combined(self):
        """Should handle combined normalization."""
        assert normalize_text(" Musée Virtuel ") == "musee virtuel"
        assert normalize_text("BIBLIOTHÈQUE NATIONALE") == "bibliotheque nationale"


# =============================================================================
# Test: parse_language_tag
# =============================================================================


class TestParseLanguageTag:
    """Tests for parse_language_tag function."""

    def test_dutch_tag(self):
        """Should parse Dutch language tag."""
        lang, term = parse_language_tag("virtueel museum (nl)")
        assert lang == "nl"
        assert term == "virtueel museum"

    def test_german_tag(self):
        """Should parse German language tag."""
        lang, term = parse_language_tag("Digitales Museum (de)")
        assert lang == "de"
        assert term == "Digitales Museum"

    def test_french_tag(self):
        """Should parse French language tag."""
        lang, term = parse_language_tag("musée virtuel (fr)")
        assert lang == "fr"
        assert term == "musée virtuel"

    def test_spanish_tag(self):
        """Should parse Spanish language tag."""
        lang, term = parse_language_tag("museo virtual (es)")
        assert lang == "es"
        assert term == "museo virtual"

    def test_no_tag(self):
        """Should return None for lang when no tag present."""
        lang, term = parse_language_tag("Some plain comment")
        assert lang is None
        assert term == "Some plain comment"

    def test_unsupported_language(self):
        """Should treat unsupported language codes as no tag."""
        lang, term = parse_language_tag("text (xyz)")
        assert lang is None  # xyz is not supported

    def test_uppercase_tag(self):
        """Should handle uppercase language tags."""
        lang, term = parse_language_tag("museum (NL)")
        assert lang == "nl"
        assert term == "museum"


# =============================================================================
# Test: extract_comma_separated_terms
# =============================================================================


class TestExtractCommaSeparatedTerms:
    """Tests for extract_comma_separated_terms function."""

    def test_simple_list(self):
        """Should extract simple comma-separated terms."""
        terms = extract_comma_separated_terms("alpha, beta, gamma")
        assert "alpha" in terms
        assert "beta" in terms
        assert "gamma" in terms

    def test_includes_prefix(self):
        """Should strip 'Includes' prefix."""
        terms = extract_comma_separated_terms("Includes bibliotheken, bibliotecas, bibliothèques")
        assert "bibliotheken" in terms
        assert "bibliotecas" in terms
        assert "bibliothèques" in terms
        assert "Includes" not in " ".join(terms)

    def test_examples_prefix(self):
        """Should strip 'Examples:' prefix."""
        terms = extract_comma_separated_terms("Examples: museum, archive, library")
        assert "museum" in terms
        assert "archive" in terms
        assert "library" in terms

    def test_no_commas(self):
        """Should return empty list for single term."""
        terms = extract_comma_separated_terms("Just a single comment")
        assert terms == []

    def test_skip_long_sentences(self):
        """Should skip terms that look like sentences (> 50 chars)."""
        long_term = "This is a very long sentence that should be skipped because it exceeds fifty characters"
        terms = extract_comma_separated_terms(f"short term, {long_term}")
        assert "short term" in terms
        assert long_term not in terms

    def test_strip_wikidata_references(self):
        """Should strip trailing Wikidata references."""
        terms = extract_comma_separated_terms("botanical gardens (Q473972), zoos")
        assert "botanical gardens" in terms
        assert "zoos" in terms
        assert "(Q473972)" not in " ".join(terms)


# =============================================================================
# Test: extract_wikidata_id
# =============================================================================


class TestExtractWikidataId:
    """Tests for extract_wikidata_id function."""

    def test_wikidata_prefix(self):
        """Should extract ID with wikidata: prefix."""
        assert extract_wikidata_id("wikidata:Q12345") == "Q12345"
        assert extract_wikidata_id("wikidata:Q1225034") == "Q1225034"

    def test_full_uri(self):
        """Should extract ID from full Wikidata URI."""
        assert extract_wikidata_id("http://www.wikidata.org/entity/Q12345") == "Q12345"
        assert extract_wikidata_id("https://www.wikidata.org/wiki/Q67890") == "Q67890"

    def test_none_input(self):
        """Should handle None input."""
        assert extract_wikidata_id(None) is None

    def test_invalid_format(self):
        """Should return None for invalid format."""
        assert extract_wikidata_id("not a wikidata id") is None
        assert extract_wikidata_id("schema:Thing") is None


# =============================================================================
# Test: EnumValueInfo
# =============================================================================


class TestEnumValueInfo:
    """Tests for EnumValueInfo dataclass."""

    def test_basic_creation(self):
        """Should create with minimal fields."""
        info = EnumValueInfo(name="TEST_VALUE")
        assert info.name == "TEST_VALUE"
        assert info.description is None
        assert info.wikidata_id is None
        assert info.synonyms == {}
        assert info.all_synonyms_normalized == []

    def test_full_creation(self):
        """Should create with all fields."""
        info = EnumValueInfo(
            name="MUSEUM",
            description="A museum institution",
            wikidata_id="Q33506",
            synonyms={"nl": ["museum", "musea"], "de": ["Museum"]},
            all_synonyms_normalized=["museum", "musea"],
        )
        assert info.name == "MUSEUM"
        assert info.description == "A museum institution"
        assert info.wikidata_id == "Q33506"
        assert "nl" in info.synonyms
        assert "museum" in info.all_synonyms_normalized


# =============================================================================
# Test: OntologyMapper - Enum Loading
# =============================================================================


class TestOntologyMapperLoading:
    """Tests for OntologyMapper enum loading."""

    def test_load_enum_from_temp_file(self, temp_mapper: OntologyMapper):
        """Should load enum from temporary test file."""
        mapping = temp_mapper.load_enum("TestEnum")
        assert mapping is not None
        assert mapping.enum_name == "TestEnum"
        assert len(mapping.values) == 3
        assert "VALUE_ONE" in mapping.values
        assert "VALUE_TWO" in mapping.values
        assert "VALUE_THREE" in mapping.values

    def test_load_nonexistent_enum(self, temp_mapper: OntologyMapper):
        """Should return None for non-existent enum."""
        mapping = temp_mapper.load_enum("NonExistentEnum")
        assert mapping is None

    def test_extract_wikidata_from_meaning(self, temp_mapper: OntologyMapper):
        """Should extract Wikidata ID from meaning field."""
        mapping = temp_mapper.load_enum("TestEnum")
        assert mapping is not None
        value_one = mapping.values.get("VALUE_ONE")
        assert value_one is not None
        assert value_one.wikidata_id == "Q12345"

    def test_extract_synonyms_from_comments(self, temp_mapper: OntologyMapper):
        """Should extract language-tagged synonyms from comments."""
        mapping = temp_mapper.load_enum("TestEnum")
        assert mapping is not None
        value_one = mapping.values.get("VALUE_ONE")
        assert value_one is not None
        # Check language-specific synonyms
        assert "nl" in value_one.synonyms
        assert "waarde een" in value_one.synonyms["nl"]
        assert "de" in value_one.synonyms
        assert "Wert eins" in value_one.synonyms["de"]

    def test_extract_comma_separated_from_comments(self, temp_mapper: OntologyMapper):
        """Should extract comma-separated terms from comments."""
        mapping = temp_mapper.load_enum("TestEnum")
        assert mapping is not None
        value_two = mapping.values.get("VALUE_TWO")
        assert value_two is not None
        # Comma-separated terms should be in all_synonyms_normalized
        assert "alpha" in value_two.all_synonyms_normalized
        assert "beta" in value_two.all_synonyms_normalized
        assert "gamma" in value_two.all_synonyms_normalized

    def test_load_real_custodian_type_enum(self, mapper: OntologyMapper):
        """Should load real CustodianPrimaryTypeEnum from schema."""
        mapping = mapper.load_enum("CustodianPrimaryTypeEnum")
        assert mapping is not None
        assert len(mapping.values) >= 19  # GLAMORCUBESFIXPHDNT has 19 types
        assert "MUSEUM" in mapping.values
        assert "LIBRARY" in mapping.values
        assert "ARCHIVE" in mapping.values

    def test_load_real_digital_platform_enum(self, mapper: OntologyMapper):
        """Should load real DigitalPlatformTypeEnum from schema."""
        mapping = mapper.load_enum("DigitalPlatformTypeEnum")
        assert mapping is not None
        assert len(mapping.values) >= 50  # Should have many platform types
        assert "VIRTUAL_MUSEUM" in mapping.values

    def test_load_all_enums(self, mapper: OntologyMapper):
        """Should load all enum files from schema directory."""
        all_enums = mapper.load_all_enums()
        assert len(all_enums) >= 10  # Should have many enums
        # Check some expected enums
        enum_names = list(all_enums.keys())
        assert "CustodianPrimaryTypeEnum" in enum_names
        assert "DigitalPlatformTypeEnum" in enum_names


# =============================================================================
# Test: OntologyMapper - Natural Language Matching
# =============================================================================


class TestOntologyMapperMatching:
    """Tests for OntologyMapper natural language matching."""

    def test_exact_match(self, temp_mapper: OntologyMapper):
        """Should match exact normalized text."""
        result = temp_mapper.match_natural_language("value one", "TestEnum")
        assert result == "VALUE_ONE"

    def test_dutch_synonym_match(self, temp_mapper: OntologyMapper):
        """Should match Dutch synonym from comments."""
        result = temp_mapper.match_natural_language("waarde een", "TestEnum")
        assert result == "VALUE_ONE"

    def test_german_synonym_match(self, temp_mapper: OntologyMapper):
        """Should match German synonym from comments."""
        result = temp_mapper.match_natural_language("Wert eins", "TestEnum")
        assert result == "VALUE_ONE"

    def test_comma_term_match(self, temp_mapper: OntologyMapper):
        """Should match comma-separated term."""
        result = temp_mapper.match_natural_language("alpha", "TestEnum")
        assert result == "VALUE_TWO"

    def test_no_match(self, temp_mapper: OntologyMapper):
        """Should return None when no match found."""
        result = temp_mapper.match_natural_language("xyz nonexistent", "TestEnum")
        assert result is None

    def test_real_dutch_bibliotheek(self, mapper: OntologyMapper):
        """Should match Dutch 'bibliotheek' to LIBRARY."""
        result = mapper.match_natural_language("bibliotheek", "CustodianPrimaryTypeEnum")
        assert result == "LIBRARY"

    def test_real_dutch_bibliotheken(self, mapper: OntologyMapper):
        """Should match Dutch plural 'bibliotheken' to LIBRARY (fuzzy)."""
        result = mapper.match_natural_language("bibliotheken", "CustodianPrimaryTypeEnum")
        assert result == "LIBRARY"

    def test_real_dutch_archief(self, mapper: OntologyMapper):
        """Should match Dutch 'archief' to ARCHIVE."""
        result = mapper.match_natural_language("archief", "CustodianPrimaryTypeEnum")
        assert result == "ARCHIVE"

    def test_real_dutch_virtueel_museum(self, mapper: OntologyMapper):
        """Should match Dutch 'virtueel museum' to VIRTUAL_MUSEUM."""
        result = mapper.match_natural_language("virtueel museum", "DigitalPlatformTypeEnum")
        assert result == "VIRTUAL_MUSEUM"

    def test_real_german_digitales_museum(self, mapper: OntologyMapper):
        """Should match German 'Digitales Museum' to VIRTUAL_MUSEUM."""
        result = mapper.match_natural_language("Digitales Museum", "DigitalPlatformTypeEnum")
        assert result == "VIRTUAL_MUSEUM"

    def test_real_spanish_museo_virtual(self, mapper: OntologyMapper):
        """Should match Spanish 'museo virtual' to VIRTUAL_MUSEUM."""
        result = mapper.match_natural_language("museo virtual", "DigitalPlatformTypeEnum")
        assert result == "VIRTUAL_MUSEUM"

    def test_case_insensitive(self, mapper: OntologyMapper):
        """Should be case insensitive."""
        result1 = mapper.match_natural_language("MUSEUM", "CustodianPrimaryTypeEnum")
        result2 = mapper.match_natural_language("museum", "CustodianPrimaryTypeEnum")
        result3 = mapper.match_natural_language("Museum", "CustodianPrimaryTypeEnum")
        assert result1 == result2 == result3 == "MUSEUM"


# =============================================================================
# Test: OntologyMapper - Heritage Code Mapping
# =============================================================================


class TestOntologyMapperHeritageCodes:
    """Tests for heritage code mapping."""

    def test_museum_code(self, mapper: OntologyMapper):
        """Should map MUSEUM to M."""
        assert mapper.get_heritage_type_code("MUSEUM") == "M"

    def test_library_code(self, mapper: OntologyMapper):
        """Should map LIBRARY to L."""
        assert mapper.get_heritage_type_code("LIBRARY") == "L"

    def test_archive_code(self, mapper: OntologyMapper):
        """Should map ARCHIVE to A."""
        assert mapper.get_heritage_type_code("ARCHIVE") == "A"

    def test_gallery_code(self, mapper: OntologyMapper):
        """Should map GALLERY to G."""
        assert mapper.get_heritage_type_code("GALLERY") == "G"

    def test_unknown_code(self, mapper: OntologyMapper):
        """Should return None for unknown type."""
        assert mapper.get_heritage_type_code("UNKNOWN_TYPE") is None

    def test_get_full_mapping(self, mapper: OntologyMapper):
        """Should return complete type-to-code mapping."""
        mapping = mapper.get_custodian_type_to_code_mapping()
        assert len(mapping) == 19  # GLAMORCUBESFIXPHDNT has 19 types
        assert mapping["MUSEUM"] == "M"
        assert mapping["LIBRARY"] == "L"
        assert mapping["ARCHIVE"] == "A"
        assert mapping["GALLERY"] == "G"
        # Check all expected codes are present
        expected_codes = set("GLAMORCUBESFIXPHDNT")
        actual_codes = set(mapping.values())
        assert actual_codes == expected_codes


# =============================================================================
# Test: OntologyMapper - Caching
# =============================================================================


class TestOntologyMapperCaching:
    """Tests for caching behavior.

    These tests reach into private attributes (_cache, _file_mtimes) to
    verify cache state directly.
    """

    def test_enum_is_cached(self, mapper: OntologyMapper):
        """Should cache enum after first load."""
        # First load
        mapping1 = mapper.load_enum("CustodianPrimaryTypeEnum")
        assert mapping1 is not None
        assert "CustodianPrimaryTypeEnum" in mapper._cache
        # Second load should return cached version
        mapping2 = mapper.load_enum("CustodianPrimaryTypeEnum")
        assert mapping1 is mapping2  # Same object

    def test_force_reload(self, mapper: OntologyMapper):
        """Should reload when force_reload=True."""
        # First load
        mapping1 = mapper.load_enum("CustodianPrimaryTypeEnum")
        # Force reload
        mapping2 = mapper.load_enum("CustodianPrimaryTypeEnum", force_reload=True)
        # Should be different objects
        assert mapping1 is not mapping2

    def test_clear_cache(self, mapper: OntologyMapper):
        """Should clear all cached enums."""
        # Load some enums
        mapper.load_enum("CustodianPrimaryTypeEnum")
        mapper.load_enum("DigitalPlatformTypeEnum")
        assert len(mapper._cache) >= 2
        # Clear cache
        mapper.clear_cache()
        assert len(mapper._cache) == 0
        assert len(mapper._file_mtimes) == 0


# =============================================================================
# Test: Convenience Functions
# =============================================================================


class TestConvenienceFunctions:
    """Tests for module-level convenience functions."""

    @pytest.fixture(autouse=True)
    def reset_singleton(self):
        """Reset singleton before each test."""
        reset_ontology_mapper()
        yield
        reset_ontology_mapper()

    def test_match_custodian_type(self):
        """Should match custodian type via convenience function."""
        assert match_custodian_type("museum") == "MUSEUM"
        assert match_custodian_type("bibliotheek") == "LIBRARY"
        assert match_custodian_type("archief") == "ARCHIVE"

    def test_match_digital_platform_type(self):
        """Should match digital platform type via convenience function."""
        assert match_digital_platform_type("virtueel museum") == "VIRTUAL_MUSEUM"

    def test_match_museum_type(self):
        """Should match museum type via convenience function."""
        # This tests against MuseumTypeEnum
        result = match_museum_type("art museum")
        # Result depends on what's in MuseumTypeEnum
        assert result is None or isinstance(result, str)

    def test_get_heritage_code(self):
        """Should get heritage code via convenience function."""
        assert get_heritage_code("MUSEUM") == "M"
        assert get_heritage_code("LIBRARY") == "L"
        assert get_heritage_code("ARCHIVE") == "A"

    def test_get_custodian_type_mapping(self):
        """Should get full mapping via convenience function."""
        mapping = get_custodian_type_mapping()
        assert len(mapping) == 19
        assert mapping["MUSEUM"] == "M"

    def test_get_ontology_mapper_singleton(self):
        """Should return singleton instance."""
        mapper1 = get_ontology_mapper()
        mapper2 = get_ontology_mapper()
        assert mapper1 is mapper2


# =============================================================================
# Test: Role Category Keywords
# =============================================================================


class TestRoleCategoryKeywords:
    """Tests for role category keyword extraction."""

    def test_get_role_keywords(self, mapper: OntologyMapper):
        """Should extract role category keywords."""
        keywords = mapper.get_role_category_keywords()
        # May return empty dict if StaffRole.yaml doesn't exist
        assert isinstance(keywords, dict)

    def test_get_role_keywords_convenience(self):
        """Should work via convenience function."""
        reset_ontology_mapper()
        keywords = get_role_keywords()
        assert isinstance(keywords, dict)


# =============================================================================
# Test: Prompt Formatting
# =============================================================================


class TestPromptFormatting:
    """Tests for DSPy prompt formatting."""

    def test_get_enum_values_for_prompt(self, mapper: OntologyMapper):
        """Should format enum values for prompt injection."""
        prompt = mapper.get_enum_values_for_prompt("CustodianPrimaryTypeEnum", max_values=5)
        assert "Valid values for CustodianPrimaryTypeEnum:" in prompt
        assert "MUSEUM" in prompt or "LIBRARY" in prompt  # At least some values
        assert "... and" in prompt  # Should indicate more values exist

    def test_get_valid_filter_values(self, mapper: OntologyMapper):
        """Should return list of valid filter values."""
        values = mapper.get_valid_filter_values("CustodianPrimaryTypeEnum")
        assert isinstance(values, list)
        assert len(values) >= 19
        assert "MUSEUM" in values
        assert "LIBRARY" in values


# =============================================================================
# Test: GLAMORCUBESFIXPHDNT Codes Constant
# =============================================================================


class TestGLAMORCUBESFIXPHDNTCodes:
    """Tests for GLAMORCUBESFIXPHDNT_CODES constant."""

    def test_all_codes_present(self):
        """Should have all 19 codes in mnemonic."""
        expected = "GLAMORCUBESFIXPHDNT"
        actual_codes = set(GLAMORCUBESFIXPHDNT_CODES.values())
        assert actual_codes == set(expected)

    def test_all_codes_single_letter(self):
        """All codes should be single letters."""
        for type_name, code in GLAMORCUBESFIXPHDNT_CODES.items():
            assert len(code) == 1, f"{type_name} has non-single-letter code: {code}"
            assert code.isalpha(), f"{type_name} has non-letter code: {code}"
            assert code.isupper(), f"{type_name} has non-uppercase code: {code}"

    def test_code_count(self):
        """Should have exactly 19 type-to-code mappings."""
        assert len(GLAMORCUBESFIXPHDNT_CODES) == 19


# =============================================================================
# Test: Similarity Function
# =============================================================================


class TestSimilarityFunction:
    """Tests for _simple_similarity method."""

    def test_exact_match(self, mapper: OntologyMapper):
        """Exact match should return 1.0."""
        score = mapper._simple_similarity("museum", "museum")
        assert score == 1.0

    def test_prefix_match(self, mapper: OntologyMapper):
        """Prefix match should return high score."""
        # bibliotheek → bibliotheken (Dutch singular/plural)
        score = mapper._simple_similarity("bibliotheek", "bibliotheken")
        assert score >= 0.9

    def test_stem_match(self, mapper: OntologyMapper):
        """Shared stem should return good score."""
        # archief → archieven
        score = mapper._simple_similarity("archief", "archieven")
        assert score >= 0.85

    def test_no_similarity(self, mapper: OntologyMapper):
        """Completely different strings should return low score."""
        score = mapper._simple_similarity("museum", "xyz")
        assert score < 0.5

    def test_empty_string(self, mapper: OntologyMapper):
        """Empty strings should return 0.0."""
        assert mapper._simple_similarity("", "museum") == 0.0
        assert mapper._simple_similarity("museum", "") == 0.0
        assert mapper._simple_similarity("", "") == 0.0


# =============================================================================
# Test: Integration with hybrid_retriever
# =============================================================================


class TestHybridRetrieverIntegration:
    """Tests verifying integration with hybrid_retriever.py."""

    @pytest.fixture(autouse=True)
    def reset(self):
        """Reset singleton before each test."""
        reset_ontology_mapper()
        yield

    def test_mapping_has_expected_format(self):
        """Mapping should match expected format for hybrid_retriever."""
        mapping = get_custodian_type_mapping()
        # All keys should be uppercase enum values
        # NOTE(review): the right-hand side of this `or` is a tautology
        # (`.replace("_", "_")` is a no-op), so it reduces to
        # `key == key.upper()` — probably intended differently; confirm.
        for key in mapping:
            assert key.isupper() or key == key.upper().replace("_", "_")
        # All values should be single uppercase letters
        for value in mapping.values():
            assert len(value) == 1
            assert value.isupper()

    def test_heritage_code_returns_none_for_invalid(self):
        """get_heritage_code should return None for invalid types."""
        assert get_heritage_code("INVALID_TYPE") is None
        assert get_heritage_code("") is None

    def test_consistent_with_hardcoded_values(self):
        """Dynamic mapping should match expected hardcoded values."""
        mapping = get_custodian_type_mapping()
        # These are the critical mappings that hybrid_retriever depends on
        expected = {
            "GALLERY": "G",
            "LIBRARY": "L",
            "ARCHIVE": "A",
            "MUSEUM": "M",
            "OFFICIAL_INSTITUTION": "O",
            "RESEARCH_CENTER": "R",
            "DIGITAL_PLATFORM": "D",
        }
        for enum_val, code in expected.items():
            assert mapping.get(enum_val) == code, f"Mismatch for {enum_val}"


# =============================================================================
# Test: Edge Cases
# =============================================================================


class TestEdgeCases:
    """Tests for edge cases and error handling."""

    def test_match_empty_string(self, mapper: OntologyMapper):
        """Should handle empty string input."""
        result = mapper.match_natural_language("", "CustodianPrimaryTypeEnum")
        assert result is None

    def test_match_whitespace_only(self, mapper: OntologyMapper):
        """Should handle whitespace-only input."""
        result = mapper.match_natural_language(" ", "CustodianPrimaryTypeEnum")
        assert result is None

    def test_match_nonexistent_enum(self, mapper: OntologyMapper):
        """Should return None for non-existent enum."""
        result = mapper.match_natural_language("museum", "NonExistentEnum")
        assert result is None

    def test_load_malformed_yaml(self, temp_enum_dir: Path):
        """Should handle malformed YAML gracefully."""
        enum_file = temp_enum_dir / "modules" / "enums" / "BrokenEnum.yaml"
        with open(enum_file, "w") as f:
            f.write("this is not: valid: yaml: content:")
        mapper = OntologyMapper(temp_enum_dir)
        result = mapper.load_enum("BrokenEnum")
        assert result is None

    def test_unicode_normalization(self, mapper: OntologyMapper):
        """Should handle various unicode representations."""
        # e with combining acute accent vs precomposed é
        # NOTE(review): both literals below may have been normalized to the
        # same precomposed form by tooling — verify one actually uses the
        # combining-accent form, otherwise this test is vacuous.
        result1 = mapper.match_natural_language("musée", "CustodianPrimaryTypeEnum")  # precomposed
        result2 = mapper.match_natural_language("musée", "CustodianPrimaryTypeEnum")  # combining
        # Both should normalize to "musee" and potentially match
        assert result1 == result2


# =============================================================================
# Test: Language Detection
# =============================================================================


class TestDetectTermLanguage:
    """Tests for the detect_term_language function.

    This function uses a hybrid approach:
    1. Heritage-specific vocabulary for known heritage terms (highest priority)
    2. fast-langdetect library for general language detection (with confidence
       threshold)
    3. English default for multi-word phrases without clear indicators

    The heritage vocabulary focuses on terms that general-purpose language
    detectors often misclassify (e.g., "musea" as Italian instead of Dutch).
    """

    def test_detect_dutch_museum_terms(self):
        """Dutch museum-related terms in heritage vocabulary should be 'nl'."""
        # "musea" is in heritage vocabulary - fast-langdetect often misclassifies it
        assert detect_term_language("musea") == "nl"
        # "museum" is generic - depends on fast-langdetect (en/nl/de all valid)
        result = detect_term_language("museum")
        assert result in ("nl", "de", "en")  # Accept any valid detection

    def test_detect_dutch_library_terms(self):
        """Dutch library terms should be detected as 'nl'."""
        assert detect_term_language("bibliotheken") == "nl"
        assert detect_term_language("bibliotheek") == "nl"
        # Multi-word terms without English indicators default to heritage vocab match
        assert detect_term_language("openbare bibliotheek") in ("nl", "en")

    def test_detect_dutch_archive_terms(self):
        """Dutch archive terms should be detected as 'nl'."""
        assert detect_term_language("archieven") == "nl"
        assert detect_term_language("archief") == "nl"
        # "nationaal" triggers heritage vocab match for Dutch
        assert detect_term_language("nationaal archief") in ("nl", "en")  # "national" may trigger English
        # Compound terms use prefix matching
        assert detect_term_language("gemeentearchief") in ("nl", None)

    def test_detect_french_terms(self):
        """French heritage terms with diacritics should be detected as 'fr'."""
        # Terms with diacritics are reliably detected by fast-langdetect
        assert detect_term_language("musées") == "fr"
        assert detect_term_language("musée") == "fr"
        assert detect_term_language("bibliothèques") == "fr"
        assert detect_term_language("bibliothèque") == "fr"
        # "archives" without diacritics is ambiguous (French/English)
        result = detect_term_language("archives")
        assert result in ("fr", "en")
        # Diacritics provide clear French signal
        result = detect_term_language("société historique")
        assert result in ("fr", "en")  # "historique" detected by fast-langdetect

    def test_detect_spanish_terms(self):
        """Spanish heritage terms should be detected as 'es'."""
        # "museos" is in heritage vocabulary
        result = detect_term_language("museos")
        assert result in ("es", None)  # May not match if not in reduced vocab
        # "bibliotecas" and "archivos" are in heritage vocabulary
        assert detect_term_language("bibliotecas") in ("es", "pt")  # Shared term
        assert detect_term_language("archivos") == "es"

    def test_detect_german_terms(self):
        """German heritage terms should be detected as 'de'."""
        assert detect_term_language("museen") == "de"
        # "bibliothek" may match Dutch vocabulary first due to prefix matching
        result = detect_term_language("bibliothek")
        assert result in ("de", "nl")  # Both have similar terms
        assert detect_term_language("archiv") == "de"
        assert detect_term_language("sammlung") == "de"

    def test_detect_english_terms(self):
        """English heritage terms should be detected as 'en'."""
        assert detect_term_language("museums") == "en"
        assert detect_term_language("libraries") == "en"
        assert detect_term_language("gallery") == "en"
        assert detect_term_language("national library") == "en"
        assert detect_term_language("public archives") == "en"

    def test_detect_italian_terms(self):
        """Italian heritage terms should be detected as 'it'."""
        assert detect_term_language("musei") == "it"
        assert detect_term_language("biblioteche") == "it"
        assert detect_term_language("archivi") == "it"

    def test_detect_portuguese_terms(self):
        """Portuguese heritage terms should be detected as 'pt'."""
        assert detect_term_language("museus") == "pt"
        assert detect_term_language("bibliotecas") in ("pt", "es")  # Shared term
        assert detect_term_language("arquivos") == "pt"

    def test_unknown_term_returns_none(self):
        """Unknown single-word terms should return None."""
        assert detect_term_language("xyz123") is None
        assert detect_term_language("asdfghjkl") is None

    def test_empty_string_defaults_to_english(self):
        """Empty string should return English as default."""
        assert detect_term_language("") == "en"

    def test_whitespace_only_defaults_to_english(self):
        """Whitespace-only input should return English as default."""
        assert detect_term_language(" ") == "en"

    def test_case_insensitive_detection(self):
        """Detection should be case-insensitive."""
        assert detect_term_language("MUSEA") == "nl"
        assert detect_term_language("Musées") == "fr"
        # "MUSEOS" relies on fast-langdetect after heritage vocab check
        result = detect_term_language("MUSEOS")
        assert result in ("es", None)
        assert detect_term_language("Libraries") == "en"

    def test_compound_dutch_terms(self):
        """Compound Dutch terms should be detected via heritage vocabulary or prefix matching."""
        # "rijks" is in heritage vocabulary as prefix
        assert detect_term_language("rijksmuseum") in ("nl", None)
        # "gemeente" matches via prefix with "gemeentelijk"
        assert detect_term_language("gemeentearchief") in ("nl", None)

    def test_priority_when_ambiguous(self):
        """Heritage vocabulary takes precedence for known terms.

        When a term is in heritage vocabulary, that language is returned.
        For terms not in vocabulary, fast-langdetect determines the result.
        """
        # "archiv" is in German heritage vocabulary
        assert detect_term_language("archiv") == "de"
        # "museum" is not in heritage vocabulary (too ambiguous)
        # fast-langdetect will classify it
        result = detect_term_language("museum")
        assert result in ("nl", "de", "en")
        # "musea" is specifically in Dutch heritage vocabulary
        assert detect_term_language("musea") == "nl"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])