""" Unit tests for transliteration functions. Tests the scripts/transliterate_emic_names.py module for converting non-Latin script institution names to Latin characters. """ import sys from pathlib import Path import pytest # Add project root to path sys.path.insert(0, str(Path(__file__).parent.parent)) from scripts.transliterate_emic_names import ( detect_script, transliterate, transliterate_for_abbreviation, transliterate_cyrillic, transliterate_chinese, transliterate_japanese, transliterate_korean, transliterate_arabic, transliterate_hebrew, transliterate_greek, transliterate_devanagari, transliterate_armenian, transliterate_georgian, transliterate_thai, transliterate_sinhala, transliterate_khmer, ) class TestScriptDetection: """Tests for script detection function.""" def test_detect_latin(self): assert detect_script("Hello World") == "latin" assert detect_script("Rijksmuseum Amsterdam") == "latin" def test_detect_cyrillic(self): assert detect_script("Институт") == "cyrillic" assert detect_script("Музей") == "cyrillic" def test_detect_chinese(self): assert detect_script("故宮博物院") == "chinese" assert detect_script("中国国家图书馆") == "chinese" def test_detect_japanese(self): # Japanese with hiragana or katakana assert detect_script("こんにちは") == "japanese" assert detect_script("カタカナ") == "japanese" def test_detect_korean(self): assert detect_script("국립중앙박물관") == "korean" def test_detect_arabic(self): assert detect_script("المكتبة الوطنية") == "arabic" def test_detect_hebrew(self): assert detect_script("ארכיון") == "hebrew" def test_detect_greek(self): assert detect_script("Μουσείο") == "greek" def test_detect_devanagari(self): assert detect_script("राजस्थान") == "devanagari" def test_detect_thai(self): assert detect_script("สำนักหอจดหมายเหตุ") == "thai" assert detect_script("กรุงเทพ") == "thai" def test_detect_sinhala(self): assert detect_script("පේරාදෙණිය") == "sinhala" assert detect_script("ජාතික කෞතුකාගාර") == "sinhala" def test_detect_khmer(self): assert detect_script("សារមន្ទីរ") == "khmer" assert detect_script("ភ្នំពេញ") == "khmer" class TestCyrillicTransliteration: """Tests for Cyrillic (Russian/Ukrainian/etc.) transliteration.""" def test_russian_basic(self): result = transliterate_cyrillic("Музей", "ru") assert result == "Muzey" def test_russian_institute(self): result = transliterate_cyrillic("Институт восточных рукописей РАН", "ru") assert "Institut" in result assert "vostochnykh" in result def test_russian_hard_soft_signs(self): # Hard and soft signs should be removed result = transliterate_cyrillic("объект", "ru") assert "ъ" not in result assert "ь" not in result def test_ukrainian(self): result = transliterate_cyrillic("Київ", "uk") # Should handle Ukrainian-specific letters assert "K" in result or "k" in result class TestChineseTransliteration: """Tests for Chinese (Hanzi to Pinyin) transliteration.""" def test_museum_vocabulary(self): result = transliterate_chinese("博物館") assert "bo" in result.lower() or "haku" in result.lower() def test_national_palace_museum(self): result = transliterate_chinese("故宮博物院") # Should contain pinyin for these characters assert len(result) > 0 assert result != "故宮博物院" # Should be transliterated def test_dongba_museum(self): result = transliterate_chinese("东巴文化博物院") assert "dong" in result.lower() assert "wen" in result.lower() class TestJapaneseTransliteration: """Tests for Japanese (Kanji/Kana to Romaji) transliteration.""" def test_national_museum(self): result = transliterate_japanese("国立博物館") assert "koku" in result.lower() assert "ritsu" in result.lower() def test_tokyo_national_museum(self): result = transliterate_japanese("東京国立博物館") assert "tou" in result.lower() or "to" in result.lower() assert "kyou" in result.lower() or "kyo" in result.lower() def test_hiragana(self): result = transliterate_japanese("あいうえお") assert result == "aiueo" def test_katakana(self): result = transliterate_japanese("アイウエオ") assert result == "aiueo" class TestKoreanTransliteration: """Tests for Korean (Hangul to Revised Romanization) transliteration.""" def test_national_museum(self): result = transliterate_korean("국립중앙박물관") # Should contain romanized syllables assert len(result) > 0 assert "guk" in result.lower() or "kuk" in result.lower() def test_simple_hangul(self): result = transliterate_korean("한글") assert "han" in result.lower() class TestArabicTransliteration: """Tests for Arabic script transliteration.""" def test_national_library(self): result = transliterate_arabic("المكتبة الوطنية") assert "mktb" in result.lower() or "maktab" in result.lower() def test_basic_letters(self): result = transliterate_arabic("كتاب") assert "k" in result.lower() assert "t" in result.lower() class TestHebrewTransliteration: """Tests for Hebrew script transliteration.""" def test_archive(self): result = transliterate_hebrew("ארכיון") # Should contain transliterated letters assert len(result) > 0 def test_basic_letters(self): result = transliterate_hebrew("שלום") assert "sh" in result.lower() class TestGreekTransliteration: """Tests for Greek script transliteration.""" def test_museum(self): result = transliterate_greek("Μουσείο") assert "Moyseio" in result or "Mouseio" in result def test_archaeological(self): result = transliterate_greek("Αρχαιολογικό") assert "Archaiologiko" in result class TestDevanagariTransliteration: """Tests for Devanagari (Hindi/Nepali) transliteration.""" def test_rajasthan(self): result = transliterate_devanagari("राजस्थान") # ISO 15919 uses "aa" for long vowels, so "raaj" not "raj" assert "raaj" in result.lower() or "raj" in result.lower() def test_basic_consonants(self): result = transliterate_devanagari("क") assert "k" in result.lower() class TestThaiTransliteration: """Tests for Thai script transliteration (RTGS).""" def test_national_archives(self): # สำนักหอจดหมายเหตุแห่งชาติ = National Archives of Thailand result = transliterate_thai("สำนักหอจดหมายเหตุแห่งชาติ") assert "samnak" in result.lower() assert "haeng chat" in result.lower() def test_national_library(self): # สำนักหอสมุดแห่งชาติ = National Library of Thailand result = transliterate_thai("สำนักหอสมุดแห่งชาติ") assert "ho samut" in result.lower() def test_national_museum(self): # พิพิธภัณฑสถานแห่งชาติ พระนคร = Bangkok National Museum result = transliterate_thai("พิพิธภัณฑสถานแห่งชาติ พระนคร") assert "phiphitthaphan" in result.lower() assert "phra nakhon" in result.lower() def test_siam_society(self): # สยามสมาคมในพระบรมราชูปถัมภ์ = Siam Society result = transliterate_thai("สยามสมาคมในพระบรมราชูปถัมภ์") assert "sayam" in result.lower() assert "samakhom" in result.lower() def test_wat_temple(self): # วัดโพธิ์ราม = Wat Pho Ram result = transliterate_thai("วัดโพธิ์ราม") assert "wat" in result.lower() assert "pho" in result.lower() assert "ram" in result.lower() def test_empty_without_library(self): # Even without pythainlp, should return transliterated result (not empty) result = transliterate_thai("กรุงเทพ") # Should get 'krung thep' from vocabulary lookup assert len(result) > 0 class TestSinhalaTransliteration: """Tests for Sinhala script transliteration (ISO 15919).""" def test_university_peradeniya(self): # පේරාදෙණිය විශ්වවිද් යාලය = University of Peradeniya result = transliterate_sinhala("පේරාදෙණිය විශ්වවිද් යාලය") assert "peradeniya" in result.lower() assert "vishvavid" in result.lower() def test_national_museums(self): # ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව = Department of National Museums result = transliterate_sinhala("ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව") assert "jathika" in result.lower() assert "kauthukagara" in result.lower() def test_basic_consonants(self): # Basic consonant test result = transliterate_sinhala("ක") # ka assert "k" in result.lower() def test_output_not_empty(self): # Sinhala should never return empty string result = transliterate_sinhala("කොළඹ") # Colombo assert len(result) > 0 class TestKhmerTransliteration: """Tests for Khmer script transliteration (UNGEGN).""" def test_tuol_sleng(self): # សារមន្ទីរទួលស្លែង = Tuol Sleng Genocide Museum result = transliterate_khmer("សារមន្ទីរទួលស្លែង") assert "tuol sleng" in result.lower() def test_phnom_penh(self): # ភ្នំពេញ = Phnom Penh result = transliterate_khmer("ភ្នំពេញ") assert "phnom penh" in result.lower() def test_angkor(self): # អង្គរ = Angkor result = transliterate_khmer("អង្គរ") assert "angkor" in result.lower() def test_output_not_empty(self): # Khmer should never return empty string result = transliterate_khmer("សារមន្ទីរ") assert len(result) > 0 class TestTransliterateForAbbreviation: """Tests for the main abbreviation function.""" def test_russian_cleanup(self): result = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru") # Should be clean Latin text assert result.isascii() or all(c.isalnum() or c in " -'" for c in result) def test_chinese_cleanup(self): result = transliterate_for_abbreviation("东巴文化博物院", "zh") # Should be clean Latin text or warning assert result.isascii() or "[REQUIRES" in result def test_korean_cleanup(self): result = transliterate_for_abbreviation("국립중앙박물관", "ko") assert result.isascii() def test_special_characters_removed(self): # Special characters should be removed for abbreviation result = transliterate_for_abbreviation("Test (Museum) & Gallery", "en") assert "&" not in result assert "(" not in result class TestIntegration: """Integration tests using the main transliterate function.""" def test_auto_detect_russian(self): result = transliterate("Музей") assert result.isascii() def test_auto_detect_korean(self): result = transliterate("박물관") assert result.isascii() def test_latin_passthrough(self): result = transliterate("Rijksmuseum Amsterdam") assert result == "Rijksmuseum Amsterdam" def test_with_explicit_language(self): result = transliterate("故宮博物院", lang="zh") assert len(result) > 0 # Should not be original Chinese assert "故" not in result or "[REQUIRES" in result if __name__ == "__main__": pytest.main([__file__, "-v"])