glam/tests/test_transliteration.py
kempersc 891692a4d6 feat(ghcid): add diacritics normalization and transliteration scripts
- Add fix_ghcid_diacritics.py for normalizing non-ASCII in GHCIDs
- Add resolve_diacritics_collisions.py for collision handling
- Add transliterate_emic_names.py for non-Latin script handling
- Add transliteration tests
2025-12-08 14:59:28 +01:00

350 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Unit tests for transliteration functions.
Tests the scripts/transliterate_emic_names.py module for converting
non-Latin script institution names to Latin characters.
"""
import sys
from pathlib import Path
import pytest
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.transliterate_emic_names import (
detect_script,
transliterate,
transliterate_for_abbreviation,
transliterate_cyrillic,
transliterate_chinese,
transliterate_japanese,
transliterate_korean,
transliterate_arabic,
transliterate_hebrew,
transliterate_greek,
transliterate_devanagari,
transliterate_armenian,
transliterate_georgian,
transliterate_thai,
transliterate_sinhala,
transliterate_khmer,
)
class TestScriptDetection:
    """Verify that ``detect_script`` returns the expected script label.

    Each test feeds one or more sample strings written in a single script
    and checks the lowercase label returned for it.
    """

    def test_detect_latin(self):
        for sample in ("Hello World", "Rijksmuseum Amsterdam"):
            assert detect_script(sample) == "latin"

    def test_detect_cyrillic(self):
        for sample in ("Институт", "Музей"):
            assert detect_script(sample) == "cyrillic"

    def test_detect_chinese(self):
        for sample in ("故宮博物院", "中国国家图书馆"):
            assert detect_script(sample) == "chinese"

    def test_detect_japanese(self):
        # One hiragana sample followed by one katakana sample.
        for sample in ("こんにちは", "カタカナ"):
            assert detect_script(sample) == "japanese"

    def test_detect_korean(self):
        detected = detect_script("국립중앙박물관")
        assert detected == "korean"

    def test_detect_arabic(self):
        detected = detect_script("المكتبة الوطنية")
        assert detected == "arabic"

    def test_detect_hebrew(self):
        detected = detect_script("ארכיון")
        assert detected == "hebrew"

    def test_detect_greek(self):
        detected = detect_script("Μουσείο")
        assert detected == "greek"

    def test_detect_devanagari(self):
        detected = detect_script("राजस्थान")
        assert detected == "devanagari"

    def test_detect_thai(self):
        for sample in ("สำนักหอจดหมายเหตุ", "กรุงเทพ"):
            assert detect_script(sample) == "thai"

    def test_detect_sinhala(self):
        for sample in ("පේරාදෙණිය", "ජාතික කෞතුකාගාර"):
            assert detect_script(sample) == "sinhala"

    def test_detect_khmer(self):
        for sample in ("សារមន្ទីរ", "ភ្នំពេញ"):
            assert detect_script(sample) == "khmer"
class TestCyrillicTransliteration:
    """Exercise ``transliterate_cyrillic`` on Russian and Ukrainian input."""

    def test_russian_basic(self):
        romanized = transliterate_cyrillic("Музей", "ru")
        assert romanized == "Muzey"

    def test_russian_institute(self):
        romanized = transliterate_cyrillic("Институт восточных рукописей РАН", "ru")
        for fragment in ("Institut", "vostochnykh"):
            assert fragment in romanized

    def test_russian_hard_soft_signs(self):
        # Neither the hard sign nor the soft sign may survive in the output.
        # NOTE(review): the sample word only contains the hard sign, so the
        # soft-sign check cannot fail for this input - consider adding a
        # sample that actually contains a soft sign.
        romanized = transliterate_cyrillic("объект", "ru")
        for sign in ("ъ", "ь"):
            assert sign not in romanized

    def test_ukrainian(self):
        romanized = transliterate_cyrillic("Київ", "uk")
        # Ukrainian-specific letters must be handled; only the presence of a
        # Latin "K" (either case) is checked here.
        assert any(letter in romanized for letter in ("K", "k"))
class TestChineseTransliteration:
    """Exercise ``transliterate_chinese`` (Hanzi to Pinyin)."""

    def test_museum_vocabulary(self):
        lowered = transliterate_chinese("博物館").lower()
        # Either of the two accepted readings satisfies the test.
        assert "bo" in lowered or "haku" in lowered

    def test_national_palace_museum(self):
        source = "故宮博物院"
        romanized = transliterate_chinese(source)
        # The output must be non-empty and must differ from the input,
        # i.e. some transliteration has taken place.
        assert len(romanized) > 0
        assert romanized != source

    def test_dongba_museum(self):
        lowered = transliterate_chinese("东巴文化博物院").lower()
        for syllable in ("dong", "wen"):
            assert syllable in lowered
class TestJapaneseTransliteration:
    """Exercise ``transliterate_japanese`` (Kanji/Kana to Romaji)."""

    def test_national_museum(self):
        lowered = transliterate_japanese("国立博物館").lower()
        for syllable in ("koku", "ritsu"):
            assert syllable in lowered

    def test_tokyo_national_museum(self):
        lowered = transliterate_japanese("東京国立博物館").lower()
        # Both the long-vowel and the short spelling are accepted.
        assert "tou" in lowered or "to" in lowered
        assert "kyou" in lowered or "kyo" in lowered

    def test_hiragana(self):
        romanized = transliterate_japanese("あいうえお")
        assert romanized == "aiueo"

    def test_katakana(self):
        romanized = transliterate_japanese("アイウエオ")
        assert romanized == "aiueo"
class TestKoreanTransliteration:
    """Exercise ``transliterate_korean`` (Hangul to Revised Romanization)."""

    def test_national_museum(self):
        romanized = transliterate_korean("국립중앙박물관")
        # The output must be non-empty and contain a romanized first syllable
        # in either of the two accepted spellings.
        assert len(romanized) > 0
        lowered = romanized.lower()
        assert "guk" in lowered or "kuk" in lowered

    def test_simple_hangul(self):
        lowered = transliterate_korean("한글").lower()
        assert "han" in lowered
class TestArabicTransliteration:
    """Exercise ``transliterate_arabic`` on Arabic-script input."""

    def test_national_library(self):
        lowered = transliterate_arabic("المكتبة الوطنية").lower()
        # Accept both the consonant-only and the vowelled rendering.
        assert "mktb" in lowered or "maktab" in lowered

    def test_basic_letters(self):
        lowered = transliterate_arabic("كتاب").lower()
        for consonant in ("k", "t"):
            assert consonant in lowered
class TestHebrewTransliteration:
    """Exercise ``transliterate_hebrew`` on Hebrew-script input."""

    def test_archive(self):
        romanized = transliterate_hebrew("ארכיון")
        # Only checks that some transliterated output is produced.
        assert len(romanized) > 0

    def test_basic_letters(self):
        lowered = transliterate_hebrew("שלום").lower()
        assert "sh" in lowered
class TestGreekTransliteration:
    """Exercise ``transliterate_greek`` on Greek-script input."""

    def test_museum(self):
        romanized = transliterate_greek("Μουσείο")
        # Two spellings are accepted for the "ου" digraph.
        assert any(spelling in romanized for spelling in ("Moyseio", "Mouseio"))

    def test_archaeological(self):
        romanized = transliterate_greek("Αρχαιολογικό")
        assert "Archaiologiko" in romanized
class TestDevanagariTransliteration:
    """Tests for Devanagari (Hindi/Nepali) transliteration."""

    def test_rajasthan(self):
        result = transliterate_devanagari("राजस्थान")
        # The long vowel may be rendered as "aa", so both "raaj" and "raj"
        # are accepted.
        lowered = result.lower()
        assert "raaj" in lowered or "raj" in lowered

    def test_basic_consonants(self):
        # Input is the single consonant KA (U+0915). The previous version
        # passed an empty string, which cannot meaningfully exercise
        # consonant handling.
        result = transliterate_devanagari("क")
        assert "k" in result.lower()
class TestThaiTransliteration:
    """Exercise ``transliterate_thai`` (Thai script, RTGS romanization)."""

    def test_national_archives(self):
        # สำนักหอจดหมายเหตุแห่งชาติ = National Archives of Thailand
        lowered = transliterate_thai("สำนักหอจดหมายเหตุแห่งชาติ").lower()
        for fragment in ("samnak", "haeng chat"):
            assert fragment in lowered

    def test_national_library(self):
        # สำนักหอสมุดแห่งชาติ = National Library of Thailand
        lowered = transliterate_thai("สำนักหอสมุดแห่งชาติ").lower()
        assert "ho samut" in lowered

    def test_national_museum(self):
        # พิพิธภัณฑสถานแห่งชาติ พระนคร = Bangkok National Museum
        lowered = transliterate_thai("พิพิธภัณฑสถานแห่งชาติ พระนคร").lower()
        for fragment in ("phiphitthaphan", "phra nakhon"):
            assert fragment in lowered

    def test_siam_society(self):
        # สยามสมาคมในพระบรมราชูปถัมภ์ = Siam Society
        lowered = transliterate_thai("สยามสมาคมในพระบรมราชูปถัมภ์").lower()
        for fragment in ("sayam", "samakhom"):
            assert fragment in lowered

    def test_wat_temple(self):
        # วัดโพธิ์ราม = Wat Pho Ram
        lowered = transliterate_thai("วัดโพธิ์ราม").lower()
        for fragment in ("wat", "pho", "ram"):
            assert fragment in lowered

    def test_empty_without_library(self):
        # Even when pythainlp is not installed the function is expected to
        # return a transliteration (e.g. 'krung thep' from a vocabulary
        # lookup) rather than an empty string.
        romanized = transliterate_thai("กรุงเทพ")
        assert len(romanized) > 0
class TestSinhalaTransliteration:
    """Tests for Sinhala script transliteration (ISO 15919)."""

    def test_university_peradeniya(self):
        # පේරාදෙණිය විශ්වවිද් යාලය = University of Peradeniya
        result = transliterate_sinhala("පේරාදෙණිය විශ්වවිද් යාලය")
        lowered = result.lower()
        assert "peradeniya" in lowered
        assert "vishvavid" in lowered

    def test_national_museums(self):
        # ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව = Department of National Museums
        result = transliterate_sinhala("ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව")
        lowered = result.lower()
        assert "jathika" in lowered
        assert "kauthukagara" in lowered

    def test_basic_consonants(self):
        # Input is the single consonant KA (U+0D9A). The previous version
        # passed an empty string, so the "k" expectation could not be
        # satisfied by actual consonant handling.
        result = transliterate_sinhala("ක")  # ka
        assert "k" in result.lower()

    def test_output_not_empty(self):
        # Sinhala input should never produce an empty string.
        result = transliterate_sinhala("කොළඹ")  # Colombo
        assert len(result) > 0
class TestKhmerTransliteration:
    """Exercise ``transliterate_khmer`` (Khmer script, UNGEGN romanization)."""

    def test_tuol_sleng(self):
        # សារមន្ទីរទួលស្លែង = Tuol Sleng Genocide Museum
        lowered = transliterate_khmer("សារមន្ទីរទួលស្លែង").lower()
        assert "tuol sleng" in lowered

    def test_phnom_penh(self):
        # ភ្នំពេញ = Phnom Penh
        lowered = transliterate_khmer("ភ្នំពេញ").lower()
        assert "phnom penh" in lowered

    def test_angkor(self):
        # អង្គរ = Angkor
        lowered = transliterate_khmer("អង្គរ").lower()
        assert "angkor" in lowered

    def test_output_not_empty(self):
        # Khmer input should never produce an empty string.
        romanized = transliterate_khmer("សារមន្ទីរ")
        assert len(romanized) > 0
class TestTransliterateForAbbreviation:
    """Exercise ``transliterate_for_abbreviation``, the abbreviation entry point."""

    def test_russian_cleanup(self):
        cleaned = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru")
        # Expect clean Latin text: either pure ASCII, or only alphanumerics
        # plus space, hyphen and apostrophe.
        # NOTE(review): str.isalnum() is also True for non-Latin letters, so
        # the fallback clause would accept untransliterated text - confirm
        # whether that leniency is intended.
        allowed_punctuation = " -'"
        only_word_chars = all(
            ch.isalnum() or ch in allowed_punctuation for ch in cleaned
        )
        assert cleaned.isascii() or only_word_chars

    def test_chinese_cleanup(self):
        cleaned = transliterate_for_abbreviation("东巴文化博物院", "zh")
        # Either clean Latin text or an explicit "[REQUIRES..." warning marker.
        assert cleaned.isascii() or "[REQUIRES" in cleaned

    def test_korean_cleanup(self):
        cleaned = transliterate_for_abbreviation("국립중앙박물관", "ko")
        assert cleaned.isascii()

    def test_special_characters_removed(self):
        # Punctuation such as "&" and "(" must be stripped for abbreviations.
        cleaned = transliterate_for_abbreviation("Test (Museum) & Gallery", "en")
        for symbol in ("&", "("):
            assert symbol not in cleaned
class TestIntegration:
    """Integration tests using the main transliterate function."""

    def test_auto_detect_russian(self):
        # No language given: the script has to be detected automatically.
        result = transliterate("Музей")
        assert result.isascii()

    def test_auto_detect_korean(self):
        # No language given: the script has to be detected automatically.
        result = transliterate("박물관")
        assert result.isascii()

    def test_latin_passthrough(self):
        # Text that is already Latin must come back unchanged.
        result = transliterate("Rijksmuseum Amsterdam")
        assert result == "Rijksmuseum Amsterdam"

    def test_with_explicit_language(self):
        result = transliterate("故宮博物院", lang="zh")
        assert len(result) > 0
        # The output should not still contain the original Chinese text,
        # unless it is an explicit "[REQUIRES..." warning marker.
        # The previous check used `"" not in result`, which is always False
        # because the empty string is a substring of every string; check the
        # first source character instead.
        assert "故" not in result or "[REQUIRES" in result
# Allow running this test module directly as a script (verbose output).
if __name__ == "__main__":
    pytest.main(args=[__file__, "-v"])