- Add fix_ghcid_diacritics.py for normalizing non-ASCII in GHCIDs - Add resolve_diacritics_collisions.py for collision handling - Add transliterate_emic_names.py for non-Latin script handling - Add transliteration tests
350 lines
13 KiB
Python
350 lines
13 KiB
Python
"""
|
||
Unit tests for transliteration functions.

Tests the scripts/transliterate_emic_names.py module for converting
non-Latin script institution names to Latin characters.
|
||
"""
|
||
|
||
import sys
|
||
from pathlib import Path
|
||
import pytest
|
||
|
||
# Add project root to path
|
||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
|
||
from scripts.transliterate_emic_names import (
|
||
detect_script,
|
||
transliterate,
|
||
transliterate_for_abbreviation,
|
||
transliterate_cyrillic,
|
||
transliterate_chinese,
|
||
transliterate_japanese,
|
||
transliterate_korean,
|
||
transliterate_arabic,
|
||
transliterate_hebrew,
|
||
transliterate_greek,
|
||
transliterate_devanagari,
|
||
transliterate_armenian,
|
||
transliterate_georgian,
|
||
transliterate_thai,
|
||
transliterate_sinhala,
|
||
transliterate_khmer,
|
||
)
|
||
|
||
|
||
class TestScriptDetection:
    """Check that ``detect_script`` returns the expected script label for sample text."""

    def test_detect_latin(self):
        for sample in ("Hello World", "Rijksmuseum Amsterdam"):
            assert detect_script(sample) == "latin"

    def test_detect_cyrillic(self):
        for sample in ("Институт", "Музей"):
            assert detect_script(sample) == "cyrillic"

    def test_detect_chinese(self):
        for sample in ("故宮博物院", "中国国家图书馆"):
            assert detect_script(sample) == "chinese"

    def test_detect_japanese(self):
        # One hiragana-only sample and one katakana-only sample.
        for sample in ("こんにちは", "カタカナ"):
            assert detect_script(sample) == "japanese"

    def test_detect_korean(self):
        detected = detect_script("국립중앙박물관")
        assert detected == "korean"

    def test_detect_arabic(self):
        detected = detect_script("المكتبة الوطنية")
        assert detected == "arabic"

    def test_detect_hebrew(self):
        detected = detect_script("ארכיון")
        assert detected == "hebrew"

    def test_detect_greek(self):
        detected = detect_script("Μουσείο")
        assert detected == "greek"

    def test_detect_devanagari(self):
        detected = detect_script("राजस्थान")
        assert detected == "devanagari"

    def test_detect_thai(self):
        for sample in ("สำนักหอจดหมายเหตุ", "กรุงเทพ"):
            assert detect_script(sample) == "thai"

    def test_detect_sinhala(self):
        for sample in ("පේරාදෙණිය", "ජාතික කෞතුකාගාර"):
            assert detect_script(sample) == "sinhala"

    def test_detect_khmer(self):
        for sample in ("សារមន្ទីរ", "ភ្នំពេញ"):
            assert detect_script(sample) == "khmer"
|
||
|
||
|
||
class TestCyrillicTransliteration:
    """Tests for Cyrillic (Russian/Ukrainian/etc.) transliteration."""

    def test_russian_basic(self):
        result = transliterate_cyrillic("Музей", "ru")
        assert result == "Muzey"

    def test_russian_institute(self):
        result = transliterate_cyrillic("Институт восточных рукописей РАН", "ru")
        assert "Institut" in result
        assert "vostochnykh" in result

    def test_russian_hard_soft_signs(self):
        # Hard and soft signs should be removed. "объект" contains only the
        # hard sign, so "область" is included to make the soft-sign
        # assertion exercise real input instead of passing vacuously.
        for word in ("объект", "область"):
            result = transliterate_cyrillic(word, "ru")
            assert "ъ" not in result
            assert "ь" not in result

    def test_ukrainian(self):
        result = transliterate_cyrillic("Київ", "uk")
        # Only checks that the leading letter is romanized to "k" in either
        # case; the Ukrainian-specific letters are not asserted individually.
        assert "k" in result.lower()
|
||
|
||
|
||
class TestChineseTransliteration:
    """Exercise ``transliterate_chinese`` on Hanzi institution names."""

    def test_museum_vocabulary(self):
        romanized = transliterate_chinese("博物館").lower()
        # Either reading of the first character is accepted.
        assert any(reading in romanized for reading in ("bo", "haku"))

    def test_national_palace_museum(self):
        source = "故宮博物院"
        romanized = transliterate_chinese(source)
        # The output must be non-empty and must differ from the Hanzi input.
        assert len(romanized) > 0
        assert romanized != source

    def test_dongba_museum(self):
        romanized = transliterate_chinese("东巴文化博物院").lower()
        for syllable in ("dong", "wen"):
            assert syllable in romanized
|
||
|
||
|
||
class TestJapaneseTransliteration:
    """Exercise ``transliterate_japanese`` on kanji and kana input."""

    def test_national_museum(self):
        romaji = transliterate_japanese("国立博物館").lower()
        for reading in ("koku", "ritsu"):
            assert reading in romaji

    def test_tokyo_national_museum(self):
        romaji = transliterate_japanese("東京国立博物館").lower()
        # Both the long-vowel and the shortened spellings are accepted.
        assert any(form in romaji for form in ("tou", "to"))
        assert any(form in romaji for form in ("kyou", "kyo"))

    def test_hiragana(self):
        assert transliterate_japanese("あいうえお") == "aiueo"

    def test_katakana(self):
        assert transliterate_japanese("アイウエオ") == "aiueo"
|
||
|
||
|
||
class TestKoreanTransliteration:
    """Exercise ``transliterate_korean`` on Hangul input."""

    def test_national_museum(self):
        romanized = transliterate_korean("국립중앙박물관")
        assert len(romanized) > 0
        lowered = romanized.lower()
        # The first syllable may be romanized with either "g" or "k".
        assert any(syllable in lowered for syllable in ("guk", "kuk"))

    def test_simple_hangul(self):
        lowered = transliterate_korean("한글").lower()
        assert "han" in lowered
|
||
|
||
|
||
class TestArabicTransliteration:
    """Exercise ``transliterate_arabic`` on Arabic-script input."""

    def test_national_library(self):
        lowered = transliterate_arabic("المكتبة الوطنية").lower()
        # Accept output with or without the short vowels written out.
        assert any(stem in lowered for stem in ("mktb", "maktab"))

    def test_basic_letters(self):
        lowered = transliterate_arabic("كتاب").lower()
        for letter in ("k", "t"):
            assert letter in lowered
|
||
|
||
|
||
class TestHebrewTransliteration:
    """Exercise ``transliterate_hebrew`` on Hebrew-script input."""

    def test_archive(self):
        romanized = transliterate_hebrew("ארכיון")
        # Only non-emptiness is checked for this word.
        assert len(romanized) > 0

    def test_basic_letters(self):
        lowered = transliterate_hebrew("שלום").lower()
        assert "sh" in lowered
|
||
|
||
|
||
class TestGreekTransliteration:
    """Exercise ``transliterate_greek`` on Greek-script input."""

    def test_museum(self):
        romanized = transliterate_greek("Μουσείο")
        # Upsilon may be rendered as either "y" or "u".
        assert any(form in romanized for form in ("Moyseio", "Mouseio"))

    def test_archaeological(self):
        romanized = transliterate_greek("Αρχαιολογικό")
        assert "Archaiologiko" in romanized
|
||
|
||
|
||
class TestDevanagariTransliteration:
    """Exercise ``transliterate_devanagari`` on Devanagari input."""

    def test_rajasthan(self):
        lowered = transliterate_devanagari("राजस्थान").lower()
        # The long vowel may come out doubled ("raaj") or plain ("raj").
        assert any(form in lowered for form in ("raaj", "raj"))

    def test_basic_consonants(self):
        lowered = transliterate_devanagari("क").lower()
        assert "k" in lowered
|
||
|
||
|
||
class TestThaiTransliteration:
    """Exercise ``transliterate_thai`` (RTGS) on Thai institution names."""

    def test_national_archives(self):
        # National Archives of Thailand
        lowered = transliterate_thai("สำนักหอจดหมายเหตุแห่งชาติ").lower()
        for fragment in ("samnak", "haeng chat"):
            assert fragment in lowered

    def test_national_library(self):
        # National Library of Thailand
        lowered = transliterate_thai("สำนักหอสมุดแห่งชาติ").lower()
        assert "ho samut" in lowered

    def test_national_museum(self):
        # Bangkok National Museum
        lowered = transliterate_thai("พิพิธภัณฑสถานแห่งชาติ พระนคร").lower()
        for fragment in ("phiphitthaphan", "phra nakhon"):
            assert fragment in lowered

    def test_siam_society(self):
        # Siam Society
        lowered = transliterate_thai("สยามสมาคมในพระบรมราชูปถัมภ์").lower()
        for fragment in ("sayam", "samakhom"):
            assert fragment in lowered

    def test_wat_temple(self):
        # Wat Pho Ram
        lowered = transliterate_thai("วัดโพธิ์ราม").lower()
        for fragment in ("wat", "pho", "ram"):
            assert fragment in lowered

    def test_empty_without_library(self):
        # Per the original note, a result is expected even when pythainlp is
        # not installed (via a vocabulary lookup); only non-emptiness is
        # asserted here.
        romanized = transliterate_thai("กรุงเทพ")
        assert len(romanized) > 0
|
||
|
||
|
||
class TestSinhalaTransliteration:
    """Exercise ``transliterate_sinhala`` on Sinhala institution names."""

    def test_university_peradeniya(self):
        # University of Peradeniya
        lowered = transliterate_sinhala("පේරාදෙණිය විශ්වවිද් යාලය").lower()
        for fragment in ("peradeniya", "vishvavid"):
            assert fragment in lowered

    def test_national_museums(self):
        # Department of National Museums
        lowered = transliterate_sinhala("ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව").lower()
        for fragment in ("jathika", "kauthukagara"):
            assert fragment in lowered

    def test_basic_consonants(self):
        # Single consonant "ka".
        lowered = transliterate_sinhala("ක").lower()
        assert "k" in lowered

    def test_output_not_empty(self):
        # Colombo; the transliteration must not come back empty.
        romanized = transliterate_sinhala("කොළඹ")
        assert len(romanized) > 0
|
||
|
||
|
||
class TestKhmerTransliteration:
    """Exercise ``transliterate_khmer`` on Khmer place and institution names."""

    def test_tuol_sleng(self):
        # Tuol Sleng Genocide Museum
        lowered = transliterate_khmer("សារមន្ទីរទួលស្លែង").lower()
        assert "tuol sleng" in lowered

    def test_phnom_penh(self):
        # Phnom Penh
        lowered = transliterate_khmer("ភ្នំពេញ").lower()
        assert "phnom penh" in lowered

    def test_angkor(self):
        # Angkor
        lowered = transliterate_khmer("អង្គរ").lower()
        assert "angkor" in lowered

    def test_output_not_empty(self):
        # The transliteration must not come back empty.
        romanized = transliterate_khmer("សារមន្ទីរ")
        assert len(romanized) > 0
|
||
|
||
|
||
class TestTransliterateForAbbreviation:
    """Tests for the main abbreviation function."""

    def test_russian_cleanup(self):
        result = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru")
        # Should be clean Latin text. The previous fallback used
        # str.isalnum(), which is also true for Cyrillic letters, so
        # untransliterated output would have passed; require ASCII instead,
        # as the Korean case below does.
        assert result.isascii()

    def test_chinese_cleanup(self):
        result = transliterate_for_abbreviation("东巴文化博物院", "zh")
        # Should be clean Latin text or a "[REQUIRES..." warning marker.
        assert result.isascii() or "[REQUIRES" in result

    def test_korean_cleanup(self):
        result = transliterate_for_abbreviation("국립중앙박물관", "ko")
        assert result.isascii()

    def test_special_characters_removed(self):
        # Special characters should be removed for abbreviation; the closing
        # parenthesis is checked alongside the opening one.
        result = transliterate_for_abbreviation("Test (Museum) & Gallery", "en")
        for symbol in ("&", "(", ")"):
            assert symbol not in result
|
||
|
||
|
||
class TestIntegration:
    """End-to-end checks through the top-level ``transliterate`` function."""

    def test_auto_detect_russian(self):
        assert transliterate("Музей").isascii()

    def test_auto_detect_korean(self):
        assert transliterate("박물관").isascii()

    def test_latin_passthrough(self):
        # Latin input is expected back unchanged.
        name = "Rijksmuseum Amsterdam"
        assert transliterate(name) == name

    def test_with_explicit_language(self):
        romanized = transliterate("故宮博物院", lang="zh")
        assert len(romanized) > 0
        # Either the Hanzi is gone or the output carries a "[REQUIRES" marker.
        assert "故" not in romanized or "[REQUIRES" in romanized
|
||
|
||
|
||
if __name__ == "__main__":
    # pytest.main() returns the run's exit code rather than exiting itself;
    # pass it to sys.exit so running this file directly reports failures
    # through the process exit status.
    sys.exit(pytest.main([__file__, "-v"]))
|