#!/usr/bin/env python3 """ Transliteration Utility for GHCID Abbreviation Generation This script provides transliteration functions for converting non-Latin script institution names to Latin characters using ISO and recognized standards. Usage: # As a module from scripts.transliterate_emic_names import transliterate_for_abbreviation latin = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru") # Result: "Institut vostochnykh rukopisey RAN" # As a CLI tool python scripts/transliterate_emic_names.py --text "東巴文化博物院" --lang zh python scripts/transliterate_emic_names.py --file data/custodian/example.yaml Standards: - Cyrillic (ru, uk, bg, sr, kk): ISO 9:1995 - Chinese (zh): Hanyu Pinyin (ISO 7098) - Japanese (ja): Modified Hepburn - Korean (ko): Revised Romanization - Arabic (ar, fa, ur): ISO 233-2/3 - Hebrew (he): ISO 259-3 - Greek (el): ISO 843 - Devanagari (hi, ne, bn): ISO 15919 - Thai (th): ISO 11940-2 - Armenian (hy): ISO 9985 - Georgian (ka): ISO 9984 Author: GLAM Project Created: 2025-12-08 """ import argparse import re import unicodedata from pathlib import Path from typing import Optional, Dict, List, Tuple # Try importing optional transliteration libraries AVAILABLE_LIBS: Dict[str, bool] = {} try: from pypinyin import pinyin, Style AVAILABLE_LIBS['pypinyin'] = True except ImportError: AVAILABLE_LIBS['pypinyin'] = False try: import pykakasi AVAILABLE_LIBS['pykakasi'] = True except ImportError: AVAILABLE_LIBS['pykakasi'] = False try: from transliterate import translit AVAILABLE_LIBS['transliterate'] = True except ImportError: AVAILABLE_LIBS['transliterate'] = False # ============================================================================= # SCRIPT DETECTION # ============================================================================= def detect_script(text: str) -> str: """ Detect the primary script of the input text. 
Returns one of: - 'latin': Latin alphabet - 'cyrillic': Cyrillic script - 'chinese': Chinese characters (Hanzi) - 'japanese': Japanese (mixed Kanji/Kana) - 'korean': Korean Hangul - 'arabic': Arabic script (includes Persian, Urdu) - 'hebrew': Hebrew script - 'greek': Greek script - 'devanagari': Devanagari (Hindi, Nepali, Sanskrit) - 'bengali': Bengali script - 'thai': Thai script - 'armenian': Armenian script - 'georgian': Georgian script - 'sinhala': Sinhala script - 'khmer': Khmer script - 'unknown': Cannot determine """ script_ranges = { 'cyrillic': (0x0400, 0x04FF), 'arabic': (0x0600, 0x06FF), 'persian_ext': (0x0750, 0x077F), # Arabic Supplement 'hebrew': (0x0590, 0x05FF), 'devanagari': (0x0900, 0x097F), 'bengali': (0x0980, 0x09FF), 'thai': (0x0E00, 0x0E7F), 'greek': (0x0370, 0x03FF), 'armenian': (0x0530, 0x058F), 'georgian': (0x10A0, 0x10FF), 'korean': (0xAC00, 0xD7AF), # Hangul syllables 'korean_jamo': (0x1100, 0x11FF), # Hangul Jamo 'japanese_hiragana': (0x3040, 0x309F), 'japanese_katakana': (0x30A0, 0x30FF), 'chinese': (0x4E00, 0x9FFF), # CJK Unified Ideographs 'chinese_ext': (0x3400, 0x4DBF), # CJK Extension A 'sinhala': (0x0D80, 0x0DFF), 'khmer': (0x1780, 0x17FF), } script_counts: Dict[str, int] = {script: 0 for script in script_ranges} latin_count = 0 for char in text: code = ord(char) # Check Latin if ('a' <= char <= 'z') or ('A' <= char <= 'Z'): latin_count += 1 continue # Check other scripts for script, (start, end) in script_ranges.items(): if start <= code <= end: script_counts[script] += 1 break # Determine primary script if latin_count > 0 and all(c == 0 for c in script_counts.values()): return 'latin' # Merge related scripts script_counts['arabic'] += script_counts.get('persian_ext', 0) script_counts['korean'] += script_counts.get('korean_jamo', 0) script_counts['chinese'] += script_counts.get('chinese_ext', 0) # Handle CJK disambiguation cjk_count = script_counts.get('chinese', 0) hiragana_count = script_counts.get('japanese_hiragana', 0) 
def transliterate_cyrillic(text: str, lang: str = 'ru') -> str:
    """Transliterate Cyrillic text to Latin characters.

    The optional ``transliterate`` package is tried first when the module
    flagged it as available; if it fails for any reason the manual
    ``CYRILLIC_MAP`` table is used instead.

    Args:
        text: Text containing Cyrillic characters. Characters without a
            mapping are passed through unchanged.
        lang: Language code of the text (default ``'ru'``). In the fallback
            it decides how the hard sign is rendered: Bulgarian (``'bg'``)
            reads it as a vowel (``'A'``/``'a'``); every other language
            drops it.

    Returns:
        The transliterated string.
    """
    if AVAILABLE_LIBS.get('transliterate'):
        try:
            return translit(text, lang, reversed=True)
        except Exception:
            # Deliberate best effort: any failure inside the optional
            # library falls through to the manual table below.
            pass

    # CYRILLIC_MAP lists the hard sign twice (Russian '' and Bulgarian 'A');
    # a dict literal keeps only the last value, so without this override
    # Russian "объект" would come out as "obaekt". Resolve it per language.
    upper, lower = ('A', 'a') if lang == 'bg' else ('', '')
    hard_sign = {'\u042a': upper, '\u044a': lower}  # Ъ / ъ

    return ''.join(hard_sign.get(c, CYRILLIC_MAP.get(c, c)) for c in text)
def transliterate_chinese(text: str) -> str:
    """Romanize Chinese text as toneless Pinyin.

    With ``pypinyin`` available, each element returned by ``pinyin()`` is
    concatenated and the elements are joined with single spaces.

    Without it, every character is looked up in ``CHINESE_PINYIN_MAP``;
    spaces and other alphanumeric characters are kept verbatim, anything
    else is dropped, and the pieces are concatenated with no separator.

    NOTE(review): the two paths space their output differently (the
    fallback yields e.g. "bowuguan" where the library path yields
    space-separated syllables) - confirm which form callers expect.

    Returns:
        The romanized text, or ``'[REQUIRES_PYPINYIN]'`` when the fallback
        produced nothing at all.
    """
    if AVAILABLE_LIBS.get('pypinyin'):
        syllables = (''.join(readings) for readings in pinyin(text, style=Style.NORMAL))
        return ' '.join(syllables)

    def _romanize(ch):
        # Known vocabulary wins; otherwise keep spaces/alphanumerics as-is.
        known = CHINESE_PINYIN_MAP.get(ch)
        if known is not None:
            return known
        if ch == ' ' or ch.isalnum():
            return ch
        return None

    pieces = [piece for piece in map(_romanize, text) if piece is not None]
    return ''.join(pieces) if pieces else '[REQUIRES_PYPINYIN]'
'moku', '農': 'nou', '業': 'gyou', '工': 'kou', '商': 'shou', '醫': 'i', '藥': 'yaku', '人': 'jin', '生': 'sei', '活': 'katsu', '和': 'wa', '平': 'hei', '友': 'yuu', '誼': 'gi', '港': 'kou', '灣': 'wan', '華': 'ka', '僑': 'kyou', '外': 'gai', '交': 'kou', '流': 'ryuu', '保': 'ho', '護': 'go', '修': 'shuu', '復': 'fuku', '鑒': 'kan', '定': 'tei', '評': 'hyou', '估': 'ko', '記': 'ki', '録': 'roku', '資': 'shi', '料': 'ryou', # Hiragana 'あ': 'a', 'い': 'i', 'う': 'u', 'え': 'e', 'お': 'o', 'か': 'ka', 'き': 'ki', 'く': 'ku', 'け': 'ke', 'こ': 'ko', 'さ': 'sa', 'し': 'shi', 'す': 'su', 'せ': 'se', 'そ': 'so', 'た': 'ta', 'ち': 'chi', 'つ': 'tsu', 'て': 'te', 'と': 'to', 'な': 'na', 'に': 'ni', 'ぬ': 'nu', 'ね': 'ne', 'の': 'no', 'は': 'ha', 'ひ': 'hi', 'ふ': 'fu', 'へ': 'he', 'ほ': 'ho', 'ま': 'ma', 'み': 'mi', 'む': 'mu', 'め': 'me', 'も': 'mo', 'や': 'ya', 'ゆ': 'yu', 'よ': 'yo', 'ら': 'ra', 'り': 'ri', 'る': 'ru', 'れ': 're', 'ろ': 'ro', 'わ': 'wa', 'を': 'wo', 'ん': 'n', # Katakana 'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o', 'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke', 'コ': 'ko', 'サ': 'sa', 'シ': 'shi', 'ス': 'su', 'セ': 'se', 'ソ': 'so', 'タ': 'ta', 'チ': 'chi', 'ツ': 'tsu', 'テ': 'te', 'ト': 'to', 'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no', 'ハ': 'ha', 'ヒ': 'hi', 'フ': 'fu', 'ヘ': 'he', 'ホ': 'ho', 'マ': 'ma', 'ミ': 'mi', 'ム': 'mu', 'メ': 'me', 'モ': 'mo', 'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo', 'ラ': 'ra', 'リ': 'ri', 'ル': 'ru', 'レ': 're', 'ロ': 'ro', 'ワ': 'wa', 'ヲ': 'wo', 'ン': 'n', } def transliterate_japanese(text: str) -> str: """Transliterate Japanese to Romaji using Modified Hepburn.""" if AVAILABLE_LIBS.get('pykakasi'): kakasi = pykakasi.kakasi() result = kakasi.convert(text) return ' '.join([item['hepburn'] for item in result]) # Fallback: use basic vocabulary mapping result = [] for char in text: if char in JAPANESE_ROMAJI_MAP: result.append(JAPANESE_ROMAJI_MAP[char]) elif char == ' ': result.append(' ') elif char.isalnum(): result.append(char) # If we got no result, return warning if not result: return '[REQUIRES_PYKAKASI]' return 
# Basic Hangul syllable decomposition tables, indexed by jamo position
# (19 initials x 21 medials x 28 finals = 11172 precomposed syllables).
HANGUL_INITIALS = [
    'g', 'kk', 'n', 'd', 'tt', 'r', 'm', 'b', 'pp', 's', 'ss', '', 'j', 'jj',
    'ch', 'k', 't', 'p', 'h'
]

HANGUL_MEDIALS = [
    'a', 'ae', 'ya', 'yae', 'eo', 'e', 'yeo', 'ye', 'o', 'wa', 'wae', 'oe',
    'yo', 'u', 'wo', 'we', 'wi', 'yu', 'eu', 'ui', 'i'
]

# Syllable-final values. Index 19/20 (final siot / ssang-siot) are 't':
# Revised Romanization writes these finals as 't', not 's'.
HANGUL_FINALS = [
    '', 'k', 'k', 'k', 'n', 'n', 'n', 't', 'l', 'l', 'l', 'l', 'l', 'l', 'l',
    'l', 'm', 'p', 'p', 't', 't', 'ng', 't', 't', 'k', 't', 'p', 't'
]


def transliterate_korean(text: str) -> str:
    """Transliterate Korean Hangul to Revised Romanization.

    Uses the optional ``korean_romanizer`` package when it can be imported.
    Otherwise each precomposed Hangul syllable is decomposed arithmetically
    into initial/medial/final indices and romanized from the tables above,
    one syllable at a time (no sound-change rules between syllables).

    Args:
        text: Text that may contain Hangul syllables.

    Returns:
        The romanized text; non-Hangul characters are passed through.
    """
    try:
        from korean_romanizer.romanizer import Romanizer
        r = Romanizer(text)
        return r.romanize()
    except ImportError:
        pass

    # Fallback: basic syllable decomposition.
    finals = len(HANGUL_FINALS)
    per_initial = len(HANGUL_MEDIALS) * finals
    # Only U+AC00..U+D7A3 are syllables; the rest of the block up to U+D7AF
    # is unassigned and would index past the end of HANGUL_INITIALS.
    syllable_count = len(HANGUL_INITIALS) * per_initial

    result = []
    for char in text:
        offset = ord(char) - 0xAC00
        if 0 <= offset < syllable_count:
            initial, rest = divmod(offset, per_initial)
            medial, final = divmod(rest, finals)
            # HANGUL_FINALS[0] is '', so open syllables need no special case.
            result.append(
                HANGUL_INITIALS[initial] + HANGUL_MEDIALS[medial] + HANGUL_FINALS[final]
            )
        else:
            result.append(char)

    return ''.join(result)
HEBREW_MAP = {
    'א': '', 'ב': 'v', 'ג': 'g', 'ד': 'd', 'ה': 'h', 'ו': 'v', 'ז': 'z',
    'ח': 'ch', 'ט': 't', 'י': 'y', 'כ': 'k', 'ך': 'k', 'ל': 'l', 'מ': 'm',
    'ם': 'm', 'נ': 'n', 'ן': 'n', 'ס': 's', 'ע': '', 'פ': 'f', 'ף': 'f',
    'צ': 'ts', 'ץ': 'ts', 'ק': 'k', 'ר': 'r', 'ש': 'sh', 'ת': 't',
    # With dagesh: base letter followed by U+05BC ...
    '\u05d1\u05bc': 'b', '\u05db\u05bc': 'k', '\u05e4\u05bc': 'p',
    # ... and the single-code-point presentation forms of the same letters.
    '\ufb31': 'b', '\ufb3b': 'k', '\ufb44': 'p',
}


def transliterate_hebrew(text: str) -> str:
    """Transliterate Hebrew to Latin (ISO 259-3 simplified).

    Each base character is taken together with the combining marks that
    follow it. When a dagesh (U+05BC) is among those marks and
    ``HEBREW_MAP`` has an entry for "letter + dagesh", that entry is used;
    a plain per-character lookup could never match those two-code-point
    keys. All other combining marks (vowel points, cantillation) are
    dropped.

    Args:
        text: Text that may contain Hebrew letters, pointed or unpointed.

    Returns:
        The transliterated text. Spaces and alphanumeric characters without
        a mapping are kept; other unmapped characters are removed.
    """
    dagesh = '\u05bc'
    result = []
    i = 0
    length = len(text)
    while i < length:
        c = text[i]

        # Gather the run of combining marks attached to this character.
        j = i + 1
        while j < length and unicodedata.combining(text[j]):
            j += 1
        marks = text[i + 1:j]

        key = c
        if dagesh in marks and (c + dagesh) in HEBREW_MAP:
            key = c + dagesh

        if key in HEBREW_MAP:
            result.append(HEBREW_MAP[key])
        elif c == ' ' or c.isalnum():
            result.append(c)

        i = j
    return ''.join(result)
DEVANAGARI_MAP = {
    # Independent vowels
    'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ii', 'उ': 'u', 'ऊ': 'uu', 'ऋ': 'ri',
    'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au',
    # Dependent vowel signs (written as code points: they are combining marks)
    '\u093e': 'a',   # sign AA
    '\u093f': 'i',   # sign I
    '\u0940': 'i',   # sign II
    '\u0941': 'u',   # sign U
    '\u0942': 'u',   # sign UU
    '\u0943': 'ri',  # sign vocalic R (same value as the independent vowel)
    '\u0947': 'e',   # sign E
    '\u0948': 'ai',  # sign AI
    '\u094b': 'o',   # sign O
    '\u094c': 'au',  # sign AU
    '\u0902': 'm',   # anusvara
    '\u0903': 'h',   # visarga
    # Consonants (each carries the inherent vowel 'a')
    'क': 'ka', 'ख': 'kha', 'ग': 'ga', 'घ': 'gha', 'ङ': 'nga',
    'च': 'cha', 'छ': 'chha', 'ज': 'ja', 'झ': 'jha', 'ञ': 'nya',
    'ट': 'ta', 'ठ': 'tha', 'ड': 'da', 'ढ': 'dha', 'ण': 'na',
    'त': 'ta', 'थ': 'tha', 'द': 'da', 'ध': 'dha', 'न': 'na',
    'प': 'pa', 'फ': 'pha', 'ब': 'ba', 'भ': 'bha', 'म': 'ma',
    'य': 'ya', 'र': 'ra', 'ल': 'la', 'व': 'va',
    'श': 'sha', 'ष': 'sha', 'स': 'sa', 'ह': 'ha',
    '\u094d': '',    # Virama (removes inherent 'a')
    # Hindi-specific nukta letters: consonant + U+093C ...
    '\u0921\u093c': 'da', '\u0922\u093c': 'dha', '\u0915\u093c': 'qa',
    '\u0916\u093c': 'kha', '\u0917\u093c': 'gha', '\u091c\u093c': 'za',
    '\u092b\u093c': 'fa',
    # ... and their precomposed single-code-point equivalents.
    '\u095c': 'da', '\u095d': 'dha', '\u0958': 'qa', '\u0959': 'kha',
    '\u095a': 'gha', '\u095b': 'za', '\u095e': 'fa',
}


def transliterate_devanagari(text: str) -> str:
    """Transliterate Devanagari to Latin (ISO 15919 simplified).

    Uses ``indic_transliteration`` (IAST output) when it can be imported.
    Otherwise ``DEVANAGARI_MAP`` is applied with two corrections that a
    plain per-character lookup cannot provide:

    * a vowel sign or virama after a consonant replaces that consonant's
      inherent 'a' instead of being appended to it ("hindi", not
      "hainadai");
    * two-code-point nukta letters are matched as a unit.

    Args:
        text: Text that may contain Devanagari characters.

    Returns:
        The romanized text. Spaces and alphanumeric characters without a
        mapping are kept; other unmapped characters are removed.
    """
    try:
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate as indic_translit
        return indic_translit(text, sanscript.DEVANAGARI, sanscript.IAST)
    except ImportError:
        pass

    virama = '\u094d'

    def is_consonant(ch: str) -> bool:
        return '\u0915' <= ch <= '\u0939' or '\u0958' <= ch <= '\u095f'

    def replaces_inherent(ch: str) -> bool:
        return ch == virama or '\u093e' <= ch <= '\u094c'

    result = []
    # True while the last emitted piece is a consonant still carrying 'a'.
    inherent_pending = False
    i = 0
    while i < len(text):
        pair = text[i:i + 2]
        key = pair if pair in DEVANAGARI_MAP else text[i]
        c = key[0]

        if key in DEVANAGARI_MAP:
            if inherent_pending and replaces_inherent(c) and result[-1].endswith('a'):
                result[-1] = result[-1][:-1]
            result.append(DEVANAGARI_MAP[key])
            inherent_pending = is_consonant(c)
        elif c == ' ' or c.isalnum():
            result.append(c)
            inherent_pending = False
        # Any other character (joiners, unmapped marks) is dropped and
        # leaves the pending inherent vowel untouched.

        i += len(key)
    return ''.join(result)
# Thai special characters. Combining marks are written as code points so
# each key is unambiguous; the maiyamok entry appears once (the table
# previously listed the same key twice with the same value).
THAI_SPECIAL = {
    '\u0e47': '',   # Maitaikhu (shortens vowel)
    '\u0e4c': '',   # Thanthakhat (silent letter marker)
    '\u0e46': '',   # Maiyamok (repeat previous)
    '\u0e3f': 'B',  # Baht symbol
    '\u0e2f': '',   # Paiyannoi (abbreviation)
    '\u0e4f': '',   # Fongman (obsolete)
    '\u0e4d': 'm',  # Nikhahit (nasalization, often 'm')
    '\u0e3a': '',   # Phinthu (Sanskrit virama)
}
'หอจดหมายเหตุ': 'ho chotmaihet', 'หอสมุด': 'ho samut', 'แห่งชาติ': 'haeng chat', 'พิพิธภัณฑ': 'phiphitthaphan', 'พิพิธภัณฑสถาน': 'phiphitthaphanthasathan', 'พระนคร': 'phra nakhon', 'สยาม': 'sayam', 'สมาคม': 'samakhom', 'ใน': 'nai', 'พระบรมราชูปถัมภ์': 'phra borommarachuppatham', 'พระที่นั่ง': 'phra thi nang', 'ศิวโมกข': 'siwamok', 'พิมาน': 'phiman', 'วัด': 'wat', 'โพธิ์': 'pho', 'ราม': 'ram', # Geographic terms 'กรุงเทพ': 'krung thep', 'กรุงเทพมหานคร': 'krung thep maha nakhon', 'เชียงใหม่': 'chiang mai', 'ภูเก็ต': 'phuket', # Institution types 'มหาวิทยาลัย': 'mahawitthayalai', 'ศูนย์': 'sun', 'สถาบัน': 'sathaban', 'องค์กร': 'ongkon', 'กรม': 'krom', 'กระทรวง': 'krasuang', # Cultural terms 'วัฒนธรรม': 'watthanatham', 'ศิลปะ': 'sinlapa', 'ประวัติศาสตร์': 'prawattisat', 'โบราณ': 'boran', 'มรดก': 'moradok', } def transliterate_thai(text: str) -> str: """Transliterate Thai to Latin (Royal Thai General System). Uses pythainlp if available, otherwise falls back to vocabulary lookup and character-by-character transliteration. 
""" try: from pythainlp.transliterate import romanize return romanize(text, engine='royin') # Royal Institute standard except ImportError: pass # Fallback: vocabulary lookup + character mapping result = text # First pass: replace known vocabulary items (longest match first) for thai, latin in sorted(THAI_HERITAGE_VOCAB.items(), key=lambda x: -len(x[0])): result = result.replace(thai, f' {latin} ') # Second pass: transliterate remaining Thai characters output = [] i = 0 while i < len(result): c = result[i] # Skip if already Latin if c.isascii(): output.append(c) i += 1 continue # Check consonants if c in THAI_CONSONANTS: output.append(THAI_CONSONANTS[c]) i += 1 continue # Check vowels if c in THAI_VOWELS: output.append(THAI_VOWELS[c]) i += 1 continue # Skip tone marks if c in THAI_TONE_MARKS: i += 1 continue # Check special characters if c in THAI_SPECIAL: output.append(THAI_SPECIAL[c]) i += 1 continue # Check numerals if c in THAI_NUMERALS: output.append(THAI_NUMERALS[c]) i += 1 continue # Unknown character - keep as is output.append(c) i += 1 # Clean up spacing result = ''.join(output) result = ' '.join(result.split()) # Normalize whitespace return result # ============================================================================= # ARMENIAN TRANSLITERATION (ISO 9985) # ============================================================================= ARMENIAN_MAP = { 'Ա': 'A', 'ա': 'a', 'Բ': 'B', 'բ': 'b', 'Գ': 'G', 'գ': 'g', 'Դ': 'D', 'դ': 'd', 'Ե': 'E', 'ե': 'e', 'Զ': 'Z', 'զ': 'z', 'Է': 'E', 'է': 'e', 'Ը': 'Y', 'ը': 'y', 'Թ': 'T', 'թ': 't', 'Ժ': 'Zh', 'ժ': 'zh', 'Ի': 'I', 'ի': 'i', 'Լ': 'L', 'լ': 'l', 'Խ': 'Kh', 'խ': 'kh', 'Ծ': 'Ts', 'ծ': 'ts', 'Կ': 'K', 'կ': 'k', 'Հ': 'H', 'հ': 'h', 'Ձ': 'Dz', 'ձ': 'dz', 'Ղ': 'Gh', 'ղ': 'gh', 'Ճ': 'Ch', 'ճ': 'ch', 'Մ': 'M', 'մ': 'm', 'Յ': 'Y', 'յ': 'y', 'Ն': 'N', 'ն': 'n', 'Շ': 'Sh', 'շ': 'sh', 'Ո': 'O', 'ո': 'o', 'Չ': 'Ch', 'չ': 'ch', 'Պ': 'P', 'պ': 'p', 'Ջ': 'J', 'ջ': 'j', ' Delays': 'R', ' delays': 'r', ' Delays': 'S', 'ս': 
's', 'Ვ': 'V', 'վ': 'v', 'Տ': 'T', 'տ': 't', 'ร': 'R', 'ր': 'r', 'Ց': 'Ts', 'ց': 'ts', 'Ւ': 'W', 'ւ': 'w', 'Փ': 'P', 'փ': 'p', 'Ք': 'K', 'ք': 'k', 'Օ': 'O', 'օ': 'o', 'Ֆ': 'F', 'ֆ': 'f', } def transliterate_armenian(text: str) -> str: """Transliterate Armenian to Latin (ISO 9985).""" return ''.join(ARMENIAN_MAP.get(c, c) for c in text) # ============================================================================= # GEORGIAN TRANSLITERATION (ISO 9984) # ============================================================================= GEORGIAN_MAP = { 'ა': 'a', 'ბ': 'b', 'გ': 'g', 'დ': 'd', 'ე': 'e', 'ვ': 'v', 'ზ': 'z', 'თ': 't', 'ი': 'i', 'კ': 'k', 'ლ': 'l', 'მ': 'm', 'ნ': 'n', 'ო': 'o', 'პ': 'p', 'ჟ': 'zh', 'რ': 'r', 'ს': 's', 'ტ': 't', 'უ': 'u', 'ფ': 'p', 'ქ': 'k', 'ღ': 'gh', 'ყ': 'q', 'შ': 'sh', 'ჩ': 'ch', 'ც': 'ts', 'ძ': 'dz', 'წ': 'ts', 'ჭ': 'ch', 'ხ': 'kh', 'ჯ': 'j', 'ჰ': 'h', } def transliterate_georgian(text: str) -> str: """Transliterate Georgian to Latin (ISO 9984).""" return ''.join(GEORGIAN_MAP.get(c, c) for c in text) # ============================================================================= # BENGALI TRANSLITERATION (ISO 15919) # ============================================================================= BENGALI_MAP = { # Vowels 'অ': 'a', 'আ': 'aa', 'ই': 'i', 'ঈ': 'ii', 'উ': 'u', 'ঊ': 'uu', 'এ': 'e', 'ঐ': 'ai', 'ও': 'o', 'ঔ': 'au', # Consonants 'ক': 'ka', 'খ': 'kha', 'গ': 'ga', 'ঘ': 'gha', 'ঙ': 'nga', 'চ': 'cha', 'ছ': 'chha', 'জ': 'ja', 'ঝ': 'jha', 'ঞ': 'nya', 'ট': 'ta', 'ঠ': 'tha', 'ড': 'da', 'ঢ': 'dha', 'ণ': 'na', 'ত': 'ta', 'থ': 'tha', 'দ': 'da', 'ধ': 'dha', 'ন': 'na', 'প': 'pa', 'ফ': 'pha', 'ব': 'ba', 'ভ': 'bha', 'ম': 'ma', 'য': 'ya', 'র': 'ra', 'ল': 'la', 'শ': 'sha', 'ষ': 'sha', 'স': 'sa', 'হ': 'ha', 'ড়': 'ra', 'ঢ়': 'rha', 'য়': 'ya', '়': '', # Nukta '্': '', # Virama # Vowel marks 'া': 'a', 'ি': 'i', 'ী': 'i', 'ু': 'u', 'ূ': 'u', 'ে': 'e', 'ৈ': 'ai', 'ো': 'o', 'ৌ': 'au', 'ং': 'ng', 'ঃ': 'h', 'ঁ': 'n', } def transliterate_bengali(text: str) 
-> str: """Transliterate Bengali to Latin (ISO 15919 simplified).""" result = [] for c in text: if c in BENGALI_MAP: result.append(BENGALI_MAP[c]) elif c == ' ': result.append(' ') elif c.isalnum(): result.append(c) return ''.join(result) # ============================================================================= # SINHALA TRANSLITERATION (ISO 15919) # ============================================================================= # Sinhala character map (ISO 15919 romanization) SINHALA_MAP = { # Independent vowels 'අ': 'a', 'ආ': 'aa', 'ඇ': 'ae', 'ඈ': 'aae', 'ඉ': 'i', 'ඊ': 'ii', 'උ': 'u', 'ඌ': 'uu', 'එ': 'e', 'ඒ': 'ee', 'ඓ': 'ai', 'ඔ': 'o', 'ඕ': 'oo', 'ඖ': 'au', 'ඍ': 'ri', 'ඎ': 'rii', # Consonants (with inherent 'a' vowel) 'ක': 'ka', 'ඛ': 'kha', 'ග': 'ga', 'ඝ': 'gha', 'ඞ': 'nga', 'ඟ': 'nnga', 'ච': 'cha', 'ඡ': 'chha', 'ජ': 'ja', 'ඣ': 'jha', 'ඤ': 'nya', 'ඥ': 'gnya', 'ට': 'ta', 'ඨ': 'tha', 'ඩ': 'da', 'ඪ': 'dha', 'ණ': 'na', 'ඬ': 'nda', 'ත': 'tha', 'ථ': 'thha', 'ද': 'da', 'ධ': 'dha', 'න': 'na', 'ඳ': 'nda', 'ප': 'pa', 'ඵ': 'pha', 'බ': 'ba', 'භ': 'bha', 'ම': 'ma', 'ඹ': 'mba', 'ය': 'ya', 'ර': 'ra', 'ල': 'la', 'ව': 'va', 'ළ': 'la', 'ශ': 'sha', 'ෂ': 'sha', 'ස': 'sa', 'හ': 'ha', 'ෆ': 'fa', # Used for foreign words # Dependent vowel signs (matras) 'ා': 'a', 'ැ': 'ae', 'ෑ': 'aae', 'ි': 'i', 'ී': 'ii', 'ු': 'u', 'ූ': 'uu', 'ෙ': 'e', 'ේ': 'ee', 'ෛ': 'ai', 'ො': 'o', 'ෝ': 'oo', 'ෞ': 'au', 'ෘ': 'ri', 'ෲ': 'rii', # Special marks '්': '', # Virama (hal kirima) - removes inherent vowel 'ං': 'ng', # Anusvara 'ඃ': 'h', # Visarga '෴': '', # Kunddaliya (punctuation) # Numerals (Sinhala uses both Sinhala and Arabic numerals) '෦': '0', '෧': '1', '෨': '2', '෩': '3', '෪': '4', '෫': '5', '෬': '6', '෭': '7', '෮': '8', '෯': '9', } # Common Sinhala heritage vocabulary SINHALA_HERITAGE_VOCAB = { # University/Education 'විශ්වවිද්‍යාලය': 'vishvavidyalaya', 'විශ්වවිද්': 'vishvavid', 'යාලය': 'yalaya', 'පේරාදෙණිය': 'peradeniya', # National/Government 'ජාතික': 'jathika', 'දෙපාර්තමේන්තුව': 
def transliterate_sinhala(text: str) -> str:
    """Transliterate Sinhala to Latin (ISO 15919).

    Known heritage vocabulary is replaced first (longest entries first);
    the remaining characters go through ``SINHALA_MAP``. Consonants in that
    map carry an inherent 'a', so when a dependent vowel sign or the virama
    follows a consonant the inherent 'a' is removed before the sign's own
    value is appended (otherwise consonant + sign I would read "kai"
    instead of "ki").

    Args:
        text: Text in Sinhala script

    Returns:
        Romanized text with runs of whitespace collapsed to single spaces.
        Unmapped non-ASCII characters are kept only if alphanumeric.
    """
    # First pass: replace known vocabulary (longest match first)
    result = text
    for sinhala, latin in sorted(SINHALA_HERITAGE_VOCAB.items(), key=lambda x: -len(x[0])):
        result = result.replace(sinhala, f' {latin} ')

    virama = '\u0dca'

    def is_consonant(ch: str) -> bool:
        return '\u0d9a' <= ch <= '\u0dc6'

    def replaces_inherent(ch: str) -> bool:
        return (
            ch == virama
            or '\u0dcf' <= ch <= '\u0ddf'
            or '\u0df2' <= ch <= '\u0df3'
        )

    # Second pass: transliterate remaining characters
    output = []
    # True while the last emitted piece is a consonant still carrying 'a'.
    inherent_pending = False
    for c in result:
        # ASCII (including spaces and already-romanized vocabulary) passes through
        if c.isascii():
            output.append(c)
            inherent_pending = False
            continue

        if c in SINHALA_MAP:
            if inherent_pending and replaces_inherent(c) and output[-1].endswith('a'):
                output[-1] = output[-1][:-1]
            output.append(SINHALA_MAP[c])
            inherent_pending = is_consonant(c)
            continue

        # Unmapped character: keep if alphanumeric, otherwise drop (e.g. the
        # zero-width joiner inside conjuncts) without touching the pending
        # inherent vowel.
        if c.isalnum():
            output.append(c)
            inherent_pending = False

    # Clean up spacing
    return ' '.join(''.join(output).split())
def transliterate_khmer(text: str) -> str:
    """Romanize Khmer script text (UNGEGN system).

    Two passes: heritage vocabulary entries are substituted first, longest
    entry first, each padded with spaces; every remaining character is then
    looked up in the consonant, dependent-vowel, independent-vowel,
    special-sign and numeral tables, in that order.

    Args:
        text: Text in Khmer script

    Returns:
        The romanized text with whitespace runs collapsed to single spaces.
        ASCII passes through; unmapped non-ASCII characters survive only if
        they are alphanumeric.
    """
    romanized = text
    vocabulary = sorted(KHMER_HERITAGE_VOCAB.items(), key=lambda x: -len(x[0]))
    for native, latin in vocabulary:
        romanized = romanized.replace(native, f' {latin} ')

    # Order matters only if a character appears in more than one table.
    lookup_order = (
        KHMER_CONSONANTS,
        KHMER_VOWELS,
        KHMER_INDEP_VOWELS,
        KHMER_SPECIAL,
        KHMER_NUMERALS,
    )

    pieces = []
    for ch in romanized:
        if ch.isascii():
            pieces.append(ch)
            continue
        table = next((t for t in lookup_order if ch in t), None)
        if table is not None:
            pieces.append(table[ch])
        elif ch == ' ' or ch.isalnum():
            pieces.append(ch)

    return ' '.join(''.join(pieces).split())
'devanagari': transliterate_devanagari, 'bengali': transliterate_bengali, 'thai': transliterate_thai, 'armenian': transliterate_armenian, 'georgian': transliterate_georgian, 'sinhala': transliterate_sinhala, 'khmer': transliterate_khmer, 'latin': lambda t: t, # No transliteration needed } def transliterate(text: str, lang: Optional[str] = None) -> str: """ Transliterate text from non-Latin script to Latin. Args: text: Input text in any script lang: Optional ISO 639-1 language code (e.g., 'ru', 'zh', 'ko') If not provided, script is auto-detected. Returns: Transliterated text in Latin characters. """ if not text: return text # Determine script if lang and lang in LANG_SCRIPT_MAP: script = LANG_SCRIPT_MAP[lang] else: script = detect_script(text) # Get transliterator translit_func = TRANSLITERATORS.get(script, lambda t: t) # For Cyrillic, pass language for dialect-specific handling if script == 'cyrillic' and lang: result = translit_func(text, lang) else: result = translit_func(text) # Normalize diacritics to ASCII normalized = unicodedata.normalize('NFD', result) ascii_result = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') return ascii_result def transliterate_for_abbreviation(emic_name: str, lang: str) -> str: """ Transliterate emic name for GHCID abbreviation generation. This is the main entry point for GHCID generation scripts. 
Args: emic_name: Institution name in original script lang: ISO 639-1 language code Returns: Transliterated name ready for abbreviation extraction """ # Step 1: Transliterate to Latin latin = transliterate(emic_name, lang) # Step 2: Remove special characters (except spaces and hyphens) clean = re.sub(r"[^a-zA-Z\s\-']", ' ', latin) # Step 3: Normalize whitespace clean = ' '.join(clean.split()) return clean # ============================================================================= # CLI INTERFACE # ============================================================================= def main(): parser = argparse.ArgumentParser( description='Transliterate non-Latin script text to Latin characters', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=''' Examples: python transliterate_emic_names.py --text "Институт" --lang ru python transliterate_emic_names.py --text "东巴文化博物院" --lang zh python transliterate_emic_names.py --file data/custodian/example.yaml Supported languages: ru (Russian), uk (Ukrainian), bg (Bulgarian), sr (Serbian), kk (Kazakh) zh (Chinese), ja (Japanese), ko (Korean) ar (Arabic), fa (Persian), ur (Urdu), he (Hebrew) el (Greek), hi (Hindi), ne (Nepali), bn (Bengali) th (Thai), hy (Armenian), ka (Georgian) ''' ) parser.add_argument('--text', '-t', help='Text to transliterate') parser.add_argument('--lang', '-l', help='ISO 639-1 language code') parser.add_argument('--file', '-f', help='YAML file to process') parser.add_argument('--detect', '-d', action='store_true', help='Only detect script, do not transliterate') parser.add_argument('--libs', action='store_true', help='Show available transliteration libraries') args = parser.parse_args() if args.libs: print("Available transliteration libraries:") for lib, available in AVAILABLE_LIBS.items(): status = "✓ installed" if available else "✗ not installed" print(f" {lib}: {status}") return if args.file: import yaml with open(args.file, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) emic_name = 
data.get('custodian_name', {}).get('emic_name') lang = data.get('custodian_name', {}).get('name_language') if not emic_name: print(f"Error: No emic_name found in {args.file}") return print(f"Emic name: {emic_name}") print(f"Language: {lang or '(auto-detect)'}") if args.detect: script = detect_script(emic_name) print(f"Detected script: {script}") else: result = transliterate_for_abbreviation(emic_name, lang) print(f"Transliterated: {result}") return if args.text: if args.detect: script = detect_script(args.text) print(f"Input: {args.text}") print(f"Detected script: {script}") else: result = transliterate_for_abbreviation(args.text, args.lang) print(f"Input: {args.text}") print(f"Language: {args.lang or '(auto-detect)'}") print(f"Output: {result}") return parser.print_help() if __name__ == '__main__': main()