- Add fix_ghcid_diacritics.py for normalizing non-ASCII in GHCIDs - Add resolve_diacritics_collisions.py for collision handling - Add transliterate_emic_names.py for non-Latin script handling - Add transliteration tests
1267 lines
46 KiB
Python
1267 lines
46 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Transliteration Utility for GHCID Abbreviation Generation
|
||
|
||
This script provides transliteration functions for converting non-Latin script
|
||
institution names to Latin characters using ISO and recognized standards.
|
||
|
||
Usage:
|
||
# As a module
|
||
from scripts.transliterate_emic_names import transliterate_for_abbreviation
|
||
|
||
latin = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru")
|
||
# Result: "Institut vostochnykh rukopisey RAN"
|
||
|
||
# As a CLI tool
|
||
python scripts/transliterate_emic_names.py --text "東巴文化博物院" --lang zh
|
||
python scripts/transliterate_emic_names.py --file data/custodian/example.yaml
|
||
|
||
Standards:
|
||
- Cyrillic (ru, uk, bg, sr, kk): ISO 9:1995
|
||
- Chinese (zh): Hanyu Pinyin (ISO 7098)
|
||
- Japanese (ja): Modified Hepburn
|
||
- Korean (ko): Revised Romanization
|
||
- Arabic (ar, fa, ur): ISO 233-2/3
|
||
- Hebrew (he): ISO 259-3
|
||
- Greek (el): ISO 843
|
||
- Devanagari (hi, ne) and Bengali (bn): ISO 15919
|
||
- Thai (th): ISO 11940-2
|
||
- Armenian (hy): ISO 9985
|
||
- Georgian (ka): ISO 9984
|
||
|
||
Author: GLAM Project
|
||
Created: 2025-12-08
|
||
"""
|
||
|
||
import argparse
|
||
import re
|
||
import unicodedata
|
||
from pathlib import Path
|
||
from typing import Optional, Dict, List, Tuple
|
||
|
||
# Try importing optional transliteration libraries
|
||
AVAILABLE_LIBS: Dict[str, bool] = {}
|
||
|
||
try:
|
||
from pypinyin import pinyin, Style
|
||
AVAILABLE_LIBS['pypinyin'] = True
|
||
except ImportError:
|
||
AVAILABLE_LIBS['pypinyin'] = False
|
||
|
||
try:
|
||
import pykakasi
|
||
AVAILABLE_LIBS['pykakasi'] = True
|
||
except ImportError:
|
||
AVAILABLE_LIBS['pykakasi'] = False
|
||
|
||
try:
|
||
from transliterate import translit
|
||
AVAILABLE_LIBS['transliterate'] = True
|
||
except ImportError:
|
||
AVAILABLE_LIBS['transliterate'] = False
|
||
|
||
|
||
# =============================================================================
|
||
# SCRIPT DETECTION
|
||
# =============================================================================
|
||
|
||
def detect_script(text: str) -> str:
    """
    Detect the primary script of the input text.

    Every character is classified by Unicode code point range and tallied.
    The decision is then made in this order:

    1. Any Hiragana or Katakana character -> 'japanese' (kana wins outright,
       whatever else the text contains).
    2. Otherwise the non-Latin script with the most characters; ties are
       broken by the order of ``primary_scripts`` below.
    3. Otherwise 'latin' if at least one Latin letter was seen.
    4. Otherwise 'unknown' (empty text, digits or punctuation only, ...).

    Latin letters are ASCII A-Z / a-z plus accented letters from the
    Latin-1 Supplement, Latin Extended-A/B and Latin Extended Additional
    blocks, so a string such as "Éé" is reported as 'latin'.

    Note: text written only in CJK ideographs is reported as 'chinese',
    even if it is Japanese; only kana triggers 'japanese'.

    Args:
        text: Text to classify.

    Returns one of:
    - 'latin': Latin alphabet
    - 'cyrillic': Cyrillic script
    - 'chinese': Chinese characters (Hanzi)
    - 'japanese': Japanese (mixed Kanji/Kana)
    - 'korean': Korean Hangul
    - 'arabic': Arabic script (includes Persian, Urdu)
    - 'hebrew': Hebrew script
    - 'greek': Greek script
    - 'devanagari': Devanagari (Hindi, Nepali, Sanskrit)
    - 'bengali': Bengali script
    - 'thai': Thai script
    - 'armenian': Armenian script
    - 'georgian': Georgian script
    - 'sinhala': Sinhala script
    - 'khmer': Khmer script
    - 'unknown': Cannot determine
    """
    # Inclusive code point ranges. The ranges do not overlap, so the first
    # match in the inner loop below is the only possible match.
    script_ranges = {
        'cyrillic': (0x0400, 0x04FF),
        'arabic': (0x0600, 0x06FF),
        'persian_ext': (0x0750, 0x077F),  # Arabic Supplement
        'hebrew': (0x0590, 0x05FF),
        'devanagari': (0x0900, 0x097F),
        'bengali': (0x0980, 0x09FF),
        'thai': (0x0E00, 0x0E7F),
        'greek': (0x0370, 0x03FF),
        'armenian': (0x0530, 0x058F),
        'georgian': (0x10A0, 0x10FF),
        'korean': (0xAC00, 0xD7AF),  # Hangul syllables
        'korean_jamo': (0x1100, 0x11FF),  # Hangul Jamo
        'japanese_hiragana': (0x3040, 0x309F),
        'japanese_katakana': (0x30A0, 0x30FF),
        'chinese': (0x4E00, 0x9FFF),  # CJK Unified Ideographs
        'chinese_ext': (0x3400, 0x4DBF),  # CJK Extension A
        'sinhala': (0x0D80, 0x0DFF),
        'khmer': (0x1780, 0x17FF),
    }

    script_counts = dict.fromkeys(script_ranges, 0)
    latin_count = 0

    for char in text:
        code = ord(char)

        # ASCII Latin letters.
        if ('a' <= char <= 'z') or ('A' <= char <= 'Z'):
            latin_count += 1
            continue

        # Accented Latin letters; isalpha() filters out the multiplication
        # and division signs that live inside the Latin-1 range.
        if char.isalpha() and (0x00C0 <= code <= 0x024F
                               or 0x1E00 <= code <= 0x1EFF):
            latin_count += 1
            continue

        for script, (start, end) in script_ranges.items():
            if start <= code <= end:
                script_counts[script] += 1
                break

    # Fold the auxiliary blocks into the script they belong to.
    script_counts['arabic'] += script_counts['persian_ext']
    script_counts['korean'] += script_counts['korean_jamo']
    script_counts['chinese'] += script_counts['chinese_ext']

    # Kana only occurs in Japanese, so a single kana character decides.
    if script_counts['japanese_hiragana'] > 0 or script_counts['japanese_katakana'] > 0:
        return 'japanese'

    # Most frequent non-Latin script; max() keeps the first of equal counts.
    primary_scripts = ['cyrillic', 'arabic', 'hebrew', 'devanagari', 'bengali',
                       'thai', 'greek', 'armenian', 'georgian', 'korean',
                       'chinese', 'sinhala', 'khmer']

    max_script = max(primary_scripts, key=lambda s: script_counts[s])
    if script_counts[max_script] > 0:
        return max_script

    return 'latin' if latin_count > 0 else 'unknown'
|
||
|
||
|
||
# =============================================================================
|
||
# CYRILLIC TRANSLITERATION (ISO 9:1995)
|
||
# =============================================================================
|
||
|
||
# Shared, ASCII-only romanization table for Cyrillic letters.
# Each key must appear exactly once: a repeated key in a dict literal silently
# keeps only the last value. Language-specific readings that conflict with the
# shared table live in CYRILLIC_LANG_OVERRIDES below.
CYRILLIC_MAP = {
    # Russian
    'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E',
    'Ё': 'E', 'Ж': 'Zh', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K',
    'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R',
    'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'Kh', 'Ц': 'Ts',
    'Ч': 'Ch', 'Ш': 'Sh', 'Щ': 'Shch', 'Ъ': '', 'Ы': 'Y', 'Ь': '',
    'Э': 'E', 'Ю': 'Yu', 'Я': 'Ya',
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e',
    'ё': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k',
    'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r',
    'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
    'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '',
    'э': 'e', 'ю': 'yu', 'я': 'ya',
    # Ukrainian additions (the ASCII apostrophe is dropped)
    'І': 'I', 'і': 'i', 'Ї': 'Yi', 'ї': 'yi', 'Є': 'Ye', 'є': 'ye',
    'Ґ': 'G', 'ґ': 'g', "'": '',
    # Serbian additions
    'Ђ': 'Dj', 'ђ': 'dj', 'Ј': 'J', 'ј': 'j', 'Љ': 'Lj', 'љ': 'lj',
    'Њ': 'Nj', 'њ': 'nj', 'Ћ': 'C', 'ћ': 'c', 'Џ': 'Dz', 'џ': 'dz',
    # Kazakh additions (Cyrillic-based)
    'Ә': 'A', 'ә': 'a', 'Ғ': 'Gh', 'ғ': 'gh', 'Қ': 'Q', 'қ': 'q',
    'Ң': 'Ng', 'ң': 'ng', 'Ө': 'O', 'ө': 'o', 'Ұ': 'U', 'ұ': 'u',
    'Ү': 'U', 'ү': 'u', 'Һ': 'H', 'һ': 'h',
}

# Per-language readings that differ from CYRILLIC_MAP, keyed by language code.
CYRILLIC_LANG_OVERRIDES: Dict[str, Dict[str, str]] = {
    # Bulgarian hard sign is a vowel (schwa), not a silent separator.
    'bg': {'Ъ': 'A', 'ъ': 'a'},
}


def transliterate_cyrillic(text: str, lang: str = 'ru') -> str:
    """Transliterate Cyrillic text to Latin characters.

    When the optional ``transliterate`` package was imported successfully,
    its ``translit(text, lang, reversed=True)`` result is returned. If that
    call raises, or the package is unavailable, the built-in table is used:
    each character is looked up in the overrides for ``lang`` first, then in
    CYRILLIC_MAP, and is kept unchanged when neither knows it.

    Args:
        text: Text to transliterate; may mix Cyrillic with other characters.
        lang: Language code selecting the library language pack and the
            override set (for example 'ru', 'uk', 'bg').

    Returns:
        The transliterated text.
    """
    if AVAILABLE_LIBS.get('transliterate'):
        try:
            return translit(text, lang, reversed=True)
        except Exception:
            # Deliberate best-effort: any library failure falls through to
            # the built-in table instead of aborting the caller.
            pass

    # Fallback to manual mapping
    overrides = CYRILLIC_LANG_OVERRIDES.get(lang, {})
    return ''.join(
        overrides[c] if c in overrides else CYRILLIC_MAP.get(c, c)
        for c in text
    )
|
||
|
||
|
||
# =============================================================================
|
||
# CHINESE TRANSLITERATION (Hanyu Pinyin)
|
||
# =============================================================================
|
||
|
||
# Basic Pinyin dictionary for common museum/library/archive vocabulary
|
||
# This allows basic transliteration without pypinyin library
|
||
# Toneless Hanyu Pinyin readings for single characters (traditional and
# simplified forms) that are common in museum / library / archive names.
# One reading per character; every key is listed exactly once.
CHINESE_PINYIN_MAP = {
    # Numbers
    '一': 'yi', '二': 'er', '三': 'san', '四': 'si', '五': 'wu',
    '六': 'liu', '七': 'qi', '八': 'ba', '九': 'jiu', '十': 'shi',
    '百': 'bai', '千': 'qian', '万': 'wan',

    # Heritage/Museum vocabulary
    '博': 'bo', '物': 'wu', '館': 'guan', '馆': 'guan', '院': 'yuan',
    '文': 'wen', '化': 'hua', '藝': 'yi', '艺': 'yi', '術': 'shu', '术': 'shu',
    '歷': 'li', '历': 'li', '史': 'shi', '遺': 'yi', '遗': 'yi', '產': 'chan', '产': 'chan',
    '國': 'guo', '国': 'guo', '立': 'li', '家': 'jia', '民': 'min', '族': 'zu',
    '中': 'zhong', '央': 'yang', '省': 'sheng', '市': 'shi', '縣': 'xian', '县': 'xian',
    '圖': 'tu', '图': 'tu', '書': 'shu', '书': 'shu', '檔': 'dang', '档': 'dang', '案': 'an',
    '美': 'mei', '古': 'gu', '典': 'dian', '藏': 'cang', '品': 'pin', '展': 'zhan', '覽': 'lan', '览': 'lan',
    '紀': 'ji', '纪': 'ji', '念': 'nian', '碑': 'bei', '塔': 'ta', '廟': 'miao', '庙': 'miao',
    '寺': 'si', '宮': 'gong', '宫': 'gong', '殿': 'dian', '城': 'cheng', '堡': 'bao',
    '樓': 'lou', '楼': 'lou', '閣': 'ge', '阁': 'ge', '亭': 'ting', '園': 'yuan', '园': 'yuan',
    '研': 'yan', '究': 'jiu', '所': 'suo', '心': 'xin',
    '學': 'xue', '学': 'xue', '校': 'xiao', '系': 'xi',
    '會': 'hui', '会': 'hui', '社': 'she', '團': 'tuan', '团': 'tuan',
    '東': 'dong', '东': 'dong', '西': 'xi', '南': 'nan', '北': 'bei',
    '京': 'jing', '海': 'hai', '山': 'shan', '河': 'he', '江': 'jiang',
    '大': 'da', '小': 'xiao', '新': 'xin', '老': 'lao',
    '自': 'zi', '然': 'ran', '科': 'ke', '技': 'ji',
    '巴': 'ba', '納': 'na', '纳': 'na',
    '故': 'gu',
    '基': 'ji', '金': 'jin', '銀': 'yin', '银': 'yin',
    '教': 'jiao', '育': 'yu', '傳': 'chuan', '传': 'chuan', '統': 'tong', '统': 'tong',
    '絲': 'si', '丝': 'si', '綢': 'chou', '绸': 'chou', '路': 'lu',
    '陶': 'tao', '瓷': 'ci', '玉': 'yu', '石': 'shi', '銅': 'tong', '铜': 'tong',
    '畫': 'hua', '画': 'hua', '雕': 'diao', '塑': 'su',
    '俗': 'su', '風': 'feng', '风': 'feng', '土': 'tu',
    '革': 'ge', '命': 'ming', '戰': 'zhan', '战': 'zhan', '爭': 'zheng', '争': 'zheng',
    '軍': 'jun', '军': 'jun', '事': 'shi', '航': 'hang', '空': 'kong', '天': 'tian',
    '宗': 'zong', '佛': 'fo', '道': 'dao', '儒': 'ru',
    '絃': 'xian', '弦': 'xian', '琴': 'qin', '樂': 'yue', '乐': 'yue',
    '舞': 'wu', '劇': 'ju', '剧': 'ju', '戲': 'xi', '戏': 'xi',
    '茶': 'cha', '酒': 'jiu', '食': 'shi', '餐': 'can',
    '衣': 'yi', '服': 'fu', '紡': 'fang', '纺': 'fang', '織': 'zhi', '织': 'zhi',
    '建': 'jian', '築': 'zhu', '筑': 'zhu', '房': 'fang', '屋': 'wu',
    '水': 'shui', '電': 'dian', '电': 'dian', '火': 'huo', '木': 'mu',
    '農': 'nong', '农': 'nong', '業': 'ye', '业': 'ye', '工': 'gong', '商': 'shang',
    '醫': 'yi', '医': 'yi', '藥': 'yao', '药': 'yao',
    '人': 'ren', '生': 'sheng', '活': 'huo',
    '和': 'he', '平': 'ping', '友': 'you', '誼': 'yi', '谊': 'yi',
    '港': 'gang', '澳': 'ao', '台': 'tai', '灣': 'wan', '湾': 'wan',
    '華': 'hua', '华': 'hua', '僑': 'qiao', '侨': 'qiao',
    '外': 'wai', '交': 'jiao', '流': 'liu',
    '保': 'bao', '護': 'hu', '护': 'hu', '修': 'xiu', '復': 'fu', '复': 'fu',
    '鑒': 'jian', '鉴': 'jian', '定': 'ding', '評': 'ping', '评': 'ping', '估': 'gu',
}
|
||
|
||
def transliterate_chinese(text: str) -> str:
    """Romanize Chinese text as toneless Pinyin.

    With pypinyin available, each item returned by ``pinyin()`` is joined
    into one token and the tokens are separated by single spaces.

    Without it, characters are converted one by one through
    CHINESE_PINYIN_MAP and concatenated without separators. Spaces and
    alphanumeric characters the table does not know are kept unchanged;
    everything else is dropped. If nothing at all is kept, the sentinel
    '[REQUIRES_PYPINYIN]' is returned.
    """
    if not AVAILABLE_LIBS.get('pypinyin'):
        # NOTE(review): str.isalnum() is true for CJK ideographs, so an
        # ideograph missing from the table is passed through as-is and the
        # sentinel is only produced when no character survives the filter.
        pieces = [
            CHINESE_PINYIN_MAP.get(ch, ch)
            for ch in text
            if ch in CHINESE_PINYIN_MAP or ch == ' ' or ch.isalnum()
        ]
        return ''.join(pieces) if pieces else '[REQUIRES_PYPINYIN]'

    syllables = pinyin(text, style=Style.NORMAL)
    return ' '.join(''.join(readings) for readings in syllables)
|
||
|
||
|
||
# =============================================================================
|
||
# JAPANESE TRANSLITERATION (Modified Hepburn)
|
||
# =============================================================================
|
||
|
||
# Basic Kanji/Kana to Romaji map for common heritage vocabulary
|
||
# This allows basic transliteration without pykakasi library
|
||
# One fixed reading per Kanji for common heritage vocabulary.
_JAPANESE_KANJI_ROMAJI = {
    '博': 'haku', '物': 'butsu', '館': 'kan', '院': 'in',
    '文': 'bun', '化': 'ka', '藝': 'gei', '術': 'jutsu',
    '歷': 'reki', '史': 'shi', '遺': 'i', '產': 'san',
    '國': 'koku', '国': 'koku', '立': 'ritsu', '家': 'ka',
    '民': 'min', '族': 'zoku', '中': 'chuu', '央': 'ou',
    '圖': 'to', '図': 'to', '書': 'sho', '檔': 'tou', '案': 'an',
    '美': 'bi', '古': 'ko', '典': 'ten', '藏': 'zou', '品': 'hin',
    '展': 'ten', '覽': 'ran', '紀': 'ki', '念': 'nen',
    '寺': 'ji', '宮': 'kyuu', '殿': 'den', '城': 'jou', '堡': 'hou',
    '樓': 'rou', '閣': 'kaku', '亭': 'tei', '園': 'en',
    '研': 'ken', '究': 'kyuu', '所': 'sho', '心': 'shin',
    '學': 'gaku', '学': 'gaku', '校': 'kou', '系': 'kei',
    '會': 'kai', '会': 'kai', '社': 'sha', '團': 'dan',
    '東': 'tou', '西': 'sei', '南': 'nan', '北': 'hoku',
    '京': 'kyou', '都': 'to', '海': 'kai', '山': 'zan', '河': 'ka', '川': 'kawa',
    '大': 'dai', '小': 'shou', '新': 'shin', '老': 'rou',
    '自': 'ji', '然': 'nen', '科': 'ka', '技': 'gi',
    '故': 'ko', '金': 'kin', '銀': 'gin',
    '教': 'kyou', '育': 'iku', '傳': 'den', '統': 'tou',
    '陶': 'tou', '瓷': 'ji', '玉': 'gyoku', '石': 'seki', '銅': 'dou',
    '畫': 'ga', '画': 'ga', '雕': 'chou', '塑': 'so',
    '俗': 'zoku', '風': 'fuu', '土': 'do',
    '革': 'kaku', '命': 'mei', '戰': 'sen', '爭': 'sou',
    '軍': 'gun', '事': 'ji', '航': 'kou', '空': 'kuu', '天': 'ten',
    '宗': 'shuu', '佛': 'butsu', '道': 'dou', '儒': 'ju',
    '琴': 'kin', '樂': 'gaku', '舞': 'bu', '劇': 'geki', '戲': 'gi',
    '茶': 'cha', '酒': 'shu', '食': 'shoku', '餐': 'san',
    '衣': 'i', '服': 'fuku', '紡': 'bou', '織': 'shoku',
    '建': 'ken', '築': 'chiku', '房': 'bou', '屋': 'oku',
    '水': 'sui', '電': 'den', '火': 'ka', '木': 'moku',
    '農': 'nou', '業': 'gyou', '工': 'kou', '商': 'shou',
    '醫': 'i', '藥': 'yaku', '人': 'jin', '生': 'sei', '活': 'katsu',
    '和': 'wa', '平': 'hei', '友': 'yuu', '誼': 'gi',
    '港': 'kou', '灣': 'wan', '華': 'ka', '僑': 'kyou',
    '外': 'gai', '交': 'kou', '流': 'ryuu',
    '保': 'ho', '護': 'go', '修': 'shuu', '復': 'fuku',
    '鑒': 'kan', '定': 'tei', '評': 'hyou', '估': 'ko',
    '記': 'ki', '録': 'roku', '資': 'shi', '料': 'ryou',
}

# Basic (unvoiced) Hiragana syllabary.
_JAPANESE_HIRAGANA_ROMAJI = {
    'あ': 'a', 'い': 'i', 'う': 'u', 'え': 'e', 'お': 'o',
    'か': 'ka', 'き': 'ki', 'く': 'ku', 'け': 'ke', 'こ': 'ko',
    'さ': 'sa', 'し': 'shi', 'す': 'su', 'せ': 'se', 'そ': 'so',
    'た': 'ta', 'ち': 'chi', 'つ': 'tsu', 'て': 'te', 'と': 'to',
    'な': 'na', 'に': 'ni', 'ぬ': 'nu', 'ね': 'ne', 'の': 'no',
    'は': 'ha', 'ひ': 'hi', 'ふ': 'fu', 'へ': 'he', 'ほ': 'ho',
    'ま': 'ma', 'み': 'mi', 'む': 'mu', 'め': 'me', 'も': 'mo',
    'や': 'ya', 'ゆ': 'yu', 'よ': 'yo',
    'ら': 'ra', 'り': 'ri', 'る': 'ru', 'れ': 're', 'ろ': 'ro',
    'わ': 'wa', 'を': 'wo', 'ん': 'n',
}

# Each Katakana letter sits exactly 0x60 code points above its Hiragana
# counterpart and shares its reading, so the Katakana rows are derived.
_JAPANESE_KATAKANA_ROMAJI = {
    chr(ord(kana) + 0x60): romaji
    for kana, romaji in _JAPANESE_HIRAGANA_ROMAJI.items()
}

# Kanji/Kana to Romaji table used when pykakasi is unavailable.
# Entry order: Kanji, then Hiragana, then Katakana.
JAPANESE_ROMAJI_MAP = {
    **_JAPANESE_KANJI_ROMAJI,
    **_JAPANESE_HIRAGANA_ROMAJI,
    **_JAPANESE_KATAKANA_ROMAJI,
}
|
||
|
||
|
||
def transliterate_japanese(text: str) -> str:
    """Romanize Japanese text.

    With pykakasi available, the 'hepburn' field of every segment returned
    by ``kakasi().convert()`` is taken and the segments are separated by
    single spaces.

    Without it, characters are converted one by one through
    JAPANESE_ROMAJI_MAP and concatenated without separators. Spaces and
    alphanumeric characters the table does not know are kept unchanged;
    everything else is dropped. If nothing at all is kept, the sentinel
    '[REQUIRES_PYKAKASI]' is returned.
    """
    if not AVAILABLE_LIBS.get('pykakasi'):
        # NOTE(review): str.isalnum() is true for Kanji and Kana, so a
        # character missing from the table is passed through as-is.
        romaji = [
            JAPANESE_ROMAJI_MAP.get(ch, ch)
            for ch in text
            if ch in JAPANESE_ROMAJI_MAP or ch == ' ' or ch.isalnum()
        ]
        return ''.join(romaji) if romaji else '[REQUIRES_PYKAKASI]'

    converter = pykakasi.kakasi()
    segments = converter.convert(text)
    return ' '.join(segment['hepburn'] for segment in segments)
|
||
|
||
|
||
# =============================================================================
|
||
# KOREAN TRANSLITERATION (Revised Romanization)
|
||
# =============================================================================
|
||
|
||
# Basic Hangul syllable decomposition tables
|
||
# Romanization of the 19 initial consonants, indexed by their position in the
# Unicode Hangul syllable composition formula.
HANGUL_INITIALS = [
    'g', 'kk', 'n', 'd', 'tt', 'r', 'm', 'b', 'pp', 's', 'ss', '',
    'j', 'jj', 'ch', 'k', 't', 'p', 'h'
]

# Romanization of the 21 medial vowels.
HANGUL_MEDIALS = [
    'a', 'ae', 'ya', 'yae', 'eo', 'e', 'yeo', 'ye', 'o', 'wa', 'wae',
    'oe', 'yo', 'u', 'wo', 'we', 'wi', 'yu', 'eu', 'ui', 'i'
]

# Romanization of the 28 final positions (index 0 = no final consonant).
# Finals are written as pronounced at the end of a syllable: obstruents
# collapse to k / t / p, and clusters keep the consonant that is sounded.
HANGUL_FINALS = [
    '',                                      # (no final)
    'k', 'k', 'k',                           # ㄱ ㄲ ㄳ
    'n', 'n', 'n',                           # ㄴ ㄵ ㄶ
    't',                                     # ㄷ
    'l', 'k', 'm', 'l', 'l', 'l', 'p', 'l',  # ㄹ ㄺ ㄻ ㄼ ㄽ ㄾ ㄿ ㅀ
    'm',                                     # ㅁ
    'p', 'p',                                # ㅂ ㅄ
    't', 't',                                # ㅅ ㅆ
    'ng',                                    # ㅇ
    't', 't', 'k', 't', 'p', 't',            # ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ
]


def transliterate_korean(text: str) -> str:
    """Transliterate Korean Hangul to Latin characters.

    Uses ``korean_romanizer`` when it can be imported. Otherwise every
    precomposed Hangul syllable (U+AC00..U+D7A3) is decomposed
    arithmetically into initial / medial / final and each part is looked up
    in the HANGUL_* tables. All other characters are kept unchanged.

    The fallback works syllable by syllable, so sound changes across
    syllable boundaries are not applied.

    Args:
        text: Text to transliterate.

    Returns:
        The romanized text.
    """
    try:
        from korean_romanizer.romanizer import Romanizer
        r = Romanizer(text)
        return r.romanize()
    except ImportError:
        pass

    # Fallback: basic syllable decomposition
    medial_count = len(HANGUL_MEDIALS)
    final_count = len(HANGUL_FINALS)
    # 19 * 21 * 28 = 11172 syllables, so the last valid one is U+D7A3.
    # Code points just above it are unassigned and must not be indexed.
    syllable_count = len(HANGUL_INITIALS) * medial_count * final_count

    result = []
    for char in text:
        offset = ord(char) - 0xAC00
        if 0 <= offset < syllable_count:
            initial, rest = divmod(offset, medial_count * final_count)
            medial, final = divmod(rest, final_count)
            result.append(
                HANGUL_INITIALS[initial]
                + HANGUL_MEDIALS[medial]
                + HANGUL_FINALS[final]  # index 0 contributes ''
            )
        else:
            result.append(char)

    return ''.join(result)
|
||
|
||
|
||
# =============================================================================
|
||
# ARABIC TRANSLITERATION (ISO 233-2)
|
||
# =============================================================================
|
||
|
||
# Letters of the basic Arabic alphabet, including hamza carriers.
_ARABIC_BASE_LETTERS = {
    'ا': 'a', 'أ': 'a', 'إ': 'i', 'آ': 'a', 'ء': "'",
    'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j',
    'ح': 'h', 'خ': 'kh', 'د': 'd', 'ذ': 'dh',
    'ر': 'r', 'ز': 'z', 'س': 's', 'ش': 'sh',
    'ص': 's', 'ض': 'd', 'ط': 't', 'ظ': 'z',
    'ع': "'", 'غ': 'gh', 'ف': 'f', 'ق': 'q',
    'ك': 'k', 'ل': 'l', 'م': 'm', 'ن': 'n',
    'ه': 'h', 'و': 'w', 'ي': 'y', 'ى': 'a',
    'ة': 'a',
}

# Additional letters used by Persian.
_ARABIC_PERSIAN_LETTERS = {
    'پ': 'p', 'چ': 'ch', 'ژ': 'zh', 'گ': 'g',
    'ک': 'k', 'ی': 'i',
}

# Additional letters used by Urdu.
_ARABIC_URDU_LETTERS = {
    'ٹ': 't', 'ڈ': 'd', 'ڑ': 'r', 'ں': 'n',
}

# Vowel marks, nunation, and shadda (gemination is simplified away).
_ARABIC_DIACRITICS = {
    'َ': 'a', 'ِ': 'i', 'ُ': 'u',
    'ً': 'an', 'ٍ': 'in', 'ٌ': 'un',
    'ّ': '',
}

# Character-level romanization table for Arabic-script text, without
# diacritics in the output.
ARABIC_MAP = {
    **_ARABIC_BASE_LETTERS,
    **_ARABIC_PERSIAN_LETTERS,
    **_ARABIC_URDU_LETTERS,
    **_ARABIC_DIACRITICS,
}
|
||
|
||
|
||
def transliterate_arabic(text: str) -> str:
    """Romanize Arabic-script text character by character.

    Characters found in ARABIC_MAP are replaced by their mapped value.
    Spaces and any other alphanumeric characters are kept unchanged, the
    zero-width non-joiner (U+200C, used in Persian) becomes a hyphen, and
    everything else is dropped.
    """
    def romanize(ch: str) -> str:
        mapped = ARABIC_MAP.get(ch)
        if mapped is not None:
            return mapped
        if ch == ' ' or ch.isalnum():
            return ch
        # Zero-width non-joiner (Persian) -> hyphen; anything else vanishes.
        return '-' if ch == '\u200c' else ''

    return ''.join(map(romanize, text))
|
||
|
||
|
||
# =============================================================================
|
||
# HEBREW TRANSLITERATION (ISO 259-3)
|
||
# =============================================================================
|
||
|
||
# Simplified romanization table for Hebrew letters.
# Aleph and ayin are dropped. Bet, kaf and pe default to their soft readings
# (v, k, f); the dagesh entries give the hard readings and are keyed both as
# letter + U+05BC and as the precomposed presentation forms.
HEBREW_MAP = {
    'א': '', 'ב': 'v', 'ג': 'g', 'ד': 'd', 'ה': 'h',
    'ו': 'v', 'ז': 'z', 'ח': 'ch', 'ט': 't', 'י': 'y',
    'כ': 'k', 'ך': 'k', 'ל': 'l', 'מ': 'm', 'ם': 'm',
    'נ': 'n', 'ן': 'n', 'ס': 's', 'ע': '', 'פ': 'f',
    'ף': 'f', 'צ': 'ts', 'ץ': 'ts', 'ק': 'k', 'ר': 'r',
    'ש': 'sh', 'ת': 't',
    # With dagesh (letter followed by U+05BC)
    'ב\u05bc': 'b', 'כ\u05bc': 'k', 'פ\u05bc': 'p',
    # With dagesh (precomposed presentation forms)
    '\ufb31': 'b', '\ufb3b': 'k', '\ufb44': 'p',
}

_HEBREW_DAGESH = '\u05bc'


def transliterate_hebrew(text: str) -> str:
    """Transliterate Hebrew to Latin (ISO 259-3 simplified).

    The text is processed one base character at a time together with the
    combining marks attached to it. If those marks include a dagesh and
    HEBREW_MAP has an entry for "letter + dagesh", that entry is used;
    otherwise the plain letter is looked up. Spaces and other alphanumeric
    characters are kept unchanged. Combining marks themselves and all other
    characters produce no output.

    Args:
        text: Text to transliterate; pointed or unpointed.

    Returns:
        The romanized text.
    """
    result = []
    i, length = 0, len(text)
    while i < length:
        base = text[i]

        # Find the end of the run of combining marks that follows the base.
        # A dagesh is not necessarily the first mark after its letter.
        j = i + 1
        while j < length and unicodedata.combining(text[j]):
            j += 1

        hard_key = base + _HEBREW_DAGESH
        if _HEBREW_DAGESH in text[i + 1:j] and hard_key in HEBREW_MAP:
            result.append(HEBREW_MAP[hard_key])
        elif base in HEBREW_MAP:
            result.append(HEBREW_MAP[base])
        elif base == ' ' or base.isalnum():
            result.append(base)

        i = j
    return ''.join(result)
|
||
|
||
|
||
# =============================================================================
|
||
# GREEK TRANSLITERATION (ISO 843)
|
||
# =============================================================================
|
||
|
||
GREEK_MAP = {
|
||
'Α': 'A', 'α': 'a', 'Β': 'V', 'β': 'v', 'Γ': 'G', 'γ': 'g',
|
||
'Δ': 'D', 'δ': 'd', 'Ε': 'E', 'ε': 'e', 'Ζ': 'Z', 'ζ': 'z',
|
||
'Η': 'I', 'η': 'i', 'Θ': 'Th', 'θ': 'th', 'Ι': 'I', 'ι': 'i',
|
||
'Κ': 'K', 'κ': 'k', 'Λ': 'L', 'λ': 'l', 'Μ': 'M', 'μ': 'm',
|
||
'Ν': 'N', 'ν': 'n', 'Ξ': 'X', 'ξ': 'x', 'Ο': 'O', 'ο': 'o',
|
||
'Π': 'P', 'π': 'p', 'Ρ': 'R', 'ρ': 'r', 'Σ': 'S', 'σ': 's',
|
||
'ς': 's', 'Τ': 'T', 'τ': 't', 'Υ': 'Y', 'υ': 'y', 'Φ': 'F',
|
||
'φ': 'f', 'Χ': 'Ch', 'χ': 'ch', 'Ψ': 'Ps', 'ψ': 'ps',
|
||
'Ω': 'O', 'ω': 'o',
|
||
# With accents
|
||
'Ά': 'A', 'ά': 'a', 'Έ': 'E', 'έ': 'e', 'Ή': 'I', 'ή': 'i',
|
||
'Ί': 'I', 'ί': 'i', 'Ό': 'O', 'ό': 'o', 'Ύ': 'Y', 'ύ': 'y',
|
||
'Ώ': 'O', 'ώ': 'o', 'ϊ': 'i', 'ϋ': 'y', 'ΐ': 'i', 'ΰ': 'y',
|
||
}
|
||
|
||
|
||
def transliterate_greek(text: str) -> str:
    """Romanize Greek text one character at a time via GREEK_MAP.

    Characters missing from the table are copied to the output unchanged.
    """
    romanized = []
    for ch in text:
        romanized.append(GREEK_MAP[ch] if ch in GREEK_MAP else ch)
    return ''.join(romanized)
|
||
|
||
|
||
# =============================================================================
|
||
# DEVANAGARI TRANSLITERATION (ISO 15919)
|
||
# =============================================================================
|
||
|
||
# Simplified, ASCII-only romanization table for Devanagari.
# Consonants carry their inherent vowel ("ka"); transliterate_devanagari()
# strips that vowel again when a vowel sign or virama follows.
DEVANAGARI_MAP = {
    # Vowels
    'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ii', 'उ': 'u', 'ऊ': 'uu',
    'ऋ': 'ri', 'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au',
    # Vowel marks (dependent signs U+093E..U+094C)
    '\u093e': 'a', '\u093f': 'i', '\u0940': 'i', '\u0941': 'u', '\u0942': 'u',
    '\u0947': 'e', '\u0948': 'ai', '\u094b': 'o', '\u094c': 'au',
    '\u0902': 'm',  # Anusvara
    '\u0903': 'h',  # Visarga
    # Consonants
    'क': 'ka', 'ख': 'kha', 'ग': 'ga', 'घ': 'gha', 'ङ': 'nga',
    'च': 'cha', 'छ': 'chha', 'ज': 'ja', 'झ': 'jha', 'ञ': 'nya',
    'ट': 'ta', 'ठ': 'tha', 'ड': 'da', 'ढ': 'dha', 'ण': 'na',
    'त': 'ta', 'थ': 'tha', 'द': 'da', 'ध': 'dha', 'न': 'na',
    'प': 'pa', 'फ': 'pha', 'ब': 'ba', 'भ': 'bha', 'म': 'ma',
    'य': 'ya', 'र': 'ra', 'ल': 'la', 'व': 'va',
    'श': 'sha', 'ष': 'sha', 'स': 'sa', 'ह': 'ha',
    '\u094d': '',  # Virama (removes inherent 'a')
    # Hindi-specific nukta letters, as consonant + U+093C ...
    'ड\u093c': 'da', 'ढ\u093c': 'dha', 'क\u093c': 'qa', 'ख\u093c': 'kha',
    'ग\u093c': 'gha', 'ज\u093c': 'za', 'फ\u093c': 'fa',
    # ... and as precomposed code points U+0958..U+095E
    '\u095c': 'da', '\u095d': 'dha', '\u0958': 'qa', '\u0959': 'kha',
    '\u095a': 'gha', '\u095b': 'za', '\u095e': 'fa',
}

_DEVANAGARI_NUKTA = '\u093c'
_DEVANAGARI_VIRAMA = '\u094d'


def _is_devanagari_consonant(char: str) -> bool:
    """Return True for Devanagari consonant letters (plain or precomposed nukta)."""
    return '\u0915' <= char <= '\u0939' or '\u0958' <= char <= '\u095f'


def _cancels_devanagari_inherent_vowel(char: str) -> bool:
    """Return True for a virama or a dependent vowel sign."""
    return char == _DEVANAGARI_VIRAMA or '\u093e' <= char <= '\u094c'


def transliterate_devanagari(text: str) -> str:
    """Transliterate Devanagari to Latin (ISO 15919 simplified).

    Uses ``indic_transliteration`` (Devanagari -> IAST) when it can be
    imported. Otherwise DEVANAGARI_MAP is applied character by character
    with two script rules:

    * a consonant followed by a mapped vowel sign or a virama loses its
      inherent "a", so ka + i-sign gives "ki" rather than "kai";
    * a consonant followed by a nukta (U+093C) is looked up as the combined
      letter when the table has it, otherwise as the plain consonant.

    Spaces and other alphanumeric characters are kept unchanged; everything
    else is dropped.

    Args:
        text: Text to transliterate.

    Returns:
        The romanized text.
    """
    try:
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate as indic_translit
        return indic_translit(text, sanscript.DEVANAGARI, sanscript.IAST)
    except ImportError:
        pass

    # Fallback: basic mapping
    result = []
    # True while the last emitted token is a consonant still carrying "a".
    has_inherent_a = False
    i, length = 0, len(text)
    while i < length:
        c = text[i]
        i += 1

        if _is_devanagari_consonant(c):
            key = c
            if i < length and text[i] == _DEVANAGARI_NUKTA:
                i += 1  # the nukta is consumed together with its consonant
                if c + _DEVANAGARI_NUKTA in DEVANAGARI_MAP:
                    key = c + _DEVANAGARI_NUKTA
            if key in DEVANAGARI_MAP:
                token = DEVANAGARI_MAP[key]
                result.append(token)
                has_inherent_a = token.endswith('a')
            else:
                # Unmapped consonant: passed through like any other letter.
                result.append(c)
                has_inherent_a = False
            continue

        if c in DEVANAGARI_MAP:
            if has_inherent_a and _cancels_devanagari_inherent_vowel(c):
                result[-1] = result[-1][:-1]
            result.append(DEVANAGARI_MAP[c])
        elif c == ' ':
            result.append(' ')
        elif c.isalnum():
            result.append(c)
        has_inherent_a = False
    return ''.join(result)
|
||
|
||
|
||
# =============================================================================
|
||
# THAI TRANSLITERATION (ISO 11940-2 / Royal Thai General System)
|
||
# =============================================================================
|
||
|
||
# Thai consonants with RTGS romanization
|
||
# Note: Thai consonants have inherent vowel 'o' or 'a' depending on syllable structure
|
||
# Thai consonants grouped by shared romanization, in alphabet order.
# Each entry is (letters, latin); every letter in `letters` maps to `latin`.
_THAI_CONSONANT_GROUPS = [
    ('ก', 'k'),
    ('ขฃคฅฆ', 'kh'),
    ('ง', 'ng'),
    ('จฉช', 'ch'),
    ('ซ', 's'),
    ('ฌ', 'ch'),
    ('ญ', 'y'),  # Initial: y, Final: n
    ('ฎ', 'd'),
    ('ฏ', 't'),
    ('ฐฑฒ', 'th'),
    ('ณ', 'n'),
    ('ด', 'd'),
    ('ต', 't'),
    ('ถทธ', 'th'),
    ('น', 'n'),
    ('บ', 'b'),
    ('ป', 'p'),
    ('ผ', 'ph'),
    ('ฝ', 'f'),
    ('พ', 'ph'),
    ('ฟ', 'f'),
    ('ภ', 'ph'),
    ('ม', 'm'),
    ('ย', 'y'),
    ('ร', 'r'),
    ('ล', 'l'),
    ('ว', 'w'),
    ('ศษส', 's'),
    ('ห', 'h'),
    ('ฬ', 'l'),
    ('อ', ''),  # อ is silent initial
    ('ฮ', 'h'),
]

# Initial-position romanization for every Thai consonant letter
# (high, mid and low class alike).
THAI_CONSONANTS = {
    letter: latin
    for letters, latin in _THAI_CONSONANT_GROUPS
    for letter in letters
}
|
||
|
||
# Thai vowels (can appear before, after, above, or below consonants)
|
||
# Romanization of individual Thai vowel signs. Each sign is mapped on its
# own; vowel combinations are not resolved by this table.
THAI_VOWELS = dict([
    # Following vowels
    ('ะ', 'a'),
    ('า', 'a'),
    ('ำ', 'am'),
    # Vowels written above the consonant
    ('ิ', 'i'),
    ('ี', 'i'),
    ('ึ', 'ue'),
    ('ื', 'ue'),
    # Vowels written below the consonant
    ('ุ', 'u'),
    ('ู', 'u'),
    # Leading vowels (written before the consonant they follow in speech)
    ('เ', 'e'),
    ('แ', 'ae'),
    ('โ', 'o'),
    ('ใ', 'ai'),
    ('ไ', 'ai'),
    # Lakkhangyao (rare)
    ('ๅ', 'a'),
])
|
||
|
||
# Thai tone marks (don't affect RTGS romanization - just skip)
|
||
# Thai tone marks (mai ek, mai tho, mai tri, mai chattawa). They do not
# affect RTGS romanization and are skipped during transliteration.
THAI_TONE_MARKS = {'่', '้', '๊', '๋'}

# Thai special characters. Each key is listed exactly once.
THAI_SPECIAL = {
    '็': '',   # Maitaikhu (shortens vowel)
    '์': '',   # Thanthakhat (silent letter marker)
    'ๆ': '',   # Maiyamok (repeat previous)
    '฿': 'B',  # Baht symbol
    'ฯ': '',   # Paiyannoi (abbreviation)
    '๏': '',   # Fongman (obsolete)
    'ํ': 'm',  # Nikhahit (nasalization, often 'm')
    'ฺ': '',   # Phinthu (Sanskrit virama)
}

# Thai numerals mapped to ASCII digits.
THAI_NUMERALS = {
    '๐': '0', '๑': '1', '๒': '2', '๓': '3', '๔': '4',
    '๕': '5', '๖': '6', '๗': '7', '๘': '8', '๙': '9',
}
|
||
|
||
# Common Thai heritage vocabulary - direct mappings for accuracy
|
||
# These handle complex syllable combinations correctly
|
||
# Common institutional terms
_THAI_INSTITUTIONAL_TERMS = {
    'สำนัก': 'samnak',
    'หอจดหมายเหตุ': 'ho chotmaihet',
    'หอสมุด': 'ho samut',
    'แห่งชาติ': 'haeng chat',
    'พิพิธภัณฑ': 'phiphitthaphan',
    'พิพิธภัณฑสถาน': 'phiphitthaphanthasathan',
    'พระนคร': 'phra nakhon',
    'สยาม': 'sayam',
    'สมาคม': 'samakhom',
    'ใน': 'nai',
    'พระบรมราชูปถัมภ์': 'phra borommarachuppatham',
    'พระที่นั่ง': 'phra thi nang',
    'ศิวโมกข': 'siwamok',
    'พิมาน': 'phiman',
    'วัด': 'wat',
    'โพธิ์': 'pho',
    'ราม': 'ram',
}

# Geographic terms
_THAI_GEOGRAPHIC_TERMS = {
    'กรุงเทพ': 'krung thep',
    'กรุงเทพมหานคร': 'krung thep maha nakhon',
    'เชียงใหม่': 'chiang mai',
    'ภูเก็ต': 'phuket',
}

# Institution types
_THAI_INSTITUTION_TYPES = {
    'มหาวิทยาลัย': 'mahawitthayalai',
    'ศูนย์': 'sun',
    'สถาบัน': 'sathaban',
    'องค์กร': 'ongkon',
    'กรม': 'krom',
    'กระทรวง': 'krasuang',
}

# Cultural terms
_THAI_CULTURAL_TERMS = {
    'วัฒนธรรม': 'watthanatham',
    'ศิลปะ': 'sinlapa',
    'ประวัติศาสตร์': 'prawattisat',
    'โบราณ': 'boran',
    'มรดก': 'moradok',
}

# Whole-word romanizations for common Thai heritage vocabulary, merged in
# the order: institutional, geographic, institution types, cultural.
THAI_HERITAGE_VOCAB = {
    **_THAI_INSTITUTIONAL_TERMS,
    **_THAI_GEOGRAPHIC_TERMS,
    **_THAI_INSTITUTION_TYPES,
    **_THAI_CULTURAL_TERMS,
}
|
||
|
||
|
||
def transliterate_thai(text: str) -> str:
    """Romanize Thai text to Latin characters.

    Delegates to pythainlp's ``romanize(text, engine='royin')`` when that
    import succeeds. Otherwise a two-pass fallback runs:

    1. Every THAI_HERITAGE_VOCAB entry found in the text is replaced by its
       romanization padded with spaces, longest entries first.
    2. Each remaining character is handled individually: ASCII is kept,
       consonants and vowels are mapped through their tables, tone marks
       are skipped, special characters and numerals are mapped, and any
       other character is kept as is.

    Runs of whitespace in the fallback result are collapsed to one space
    and leading/trailing whitespace is removed.
    """
    try:
        from pythainlp.transliterate import romanize
        return romanize(text, engine='royin')  # Royal Institute standard
    except ImportError:
        pass

    # Pass 1: whole-word vocabulary, longest Thai term first. sorted() is
    # stable, so terms of equal length keep their table order.
    working = text
    for thai_term in sorted(THAI_HERITAGE_VOCAB, key=len, reverse=True):
        working = working.replace(thai_term, f' {THAI_HERITAGE_VOCAB[thai_term]} ')

    # Pass 2: character-level mapping for whatever is still Thai.
    pieces = []
    for ch in working:
        if ch.isascii():
            # Already Latin (including text inserted by pass 1).
            pieces.append(ch)
            continue

        romanized = THAI_CONSONANTS.get(ch, THAI_VOWELS.get(ch))
        if romanized is None:
            if ch in THAI_TONE_MARKS:
                continue
            # Special characters, then numerals; unknown characters stay.
            romanized = THAI_SPECIAL.get(ch, THAI_NUMERALS.get(ch, ch))
        pieces.append(romanized)

    # Normalize whitespace
    return ' '.join(''.join(pieces).split())
|
||
|
||
|
||
# =============================================================================
|
||
# ARMENIAN TRANSLITERATION (ISO 9985)
|
||
# =============================================================================
|
||
|
||
# Simplified, ASCII-only romanization table for the 38 Armenian letters
# (upper and lower case), listed in alphabet order.
ARMENIAN_MAP = {
    'Ա': 'A', 'ա': 'a', 'Բ': 'B', 'բ': 'b', 'Գ': 'G', 'գ': 'g',
    'Դ': 'D', 'դ': 'd', 'Ե': 'E', 'ե': 'e', 'Զ': 'Z', 'զ': 'z',
    'Է': 'E', 'է': 'e', 'Ը': 'Y', 'ը': 'y', 'Թ': 'T', 'թ': 't',
    'Ժ': 'Zh', 'ժ': 'zh', 'Ի': 'I', 'ի': 'i', 'Լ': 'L', 'լ': 'l',
    'Խ': 'Kh', 'խ': 'kh', 'Ծ': 'Ts', 'ծ': 'ts', 'Կ': 'K', 'կ': 'k',
    'Հ': 'H', 'հ': 'h', 'Ձ': 'Dz', 'ձ': 'dz', 'Ղ': 'Gh', 'ղ': 'gh',
    'Ճ': 'Ch', 'ճ': 'ch', 'Մ': 'M', 'մ': 'm', 'Յ': 'Y', 'յ': 'y',
    'Ն': 'N', 'ն': 'n', 'Շ': 'Sh', 'շ': 'sh', 'Ո': 'O', 'ո': 'o',
    'Չ': 'Ch', 'չ': 'ch', 'Պ': 'P', 'պ': 'p', 'Ջ': 'J', 'ջ': 'j',
    'Ռ': 'R', 'ռ': 'r', 'Ս': 'S', 'ս': 's', 'Վ': 'V', 'վ': 'v',
    'Տ': 'T', 'տ': 't', 'Ր': 'R', 'ր': 'r', 'Ց': 'Ts', 'ց': 'ts',
    'Ւ': 'W', 'ւ': 'w', 'Փ': 'P', 'փ': 'p', 'Ք': 'K', 'ք': 'k',
    'Օ': 'O', 'օ': 'o', 'Ֆ': 'F', 'ֆ': 'f',
}
|
||
|
||
|
||
def transliterate_armenian(text: str) -> str:
    """Romanize Armenian text one character at a time via ARMENIAN_MAP.

    Characters missing from the table are copied to the output unchanged.
    """
    def romanize(ch: str) -> str:
        return ARMENIAN_MAP.get(ch, ch)

    return ''.join(map(romanize, text))
|
||
|
||
|
||
# =============================================================================
|
||
# GEORGIAN TRANSLITERATION (ISO 9984)
|
||
# =============================================================================
|
||
|
||
# The 33 letters of the modern Georgian (Mkhedruli) alphabet in order ...
_GEORGIAN_LETTERS = 'აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ'

# ... and their simplified Latin equivalents, position for position.
_GEORGIAN_LATIN = (
    'a b g d e v '
    'z t i k l m '
    'n o p zh r s '
    't u p k gh q '
    'sh ch ts dz ts ch '
    'kh j h'
).split()

# Simplified, ASCII-only romanization table for Georgian.
GEORGIAN_MAP = dict(zip(_GEORGIAN_LETTERS, _GEORGIAN_LATIN))
|
||
|
||
|
||
def transliterate_georgian(text: str) -> str:
    """Romanize Georgian text one character at a time via GEORGIAN_MAP.

    Characters missing from the table are copied to the output unchanged.
    """
    pieces = [GEORGIAN_MAP[ch] if ch in GEORGIAN_MAP else ch for ch in text]
    return ''.join(pieces)
|
||
|
||
|
||
# =============================================================================
|
||
# BENGALI TRANSLITERATION (ISO 15919)
|
||
# =============================================================================
|
||
|
||
BENGALI_MAP = {
|
||
# Vowels
|
||
'অ': 'a', 'আ': 'aa', 'ই': 'i', 'ঈ': 'ii', 'উ': 'u', 'ঊ': 'uu',
|
||
'এ': 'e', 'ঐ': 'ai', 'ও': 'o', 'ঔ': 'au',
|
||
# Consonants
|
||
'ক': 'ka', 'খ': 'kha', 'গ': 'ga', 'ঘ': 'gha', 'ঙ': 'nga',
|
||
'চ': 'cha', 'ছ': 'chha', 'জ': 'ja', 'ঝ': 'jha', 'ঞ': 'nya',
|
||
'ট': 'ta', 'ঠ': 'tha', 'ড': 'da', 'ঢ': 'dha', 'ণ': 'na',
|
||
'ত': 'ta', 'থ': 'tha', 'দ': 'da', 'ধ': 'dha', 'ন': 'na',
|
||
'প': 'pa', 'ফ': 'pha', 'ব': 'ba', 'ভ': 'bha', 'ম': 'ma',
|
||
'য': 'ya', 'র': 'ra', 'ল': 'la', 'শ': 'sha', 'ষ': 'sha',
|
||
'স': 'sa', 'হ': 'ha', 'ড়': 'ra', 'ঢ়': 'rha', 'য়': 'ya',
|
||
'়': '', # Nukta
|
||
'্': '', # Virama
|
||
# Vowel marks
|
||
'া': 'a', 'ি': 'i', 'ী': 'i', 'ু': 'u', 'ূ': 'u',
|
||
'ে': 'e', 'ৈ': 'ai', 'ো': 'o', 'ৌ': 'au',
|
||
'ং': 'ng', 'ঃ': 'h', 'ঁ': 'n',
|
||
}
|
||
|
||
|
||
def transliterate_bengali(text: str) -> str:
    """Transliterate Bengali to Latin (ISO 15919 simplified).

    Characters are looked up in ``BENGALI_MAP``.  Consonant entries carry the
    inherent vowel ('ka'); that trailing 'a' is removed again when the
    consonant is followed by a mapped dependent vowel sign or by the hasanta
    (virama), so consonant + i-sign gives 'ki' rather than 'kai' and
    consonant + hasanta gives 'k' rather than 'ka'.

    Args:
        text: Text in Bengali script.

    Returns:
        Romanized text.  Characters without a table entry are kept when they
        are a space, a letter or a digit and dropped otherwise.
    """
    nukta = '\u09bc'

    # NFC gives the nukta letters (ড়, ঢ়, য়) one canonical spelling (base
    # consonant + nukta), so table keys and input agree regardless of which
    # form either of them was typed in.
    lookup = {
        unicodedata.normalize('NFC', key): latin
        for key, latin in BENGALI_MAP.items()
    }
    source = unicodedata.normalize('NFC', text)

    pieces: List[str] = []
    # True while pieces[-1] is a consonant that still carries its inherent 'a'.
    inherent = False
    pos = 0
    while pos < len(source):
        # Prefer a two-code-point key (consonant + nukta) over a single one.
        pair = source[pos:pos + 2]
        key = pair if len(pair) == 2 and pair in lookup else source[pos]
        pos += len(key)

        latin = lookup.get(key)
        if latin is None:
            if key == ' ' or key.isalnum():
                pieces.append(key)
            inherent = False
            continue

        # Dependent vowel signs (U+09BE-U+09CC, U+09D7) and hasanta (U+09CD)
        # replace the inherent vowel of the consonant they follow.
        is_sign = len(key) == 1 and (
            '\u09be' <= key <= '\u09cd' or key == '\u09d7'
        )
        if is_sign and inherent:
            pieces[-1] = pieces[-1][:-1]

        if latin:
            pieces.append(latin)
            inherent = '\u0995' <= key[0] <= '\u09b9' and latin.endswith('a')
        elif key != nukta:
            # Hasanta consumed the inherent vowel; a stray nukta only
            # modifies the consonant before it and leaves the state alone.
            inherent = False

    return ''.join(pieces)
|
||
|
||
|
||
# =============================================================================
|
||
# SINHALA TRANSLITERATION (ISO 15919)
|
||
# =============================================================================
|
||
|
||
# Sinhala character map (ISO 15919 romanization)
|
||
SINHALA_MAP = {
    # Independent vowels
    'අ': 'a',
    'ආ': 'aa',
    'ඇ': 'ae',
    'ඈ': 'aae',
    'ඉ': 'i',
    'ඊ': 'ii',
    'උ': 'u',
    'ඌ': 'uu',
    'එ': 'e',
    'ඒ': 'ee',
    'ඓ': 'ai',
    'ඔ': 'o',
    'ඕ': 'oo',
    'ඖ': 'au',
    'ඍ': 'ri',
    'ඎ': 'rii',

    # Consonants (value carries the inherent 'a' vowel)
    'ක': 'ka',
    'ඛ': 'kha',
    'ග': 'ga',
    'ඝ': 'gha',
    'ඞ': 'nga',
    'ඟ': 'nnga',
    'ච': 'cha',
    'ඡ': 'chha',
    'ජ': 'ja',
    'ඣ': 'jha',
    'ඤ': 'nya',
    'ඥ': 'gnya',
    'ට': 'ta',
    'ඨ': 'tha',
    'ඩ': 'da',
    'ඪ': 'dha',
    'ණ': 'na',
    'ඬ': 'nda',
    'ත': 'tha',
    'ථ': 'thha',
    'ද': 'da',
    'ධ': 'dha',
    'න': 'na',
    'ඳ': 'nda',
    'ප': 'pa',
    'ඵ': 'pha',
    'බ': 'ba',
    'භ': 'bha',
    'ම': 'ma',
    'ඹ': 'mba',
    'ය': 'ya',
    'ර': 'ra',
    'ල': 'la',
    'ව': 'va',
    'ළ': 'la',
    'ශ': 'sha',
    'ෂ': 'sha',
    'ස': 'sa',
    'හ': 'ha',
    'ෆ': 'fa',  # Used for foreign words

    # Dependent vowel signs (matras)
    'ා': 'a',
    'ැ': 'ae',
    'ෑ': 'aae',
    'ි': 'i',
    'ී': 'ii',
    'ු': 'u',
    'ූ': 'uu',
    'ෙ': 'e',
    'ේ': 'ee',
    'ෛ': 'ai',
    'ො': 'o',
    'ෝ': 'oo',
    'ෞ': 'au',
    'ෘ': 'ri',
    'ෲ': 'rii',

    # Special marks
    '්': '',  # Virama (hal kirima); produces no output of its own
    'ං': 'ng',  # Anusvara
    'ඃ': 'h',  # Visarga
    '෴': '',  # Kunddaliya (punctuation)

    # Sinhala numerals (Arabic numerals are ASCII and need no entry)
    '෦': '0',
    '෧': '1',
    '෨': '2',
    '෩': '3',
    '෪': '4',
    '෫': '5',
    '෬': '6',
    '෭': '7',
    '෮': '8',
    '෯': '9',
}
|
||
|
||
# Common Sinhala heritage vocabulary
|
||
# Whole-word romanizations that take precedence over the character table.
# Insertion order matters only between keys of equal length, because
# transliterate_sinhala() applies the entries longest key first.
SINHALA_HERITAGE_VOCAB = {
    # University/Education
    'විශ්වවිද්යාලය': 'vishvavidyalaya',  # university
    # The next two look like the halves of the word above, presumably for
    # text where a zero-width joiner splits it -- TODO confirm.
    'විශ්වවිද්': 'vishvavid',
    'යාලය': 'yalaya',
    'පේරාදෙණිය': 'peradeniya',  # Peradeniya

    # National/Government
    'ජාතික': 'jathika',  # national
    'දෙපාර්තමේන්තුව': 'departmentuwa',  # department

    # Museums/Archives
    'කෞතුකාගාර': 'kauthukagara',  # museum (stem)
    'කෞතුකාගාරය': 'kauthukagaaraya',  # museum
    'ලේඛනාගාරය': 'lekhanagaaraya',  # archives
    'පුස්තකාලය': 'pusthakaalaya',  # library

    # Places
    'කොළඹ': 'colombo',  # Colombo
    'ශ්රී': 'sri',  # Sri
    'ලංකාව': 'lankava',  # Lanka
}
|
||
|
||
|
||
def transliterate_sinhala(text: str) -> str:
    """Transliterate Sinhala to Latin (simplified ISO 15919).

    Works in two passes:

    1. Known words from ``SINHALA_HERITAGE_VOCAB`` are replaced by their
       romanization (longest key first), padded with spaces.  Zero-width
       joiners/non-joiners are removed from both the input and the keys
       first, so a word matches whether or not it was typed with them.
    2. Remaining characters are looked up in ``SINHALA_MAP``.  Consonant
       entries carry the inherent vowel ('ka'); that trailing 'a' is removed
       again when the consonant is followed by a dependent vowel sign or by
       the virama, so consonant + i-sign gives 'ki' rather than 'kai'.

    Args:
        text: Text in Sinhala script

    Returns:
        Romanized text with runs of whitespace collapsed to single spaces.
        ASCII input is passed through; other characters without a table
        entry are kept if they are letters or digits and dropped otherwise.
    """
    def strip_joiners(value: str) -> str:
        # U+200D ZERO WIDTH JOINER / U+200C ZERO WIDTH NON-JOINER
        return value.replace('\u200d', '').replace('\u200c', '')

    # First pass: replace known vocabulary (longest match first)
    result = strip_joiners(text)
    vocabulary = sorted(
        ((strip_joiners(sinhala), latin)
         for sinhala, latin in SINHALA_HERITAGE_VOCAB.items()),
        key=lambda entry: -len(entry[0]),
    )
    for sinhala, latin in vocabulary:
        if sinhala:  # never call replace('') -- it would match everywhere
            result = result.replace(sinhala, f' {latin} ')

    # Second pass: transliterate remaining characters
    output: List[str] = []
    # True while output[-1] is a consonant that still carries its inherent 'a'.
    inherent = False
    for c in result:
        # Already Latin (includes vocabulary replacements and whitespace)
        if c.isascii():
            output.append(c)
            inherent = False
            continue

        latin = SINHALA_MAP.get(c)
        if latin is None:
            # Unknown character: keep letters and digits, drop the rest
            if c.isalnum():
                output.append(c)
            inherent = False
            continue

        # Virama (U+0DCA) and dependent vowel signs (U+0DCF-U+0DDF, U+0DF2,
        # U+0DF3) replace the inherent vowel of the consonant they follow.
        is_sign = (
            c == '\u0dca'
            or '\u0dcf' <= c <= '\u0ddf'
            or c in ('\u0df2', '\u0df3')
        )
        if is_sign and inherent:
            output[-1] = output[-1][:-1]

        if latin:
            output.append(latin)
            # Consonants occupy U+0D9A-U+0DC6.
            inherent = '\u0d9a' <= c <= '\u0dc6' and latin.endswith('a')
        elif is_sign:
            inherent = False

    # Clean up spacing
    return ' '.join(''.join(output).split())
|
||
|
||
|
||
# =============================================================================
|
||
# KHMER TRANSLITERATION (UNGEGN Romanization)
|
||
# =============================================================================
|
||
|
||
# Khmer consonants (with inherent 'a' or 'o' vowel depending on register)
|
||
# Khmer consonant -> bare consonant sound.  Unlike the Indic tables in this
# file, no inherent vowel is included in the values, and 'អ' maps to the
# empty string.
KHMER_CONSONANTS = {
    'ក': 'k',
    'ខ': 'kh',
    'គ': 'k',
    'ឃ': 'kh',
    'ង': 'ng',
    'ច': 'ch',
    'ឆ': 'chh',
    'ជ': 'ch',
    'ឈ': 'chh',
    'ញ': 'nh',
    'ដ': 'd',
    'ឋ': 'th',
    'ឌ': 'd',
    'ឍ': 'th',
    'ណ': 'n',
    'ត': 't',
    'ថ': 'th',
    'ទ': 't',
    'ធ': 'th',
    'ន': 'n',
    'ប': 'b',
    'ផ': 'ph',
    'ព': 'p',
    'ភ': 'ph',
    'ម': 'm',
    'យ': 'y',
    'រ': 'r',
    'ល': 'l',
    'វ': 'v',
    'ឝ': 'sh',
    'ឞ': 's',
    'ស': 's',
    'ហ': 'h',
    'ឡ': 'l',
    'អ': '',
}
|
||
|
||
# Khmer dependent vowels
|
||
# Khmer dependent vowel signs (U+17B6-U+17C5) plus the coeng.
# Signs missing from this table are silently dropped by transliterate_khmer()
# because combining marks are neither spaces nor alphanumeric.
KHMER_VOWELS = {
    'ា': 'a', 'ិ': 'i', 'ី': 'ii', 'ឹ': 'eu', 'ឺ': 'eu',
    'ុ': 'o', 'ូ': 'ou', 'ួ': 'ua', 'ើ': 'ae',
    'ែ': 'ae', 'ៃ': 'ai', 'ោ': 'ao', 'ៅ': 'au',
    # The three signs below complete the U+17B6-U+17C5 range.  'e' and 'ie'
    # follow the spellings already used in KHMER_HERITAGE_VOCAB
    # ('betekaphon', 'siem reap').
    '\u17c1': 'e',    # KHMER VOWEL SIGN E
    '\u17c0': 'ie',   # KHMER VOWEL SIGN IE
    # NOTE(review): 'eua' extends this table's own 'eu' spelling; confirm
    # against the UNGEGN romanization table.
    '\u17bf': 'eua',  # KHMER VOWEL SIGN YA
    '្': '',  # Subscript consonant marker (coeng)
}
|
||
|
||
# Khmer independent vowels
|
||
# Stand-alone vowel letters, used without a carrier consonant.
KHMER_INDEP_VOWELS = {
    'ឥ': 'i',
    'ឦ': 'ii',
    'ឧ': 'u',
    'ឨ': 'uk',
    'ឩ': 'uu',
    'ឪ': 'ou',
    'ឫ': 'ry',
    'ឬ': 'ryy',
    'ឭ': 'ly',
    'ឮ': 'lyy',
    'ឯ': 'ae',
    'ឰ': 'ai',
    'ឱ': 'ao',
    'ឲ': 'ao',
    'ឳ': 'au',
}
|
||
|
||
# Khmer special signs
|
||
# Khmer signs and punctuation.  Keys are written as escapes because most of
# them are combining marks that are hard to read in isolation.
KHMER_SPECIAL = {
    '\u17c6': 'm',  # ំ  Nikahit (anusvara)
    '\u17c7': 'h',  # ះ  Visarga
    '\u17cb': '',   # ់  Bantoc (shortens vowel)
    '\u17cc': 'r',  # ៌  Robat (repha)
    '\u17cd': '',   # ៍  Toandakhiat (silent letter)
    '\u17ce': '',   # ៎  Kakabat (emphasis)
    '\u17cf': '',   # ៏  Ahsda (obsolete)
    '\u17d1': '',   # ៑  Viriam (obsolete punctuation)
    '\u17d6': ':',  # ៖  Camnuc pii kuuh (colon)
    '\u17d4': '.',  # ។  Khan (period)
    '\u17d5': '.',  # ៕  Bariyoosan (end mark)
    '\u17da': '',   # ៚  Koomuut (section mark)
}
|
||
|
||
# Khmer numerals
|
||
# Khmer digits ០-៩ (U+17E0-U+17E9) -> ASCII digits.
KHMER_NUMERALS = {chr(0x17E0 + value): str(value) for value in range(10)}
|
||
|
||
# Common Khmer heritage vocabulary
|
||
# Whole-word romanizations that take precedence over the character tables.
# Insertion order matters only between keys of equal length, because
# transliterate_khmer() applies the entries longest key first.
KHMER_HERITAGE_VOCAB = {
    # Museums/Memorials
    'សារមន្ទីរ': 'saaramontir',  # museum
    'សារមន្ទីរទួលស្លែង': 'saaramontir tuol sleng',  # Tuol Sleng museum
    'ទួលស្លែង': 'tuol sleng',  # Tuol Sleng

    # Archives/Libraries
    'បណ្ណាល័យ': 'bannaalay',  # library
    'ឯកសារដ្ឋាន': 'aeksaarathan',  # archives
    'ជាតិ': 'cheate',  # national

    # Places
    'ភ្នំពេញ': 'phnom penh',  # Phnom Penh
    'អង្គរ': 'angkor',  # Angkor
    'សៀមរាប': 'siem reap',  # Siem Reap

    # Cultural terms
    'វប្បធម៌': 'vabpatham',  # culture
    'បេតិកភណ្ឌ': 'betekaphon',  # heritage
    'ប្រវត្តិសាស្ត្រ': 'pravattisaas',  # history
}
|
||
|
||
|
||
def transliterate_khmer(text: str) -> str:
    """Transliterate Khmer to Latin (UNGEGN system).

    Known words from ``KHMER_HERITAGE_VOCAB`` are substituted first, longest
    key first and padded with spaces.  Every remaining character is then
    looked up in the consonant, dependent-vowel, independent-vowel,
    special-sign and numeral tables, in that order.

    Args:
        text: Text in Khmer script

    Returns:
        Romanized text with runs of whitespace collapsed to single spaces.
        ASCII characters pass through unchanged; other characters found in
        no table are kept only if they are letters or digits.
    """
    def longest_first(entry):
        return -len(entry[0])

    working = text
    for khmer_word, romanized in sorted(KHMER_HERITAGE_VOCAB.items(), key=longest_first):
        working = working.replace(khmer_word, f' {romanized} ')

    lookup_order = (
        KHMER_CONSONANTS,
        KHMER_VOWELS,
        KHMER_INDEP_VOWELS,
        KHMER_SPECIAL,
        KHMER_NUMERALS,
    )

    pieces = []
    for char in working:
        # ASCII (vocabulary output, spaces, Latin text) needs no lookup.
        if char.isascii():
            pieces.append(char)
            continue

        for table in lookup_order:
            if char in table:
                pieces.append(table[char])
                break
        else:
            # In no table: keep letters/digits, drop marks and punctuation.
            # A plain space never reaches this point because it is ASCII.
            if char.isalnum():
                pieces.append(char)

    return ' '.join(''.join(pieces).split())
|
||
|
||
|
||
# =============================================================================
|
||
# MAIN TRANSLITERATION DISPATCHER
|
||
# =============================================================================
|
||
|
||
# Language to script mapping
|
||
# ISO 639-1 language code -> script name (a key of TRANSLITERATORS).
# Declared per script and flattened, so each script's languages sit together.
LANG_SCRIPT_MAP = {
    lang: script
    for script, langs in (
        ('cyrillic', ('ru', 'uk', 'bg', 'sr', 'kk')),
        ('chinese', ('zh',)),
        ('japanese', ('ja',)),
        ('korean', ('ko',)),
        ('arabic', ('ar', 'fa', 'ur')),
        ('hebrew', ('he',)),
        ('greek', ('el',)),
        ('devanagari', ('hi', 'ne')),
        ('bengali', ('bn',)),
        ('thai', ('th',)),
        ('armenian', ('hy',)),
        ('georgian', ('ka',)),
        ('sinhala', ('si',)),
        ('khmer', ('km',)),
    )
    for lang in langs
}
|
||
|
||
# Script name -> function that romanizes text written in that script.
TRANSLITERATORS = dict(
    cyrillic=transliterate_cyrillic,
    chinese=transliterate_chinese,
    japanese=transliterate_japanese,
    korean=transliterate_korean,
    arabic=transliterate_arabic,
    hebrew=transliterate_hebrew,
    greek=transliterate_greek,
    devanagari=transliterate_devanagari,
    bengali=transliterate_bengali,
    thai=transliterate_thai,
    armenian=transliterate_armenian,
    georgian=transliterate_georgian,
    sinhala=transliterate_sinhala,
    khmer=transliterate_khmer,
    latin=lambda t: t,  # No transliteration needed
)
|
||
|
||
|
||
def transliterate(text: str, lang: Optional[str] = None) -> str:
    """
    Romanize text written in a non-Latin script.

    The script is taken from ``LANG_SCRIPT_MAP`` when ``lang`` is a known
    code and detected from the text otherwise.  The matching function from
    ``TRANSLITERATORS`` is applied (text is left as-is when the script has
    no entry), and combining marks are then stripped from the result.

    Args:
        text: Input text in any script
        lang: Optional ISO 639-1 language code (e.g., 'ru', 'zh', 'ko')
              If not provided, script is auto-detected.

    Returns:
        Transliterated text with combining diacritics removed.  Empty or
        falsy input is returned unchanged.
    """
    if not text:
        return text

    # A known language code decides the script; otherwise inspect the text.
    script = LANG_SCRIPT_MAP.get(lang) if lang else None
    if script is None:
        script = detect_script(text)

    def passthrough(value):
        return value

    handler = TRANSLITERATORS.get(script, passthrough)

    # Only the Cyrillic transliterator takes the language, for its
    # dialect-specific handling.
    extra_args = (lang,) if script == 'cyrillic' and lang else ()
    romanized = handler(text, *extra_args)

    # Decompose accented letters, then drop the nonspacing marks (Mn).
    decomposed = unicodedata.normalize('NFD', romanized)
    return ''.join(
        char for char in decomposed if unicodedata.category(char) != 'Mn'
    )
|
||
|
||
|
||
def transliterate_for_abbreviation(emic_name: str, lang: Optional[str] = None) -> str:
    """
    Transliterate emic name for GHCID abbreviation generation.

    This is the main entry point for GHCID generation scripts.

    Args:
        emic_name: Institution name in original script
        lang: ISO 639-1 language code, or None to auto-detect the script
              (the CLI passes None when --lang is omitted)

    Returns:
        Transliterated name ready for abbreviation extraction: only ASCII
        letters, hyphens and apostrophes, separated by single spaces.
        An empty or missing name yields ''.
    """
    # Guard first: a None name would otherwise reach re.sub and raise.
    if not emic_name:
        return ''

    # Step 1: Transliterate to Latin
    latin = transliterate(emic_name, lang)

    # Step 2: Replace everything except ASCII letters, whitespace, hyphens
    # and apostrophes with a space (digits and punctuation become breaks)
    clean = re.sub(r"[^a-zA-Z\s\-']", ' ', latin)

    # Step 3: Normalize whitespace
    return ' '.join(clean.split())
|
||
|
||
|
||
# =============================================================================
|
||
# CLI INTERFACE
|
||
# =============================================================================
|
||
|
||
def main() -> None:
    """Command-line entry point.

    Modes, checked in this order:

    * ``--libs``: list the optional transliteration libraries and whether
      they could be imported.
    * ``--file PATH``: read ``custodian_name.emic_name`` and
      ``custodian_name.name_language`` from a YAML file and transliterate
      the name (or only report its script with ``--detect``).
    * ``--text TEXT``: same, for text given on the command line.

    With none of these options the help text is printed.
    """
    parser = argparse.ArgumentParser(
        description='Transliterate non-Latin script text to Latin characters',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  python transliterate_emic_names.py --text "Институт" --lang ru
  python transliterate_emic_names.py --text "东巴文化博物院" --lang zh
  python transliterate_emic_names.py --file data/custodian/example.yaml

Supported languages:
  ru (Russian), uk (Ukrainian), bg (Bulgarian), sr (Serbian), kk (Kazakh)
  zh (Chinese), ja (Japanese), ko (Korean)
  ar (Arabic), fa (Persian), ur (Urdu), he (Hebrew)
  el (Greek), hi (Hindi), ne (Nepali), bn (Bengali)
  th (Thai), hy (Armenian), ka (Georgian)
  si (Sinhala), km (Khmer)
        '''
    )

    parser.add_argument('--text', '-t', help='Text to transliterate')
    parser.add_argument('--lang', '-l', help='ISO 639-1 language code')
    parser.add_argument('--file', '-f', help='YAML file to process')
    parser.add_argument('--detect', '-d', action='store_true',
                        help='Only detect script, do not transliterate')
    parser.add_argument('--libs', action='store_true',
                        help='Show available transliteration libraries')

    args = parser.parse_args()

    if args.libs:
        print("Available transliteration libraries:")
        for lib, available in AVAILABLE_LIBS.items():
            status = "✓ installed" if available else "✗ not installed"
            print(f" {lib}: {status}")
        return

    if args.file:
        # Imported here so --text and --libs work without PyYAML.
        import yaml
        with open(args.file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty document loads as None, and ``custodian_name`` may be
        # null or a non-mapping; treat all of these as "no name found"
        # instead of crashing on .get().
        name_info = data.get('custodian_name') if isinstance(data, dict) else None
        if not isinstance(name_info, dict):
            name_info = {}
        emic_name = name_info.get('emic_name')
        lang = name_info.get('name_language')

        if not emic_name:
            print(f"Error: No emic_name found in {args.file}")
            return

        print(f"Emic name: {emic_name}")
        print(f"Language: {lang or '(auto-detect)'}")

        if args.detect:
            script = detect_script(emic_name)
            print(f"Detected script: {script}")
        else:
            result = transliterate_for_abbreviation(emic_name, lang)
            print(f"Transliterated: {result}")
        return

    if args.text:
        if args.detect:
            script = detect_script(args.text)
            print(f"Input: {args.text}")
            print(f"Detected script: {script}")
        else:
            result = transliterate_for_abbreviation(args.text, args.lang)
            print(f"Input: {args.text}")
            print(f"Language: {args.lang or '(auto-detect)'}")
            print(f"Output: {result}")
        return

    parser.print_help()
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|