glam/scripts/transliterate_emic_names.py
kempersc 891692a4d6 feat(ghcid): add diacritics normalization and transliteration scripts
- Add fix_ghcid_diacritics.py for normalizing non-ASCII in GHCIDs
- Add resolve_diacritics_collisions.py for collision handling
- Add transliterate_emic_names.py for non-Latin script handling
- Add transliteration tests
2025-12-08 14:59:28 +01:00

1267 lines
46 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Transliteration Utility for GHCID Abbreviation Generation
This script provides transliteration functions for converting non-Latin script
institution names to Latin characters using ISO and recognized standards.
Usage:
# As a module
from scripts.transliterate_emic_names import transliterate_for_abbreviation
latin = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru")
# Result: "Institut vostochnykh rukopisey RAN"
# As a CLI tool
python scripts/transliterate_emic_names.py --text "東巴文化博物院" --lang zh
python scripts/transliterate_emic_names.py --file data/custodian/example.yaml
Standards:
- Cyrillic (ru, uk, bg, sr, kk): ISO 9:1995
- Chinese (zh): Hanyu Pinyin (ISO 7098)
- Japanese (ja): Modified Hepburn
- Korean (ko): Revised Romanization
- Arabic (ar, fa, ur): ISO 233-2/3
- Hebrew (he): ISO 259-3
- Greek (el): ISO 843
- Devanagari (hi, ne, bn): ISO 15919
- Thai (th): ISO 11940-2
- Armenian (hy): ISO 9985
- Georgian (ka): ISO 9984
Author: GLAM Project
Created: 2025-12-08
"""
import argparse
import re
import unicodedata
from pathlib import Path
from typing import Optional, Dict, List, Tuple
# Try importing optional transliteration libraries
# Feature-detection table for optional third-party transliteration backends.
# Each per-script function below consults this before calling a library and
# otherwise falls back to the static character tables bundled in this module.
AVAILABLE_LIBS: Dict[str, bool] = {}
try:
    from pypinyin import pinyin, Style  # Chinese -> Hanyu Pinyin
    AVAILABLE_LIBS['pypinyin'] = True
except ImportError:
    AVAILABLE_LIBS['pypinyin'] = False
try:
    import pykakasi  # Japanese -> Romaji (Hepburn)
    AVAILABLE_LIBS['pykakasi'] = True
except ImportError:
    AVAILABLE_LIBS['pykakasi'] = False
try:
    from transliterate import translit  # Cyrillic and other scripts
    AVAILABLE_LIBS['transliterate'] = True
except ImportError:
    AVAILABLE_LIBS['transliterate'] = False
# =============================================================================
# SCRIPT DETECTION
# =============================================================================
def detect_script(text: str) -> str:
    """Detect the primary (dominant) script of *text*.

    'latin' is returned only when Latin letters are present and no
    character of any other supported script occurs. Any Hiragana or
    Katakana forces 'japanese' (Japanese mixes kana with Kanji, while
    pure CJK ideographs are reported as 'chinese').

    Args:
        text: Arbitrary input string.

    Returns:
        One of: 'latin', 'cyrillic', 'chinese', 'japanese', 'korean',
        'arabic', 'hebrew', 'greek', 'devanagari', 'bengali', 'thai',
        'armenian', 'georgian', 'sinhala', 'khmer', or 'unknown' when no
        recognizable letters are found.
    """
    # Unicode block ranges (inclusive) for each supported script.
    script_ranges = {
        'cyrillic': (0x0400, 0x04FF),
        'arabic': (0x0600, 0x06FF),
        'persian_ext': (0x0750, 0x077F),  # Arabic Supplement (Persian/Urdu extras)
        'hebrew': (0x0590, 0x05FF),
        'devanagari': (0x0900, 0x097F),
        'bengali': (0x0980, 0x09FF),
        'thai': (0x0E00, 0x0E7F),
        'greek': (0x0370, 0x03FF),
        'armenian': (0x0530, 0x058F),
        'georgian': (0x10A0, 0x10FF),
        'korean': (0xAC00, 0xD7AF),  # Hangul syllables
        'korean_jamo': (0x1100, 0x11FF),  # Hangul Jamo
        'japanese_hiragana': (0x3040, 0x309F),
        'japanese_katakana': (0x30A0, 0x30FF),
        'chinese': (0x4E00, 0x9FFF),  # CJK Unified Ideographs
        'chinese_ext': (0x3400, 0x4DBF),  # CJK Extension A
        'sinhala': (0x0D80, 0x0DFF),
        'khmer': (0x1780, 0x17FF),
    }
    script_counts: Dict[str, int] = {script: 0 for script in script_ranges}
    latin_count = 0
    for char in text:
        # ASCII letters count as Latin; accented Latin is not counted
        # but also matches no other range, so it stays neutral.
        if ('a' <= char <= 'z') or ('A' <= char <= 'Z'):
            latin_count += 1
            continue
        code = ord(char)
        for script, (start, end) in script_ranges.items():
            if start <= code <= end:
                script_counts[script] += 1
                break
    # Pure-Latin fast path: letters seen, no non-Latin hits anywhere.
    if latin_count > 0 and all(c == 0 for c in script_counts.values()):
        return 'latin'
    # Fold auxiliary ranges into their parent scripts.
    script_counts['arabic'] += script_counts.get('persian_ext', 0)
    script_counts['korean'] += script_counts.get('korean_jamo', 0)
    script_counts['chinese'] += script_counts.get('chinese_ext', 0)
    # CJK disambiguation: any kana at all implies Japanese text.
    if (script_counts.get('japanese_hiragana', 0) > 0
            or script_counts.get('japanese_katakana', 0) > 0):
        return 'japanese'
    # Otherwise pick the most frequent primary script (first in list wins ties).
    primary_scripts = ['cyrillic', 'arabic', 'hebrew', 'devanagari', 'bengali',
                       'thai', 'greek', 'armenian', 'georgian', 'korean',
                       'chinese', 'sinhala', 'khmer']
    max_script = max(primary_scripts, key=lambda s: script_counts.get(s, 0))
    if script_counts.get(max_script, 0) > 0:
        return max_script
    return 'latin' if latin_count > 0 else 'unknown'
# =============================================================================
# CYRILLIC TRANSLITERATION (ISO 9:1995)
# =============================================================================
# Cyrillic -> Latin table (practical ISO 9-style romanization) covering
# Russian plus Ukrainian, Bulgarian, Serbian and Kazakh extensions.
# Fallback used when the optional `transliterate` package is missing.
CYRILLIC_MAP = {
    # Russian
    'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E',
    'Ё': 'E', 'Ж': 'Zh', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K',
    'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R',
    'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'Kh', 'Ц': 'Ts',
    'Ч': 'Ch', 'Ш': 'Sh', 'Щ': 'Shch', 'Ъ': '', 'Ы': 'Y', 'Ь': '',
    'Э': 'E', 'Ю': 'Yu', 'Я': 'Ya',
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e',
    'ё': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k',
    'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r',
    'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
    'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '',
    'э': 'e', 'ю': 'yu', 'я': 'ya',
    # Ukrainian additions
    # NOTE(review): the ASCII apostrophe key below deletes apostrophes from
    # ANY text routed through this table, not just Ukrainian -- confirm intended.
    'І': 'I', 'і': 'i', 'Ї': 'Yi', 'ї': 'yi', 'Є': 'Ye', 'є': 'ye',
    'Ґ': 'G', 'ґ': 'g', "'": '',
    # Bulgarian additions
    # NOTE(review): 'Ъ'/'ъ' duplicate the Russian keys above; in a dict
    # literal the later value wins, so hard signs romanize as 'A'/'a' for
    # every language, not '' as the Russian section suggests.
    'Ъ': 'A', 'ъ': 'a',  # Bulgarian hard sign = schwa
    # Serbian additions
    'Ђ': 'Dj', 'ђ': 'dj', 'Ј': 'J', 'ј': 'j', 'Љ': 'Lj', 'љ': 'lj',
    'Њ': 'Nj', 'њ': 'nj', 'Ћ': 'C', 'ћ': 'c', 'Џ': 'Dz', 'џ': 'dz',
    # Kazakh additions (Cyrillic-based)
    'Ә': 'A', 'ә': 'a', 'Ғ': 'Gh', 'ғ': 'gh', 'Қ': 'Q', 'қ': 'q',
    'Ң': 'Ng', 'ң': 'ng', 'Ө': 'O', 'ө': 'o', 'Ұ': 'U', 'ұ': 'u',
    'Ү': 'U', 'ү': 'u', 'Һ': 'H', 'һ': 'h',
}
def transliterate_cyrillic(text: str, lang: str = 'ru') -> str:
    """Romanize Cyrillic *text* (ISO 9-style).

    Prefers the optional ``transliterate`` package when installed;
    on any library failure (e.g. unsupported language pack) it falls
    back to the static CYRILLIC_MAP table, leaving unmapped characters
    untouched.
    """
    if AVAILABLE_LIBS.get('transliterate'):
        try:
            return translit(text, lang, reversed=True)
        except Exception:
            pass  # fall through to the table-based path
    romanized = [CYRILLIC_MAP.get(ch, ch) for ch in text]
    return ''.join(romanized)
# =============================================================================
# CHINESE TRANSLITERATION (Hanyu Pinyin)
# =============================================================================
# Basic Pinyin dictionary for common museum/library/archive vocabulary
# This allows basic transliteration without pypinyin library
# Hanzi -> toneless Pinyin fallback vocabulary (heritage/GLAM domain terms),
# used only when pypinyin is not installed.
# NOTE(review): most keys below have been corrupted to empty strings
# (non-ASCII characters were stripped somewhere in the file's history).
# An empty-string key can never match a single character in the lookup
# loop of transliterate_chinese, so the fallback is effectively limited
# to the few surviving keys -- TODO restore the original Hanzi keys.
CHINESE_PINYIN_MAP = {
    # Numbers
    '': 'yi', '': 'er', '': 'san', '': 'si', '': 'wu',
    '': 'liu', '': 'qi', '': 'ba', '': 'jiu', '': 'shi',
    '': 'bai', '': 'qian', '': 'wan',
    # Heritage/Museum vocabulary
    '': 'bo', '': 'wu', '': 'guan', '': 'guan', '': 'yuan',
    '': 'wen', '': 'hua', '': 'yi', '': 'yi', '': 'shu', '': 'shu',
    '': 'li', '': 'li', '': 'shi', '': 'yi', '': 'yi', '': 'chan', '': 'chan',
    '': 'guo', '': 'guo', '': 'li', '': 'jia', '': 'min', '': 'zu',
    '': 'zhong', '': 'yang', '': 'sheng', '': 'shi', '': 'xian', '': 'xian',
    '': 'tu', '': 'tu', '': 'shu', '': 'shu', '': 'dang', '': 'dang', '': 'an',
    '': 'mei', '': 'gu', '': 'dian', '': 'cang', '': 'pin', '': 'zhan', '': 'lan', '': 'lan',
    '': 'ji', '': 'ji', '': 'nian', '': 'bei', '': 'ta', '': 'miao', '': 'miao',
    '': 'si', '': 'gong', '': 'gong', '殿': 'dian', '': 'cheng', '': 'bao',
    '': 'lou', '': 'lou', '': 'ge', '': 'ge', '': 'ting', '': 'yuan', '': 'yuan',
    '': 'yan', '': 'jiu', '': 'suo', '': 'zhong', '': 'xin',
    '': 'xue', '': 'xue', '': 'yuan', '': 'xiao', '': 'xi',
    '': 'hui', '': 'hui', '': 'she', '': 'tuan', '': 'tuan',
    '': 'dong', '': 'dong', '西': 'xi', '': 'nan', '': 'bei',
    '': 'jing', '': 'hai', '': 'shan', '': 'he', '': 'jiang',
    '': 'da', '': 'xiao', '': 'xin', '': 'lao', '': 'gu',
    '': 'zi', '': 'ran', '': 'ke', '': 'ji',
    '': 'ba', '': 'dong', '': 'na', '': 'na', '西': 'xi',
    '': 'gu', '': 'gong',
    '': 'ji', '': 'jin', '': 'yin', '': 'yin',
    '': 'jiao', '': 'yu', '': 'chuan', '': 'chuan', '': 'tong', '': 'tong',
    '': 'si', '': 'si', '': 'chou', '': 'chou', '': 'lu',
    '': 'tao', '': 'ci', '': 'yu', '': 'shi', '': 'tong', '': 'tong',
    '': 'hua', '': 'hua', '': 'diao', '': 'su',
    '': 'min', '': 'su', '': 'feng', '': 'feng', '': 'tu',
    '': 'ge', '': 'ming', '': 'zhan', '': 'zhan', '': 'zheng', '': 'zheng',
    '': 'jun', '': 'jun', '': 'shi', '': 'hang', '': 'kong', '': 'tian',
    '': 'zong', '': 'fo', '': 'dao', '': 'ru',
    '': 'xian', '': 'xian', '': 'qin', '': 'yue', '': 'yue',
    '': 'wu', '': 'ju', '': 'ju', '': 'xi', '': 'xi',
    '': 'cha', '': 'jiu', '': 'shi', '': 'can',
    '': 'yi', '': 'fu', '': 'fang', '': 'fang', '': 'zhi', '': 'zhi',
    '': 'jian', '': 'zhu', '': 'zhu', '': 'fang', '': 'wu',
    '': 'shui', '': 'dian', '': 'dian', '': 'huo', '': 'mu',
    '': 'nong', '': 'nong', '': 'ye', '': 'ye', '': 'gong', '': 'shang',
    '': 'yi', '': 'yi', '': 'yao', '': 'yao',
    '': 'ren', '': 'wu', '': 'sheng', '': 'huo',
    '': 'he', '': 'ping', '': 'you', '': 'yi', '': 'yi',
    '': 'gang', '': 'ao', '': 'tai', '': 'wan', '': 'wan',
    '': 'hua', '': 'hua', '': 'qiao', '': 'qiao',
    '': 'hai', '': 'wai', '': 'jiao', '': 'liu',
    '': 'bao', '': 'hu', '': 'hu', '': 'xiu', '': 'fu', '': 'fu',
    '': 'jian', '': 'jian', '': 'ding', '': 'ping', '': 'ping', '': 'gu',
}
def transliterate_chinese(text: str) -> str:
    """Romanize Chinese text to toneless Hanyu Pinyin.

    Uses pypinyin when available (space-joined syllables); otherwise
    falls back to the bundled CHINESE_PINYIN_MAP vocabulary. In the
    fallback, characters that are neither mapped, spaces, nor
    alphanumeric are dropped; if nothing at all could be produced,
    the sentinel '[REQUIRES_PYPINYIN]' is returned.
    """
    if AVAILABLE_LIBS.get('pypinyin'):
        syllables = pinyin(text, style=Style.NORMAL)
        return ' '.join(''.join(parts) for parts in syllables)
    # Library-free fallback: per-character vocabulary lookup.
    pieces = []
    for ch in text:
        mapped = CHINESE_PINYIN_MAP.get(ch)
        if mapped is not None:
            pieces.append(mapped)
        elif ch == ' ' or ch.isalnum():
            pieces.append(ch)
    if not pieces:
        return '[REQUIRES_PYPINYIN]'
    return ''.join(pieces)
# =============================================================================
# JAPANESE TRANSLITERATION (Modified Hepburn)
# =============================================================================
# Basic Kanji/Kana to Romaji map for common heritage vocabulary
# This allows basic transliteration without pykakasi library
# Kanji/Kana -> Romaji fallback table (heritage vocabulary + kana),
# used only when pykakasi is not installed.
# NOTE(review): most keys below have been corrupted to empty strings
# (non-ASCII characters stripped somewhere in the file's history); an
# empty-string key never matches a single character in the lookup loop
# of transliterate_japanese -- TODO restore the original Kanji/Kana keys.
JAPANESE_ROMAJI_MAP = {
    # Common heritage vocabulary Kanji
    '': 'haku', '': 'butsu', '': 'kan', '': 'in',
    '': 'bun', '': 'ka', '': 'gei', '': 'jutsu',
    '': 'reki', '': 'shi', '': 'i', '': 'san',
    '': 'koku', '': 'koku', '': 'ritsu', '': 'ka',
    '': 'min', '': 'zoku', '': 'chuu', '': 'ou',
    '': 'to', '': 'to', '': 'sho', '': 'tou', '': 'an',
    '': 'bi', '': 'ko', '': 'ten', '': 'zou', '': 'hin',
    '': 'ten', '': 'ran', '': 'ki', '': 'nen',
    '': 'ji', '': 'kyuu', '殿': 'den', '': 'jou', '': 'hou',
    '': 'rou', '': 'kaku', '': 'tei', '': 'en',
    '': 'ken', '': 'kyuu', '': 'sho', '': 'shin',
    '': 'gaku', '': 'gaku', '': 'kou', '': 'kei',
    '': 'kai', '': 'kai', '': 'sha', '': 'dan',
    '': 'tou', '西': 'sei', '': 'nan', '': 'hoku',
    '': 'kyou', '': 'to', '': 'kai', '': 'zan', '': 'ka', '': 'kawa',
    '': 'dai', '': 'shou', '': 'shin', '': 'rou',
    '': 'ji', '': 'nen', '': 'ka', '': 'gi',
    '': 'ko', '': 'kin', '': 'gin',
    '': 'kyou', '': 'iku', '': 'den', '': 'tou',
    '': 'tou', '': 'ji', '': 'gyoku', '': 'seki', '': 'dou',
    '': 'ga', '': 'ga', '': 'chou', '': 'so',
    '': 'zoku', '': 'fuu', '': 'do',
    '': 'kaku', '': 'mei', '': 'sen', '': 'sou',
    '': 'gun', '': 'ji', '': 'kou', '': 'kuu', '': 'ten',
    '': 'shuu', '': 'butsu', '': 'dou', '': 'ju',
    '': 'kin', '': 'gaku', '': 'bu', '': 'geki', '': 'gi',
    '': 'cha', '': 'shu', '': 'shoku', '': 'san',
    '': 'i', '': 'fuku', '': 'bou', '': 'shoku',
    '': 'ken', '': 'chiku', '': 'bou', '': 'oku',
    '': 'sui', '': 'den', '': 'ka', '': 'moku',
    '': 'nou', '': 'gyou', '': 'kou', '': 'shou',
    '': 'i', '': 'yaku', '': 'jin', '': 'sei', '': 'katsu',
    '': 'wa', '': 'hei', '': 'yuu', '': 'gi',
    '': 'kou', '': 'wan', '': 'ka', '': 'kyou',
    '': 'gai', '': 'kou', '': 'ryuu',
    '': 'ho', '': 'go', '': 'shuu', '': 'fuku',
    '': 'kan', '': 'tei', '': 'hyou', '': 'ko',
    '': 'ki', '': 'roku', '': 'shi', '': 'ryou',
    # Hiragana
    '': 'a', '': 'i', '': 'u', '': 'e', '': 'o',
    '': 'ka', '': 'ki', '': 'ku', '': 'ke', '': 'ko',
    '': 'sa', '': 'shi', '': 'su', '': 'se', '': 'so',
    '': 'ta', '': 'chi', '': 'tsu', '': 'te', '': 'to',
    '': 'na', '': 'ni', '': 'nu', '': 'ne', '': 'no',
    '': 'ha', '': 'hi', '': 'fu', '': 'he', '': 'ho',
    '': 'ma', '': 'mi', '': 'mu', '': 'me', '': 'mo',
    '': 'ya', '': 'yu', '': 'yo',
    '': 'ra', '': 'ri', '': 'ru', '': 're', '': 'ro',
    '': 'wa', '': 'wo', '': 'n',
    # Katakana
    '': 'a', '': 'i', '': 'u', '': 'e', '': 'o',
    '': 'ka', '': 'ki', '': 'ku', '': 'ke', '': 'ko',
    '': 'sa', '': 'shi', '': 'su', '': 'se', '': 'so',
    '': 'ta', '': 'chi', '': 'tsu', '': 'te', '': 'to',
    '': 'na', '': 'ni', '': 'nu', '': 'ne', '': 'no',
    '': 'ha', '': 'hi', '': 'fu', '': 'he', '': 'ho',
    '': 'ma', '': 'mi', '': 'mu', '': 'me', '': 'mo',
    '': 'ya', '': 'yu', '': 'yo',
    '': 'ra', '': 'ri', '': 'ru', '': 're', '': 'ro',
    '': 'wa', '': 'wo', '': 'n',
}
def transliterate_japanese(text: str) -> str:
    """Romanize Japanese text (Modified Hepburn).

    Uses pykakasi when available (space-joined segments); otherwise
    falls back to the bundled JAPANESE_ROMAJI_MAP table. In the
    fallback, unmapped non-alphanumeric characters are dropped; if
    nothing could be produced, '[REQUIRES_PYKAKASI]' is returned.
    """
    if AVAILABLE_LIBS.get('pykakasi'):
        converter = pykakasi.kakasi()
        segments = converter.convert(text)
        return ' '.join(seg['hepburn'] for seg in segments)
    # Library-free fallback: per-character table lookup.
    pieces = []
    for ch in text:
        if ch in JAPANESE_ROMAJI_MAP:
            pieces.append(JAPANESE_ROMAJI_MAP[ch])
        elif ch == ' ' or ch.isalnum():
            pieces.append(ch)
    if not pieces:
        return '[REQUIRES_PYKAKASI]'
    return ''.join(pieces)
# =============================================================================
# KOREAN TRANSLITERATION (Revised Romanization)
# =============================================================================
# Basic Hangul syllable decomposition tables
# Revised-Romanization letter tables indexed by Hangul decomposition.
# A precomposed syllable U+AC00..U+D7A3 encodes (initial, medial, final)
# as: code = 0xAC00 + (initial * 21 + medial) * 28 + final.
HANGUL_INITIALS = [
    'g', 'kk', 'n', 'd', 'tt', 'r', 'm', 'b', 'pp', 's', 'ss', '',
    'j', 'jj', 'ch', 'k', 't', 'p', 'h'
]
HANGUL_MEDIALS = [
    'a', 'ae', 'ya', 'yae', 'eo', 'e', 'yeo', 'ye', 'o', 'wa', 'wae',
    'oe', 'yo', 'u', 'wo', 'we', 'wi', 'yu', 'eu', 'ui', 'i'
]
# Final consonants (index 0 = no final); simplified coda values.
HANGUL_FINALS = [
    '', 'k', 'k', 'k', 'n', 'n', 'n', 't', 'l', 'l', 'l', 'l', 'l',
    'l', 'l', 'l', 'm', 'p', 'p', 's', 's', 'ng', 't', 't', 'k', 't', 'p', 't'
]
def transliterate_korean(text: str) -> str:
    """Romanize Korean Hangul using Revised Romanization.

    Prefers the optional ``korean_romanizer`` package; otherwise
    arithmetically decomposes each precomposed syllable and joins the
    letter tables above. Non-Hangul characters pass through unchanged.
    """
    try:
        from korean_romanizer.romanizer import Romanizer
        return Romanizer(text).romanize()
    except ImportError:
        pass
    pieces = []
    for ch in text:
        cp = ord(ch)
        if not (0xAC00 <= cp <= 0xD7AF):
            pieces.append(ch)
            continue
        offset = cp - 0xAC00
        initial, rem = divmod(offset, 21 * 28)
        medial, final = divmod(rem, 28)
        romanized = HANGUL_INITIALS[initial] + HANGUL_MEDIALS[medial]
        if final:
            romanized += HANGUL_FINALS[final]
        pieces.append(romanized)
    return ''.join(pieces)
# =============================================================================
# ARABIC TRANSLITERATION (ISO 233-2)
# =============================================================================
# Arabic-script -> Latin letter table (simplified ISO 233-2 style),
# including Persian and Urdu extensions and short-vowel diacritics.
ARABIC_MAP = {
    'ا': 'a', 'أ': 'a', 'إ': 'i', 'آ': 'a', 'ء': "'",
    'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j',
    'ح': 'h', 'خ': 'kh', 'د': 'd', 'ذ': 'dh',
    'ر': 'r', 'ز': 'z', 'س': 's', 'ش': 'sh',
    'ص': 's', 'ض': 'd', 'ط': 't', 'ظ': 'z',
    'ع': "'", 'غ': 'gh', 'ف': 'f', 'ق': 'q',
    'ك': 'k', 'ل': 'l', 'م': 'm', 'ن': 'n',
    'ه': 'h', 'و': 'w', 'ي': 'y', 'ى': 'a',
    'ة': 'a',
    # Persian additions
    'پ': 'p', 'چ': 'ch', 'ژ': 'zh', 'گ': 'g',
    'ک': 'k', 'ی': 'i',
    # Urdu additions
    'ٹ': 't', 'ڈ': 'd', 'ڑ': 'r', 'ں': 'n',
    # Diacritics (vowel marks)
    'َ': 'a', 'ِ': 'i', 'ُ': 'u',
    'ً': 'an', 'ٍ': 'in', 'ٌ': 'un',
    'ّ': '',  # Shadda (gemination) - simplified
}
def transliterate_arabic(text: str) -> str:
    """Romanize Arabic-script text (simplified ISO 233).

    Mapped letters are converted; spaces and alphanumerics pass
    through; the Persian zero-width non-joiner becomes '-'; all other
    characters (punctuation, unmapped marks) are dropped.
    """
    out = []
    for ch in text:
        mapped = ARABIC_MAP.get(ch)
        if mapped is not None:
            out.append(mapped)
        elif ch == ' ' or ch.isalnum():
            out.append(ch)
        elif ch == '\u200c':  # ZWNJ, word-internal separator in Persian
            out.append('-')
    return ''.join(out)
# =============================================================================
# HEBREW TRANSLITERATION (ISO 259-3)
# =============================================================================
# Hebrew -> Latin letter table (simplified ISO 259-3 style). Final
# letterforms map identically to their regular forms. The last three
# entries are two-codepoint sequences (base letter + dagesh U+05BC).
HEBREW_MAP = {
    'א': '', 'ב': 'v', 'ג': 'g', 'ד': 'd', 'ה': 'h',
    'ו': 'v', 'ז': 'z', 'ח': 'ch', 'ט': 't', 'י': 'y',
    'כ': 'k', 'ך': 'k', 'ל': 'l', 'מ': 'm', 'ם': 'm',
    'נ': 'n', 'ן': 'n', 'ס': 's', 'ע': '', 'פ': 'f',
    'ף': 'f', 'צ': 'ts', 'ץ': 'ts', 'ק': 'k', 'ר': 'r',
    'ש': 'sh', 'ת': 't',
    # With dagesh
    'בּ': 'b', 'כּ': 'k', 'פּ': 'p',
}
def transliterate_hebrew(text: str) -> str:
    """Romanize Hebrew text (simplified ISO 259-3).

    Tries a two-character lookup first so the letter+dagesh entries
    ('בּ', 'כּ', 'פּ') match; a plain per-character loop can never see
    those two-codepoint keys, which made them dead entries before.
    Mapped characters are converted; spaces and alphanumerics pass
    through; everything else is dropped.
    """
    result = []
    i = 0
    while i < len(text):
        pair = text[i:i + 2]
        # Letter + dagesh forms a single two-codepoint key.
        if len(pair) == 2 and pair in HEBREW_MAP:
            result.append(HEBREW_MAP[pair])
            i += 2
            continue
        c = text[i]
        if c in HEBREW_MAP:
            result.append(HEBREW_MAP[c])
        elif c == ' ' or c.isalnum():
            result.append(c)
        i += 1
    return ''.join(result)
# =============================================================================
# GREEK TRANSLITERATION (ISO 843)
# =============================================================================
# Greek -> Latin letter table (ISO 843 style), including accented forms.
GREEK_MAP = {
    'Α': 'A', 'α': 'a', 'Β': 'V', 'β': 'v', 'Γ': 'G', 'γ': 'g',
    'Δ': 'D', 'δ': 'd', 'Ε': 'E', 'ε': 'e', 'Ζ': 'Z', 'ζ': 'z',
    'Η': 'I', 'η': 'i', 'Θ': 'Th', 'θ': 'th', 'Ι': 'I', 'ι': 'i',
    'Κ': 'K', 'κ': 'k', 'Λ': 'L', 'λ': 'l', 'Μ': 'M', 'μ': 'm',
    'Ν': 'N', 'ν': 'n', 'Ξ': 'X', 'ξ': 'x', 'Ο': 'O', 'ο': 'o',
    'Π': 'P', 'π': 'p', 'Ρ': 'R', 'ρ': 'r', 'Σ': 'S', 'σ': 's',
    'ς': 's', 'Τ': 'T', 'τ': 't', 'Υ': 'Y', 'υ': 'y', 'Φ': 'F',
    'φ': 'f', 'Χ': 'Ch', 'χ': 'ch', 'Ψ': 'Ps', 'ψ': 'ps',
    'Ω': 'O', 'ω': 'o',
    # With accents
    'Ά': 'A', 'ά': 'a', 'Έ': 'E', 'έ': 'e', 'Ή': 'I', 'ή': 'i',
    'Ί': 'I', 'ί': 'i', 'Ό': 'O', 'ό': 'o', 'Ύ': 'Y', 'ύ': 'y',
    'Ώ': 'O', 'ώ': 'o', 'ϊ': 'i', 'ϋ': 'y', 'ΐ': 'i', 'ΰ': 'y',
}
def transliterate_greek(text: str) -> str:
    """Romanize Greek text (ISO 843); unmapped characters pass through."""
    romanized = [GREEK_MAP.get(ch, ch) for ch in text]
    return ''.join(romanized)
# =============================================================================
# DEVANAGARI TRANSLITERATION (ISO 15919)
# =============================================================================
# Devanagari -> Latin fallback table (simplified ISO 15919), used when
# the indic_transliteration package is not installed.
# NOTE(review): most keys below were corrupted to empty strings (non-ASCII
# stripped in the file's history) and can never match a single character;
# additionally, the surviving nukta forms ('ड़', 'क़', ...) are two-codepoint
# sequences that a per-character loop cannot match -- TODO restore keys
# and/or NFC-normalize input before lookup.
DEVANAGARI_MAP = {
    # Vowels
    '': 'a', '': 'aa', '': 'i', '': 'ii', '': 'u', '': 'uu',
    '': 'ri', '': 'e', '': 'ai', '': 'o', '': 'au',
    # Vowel marks
    '': 'a', 'ि': 'i', '': 'i', '': 'u', '': 'u',
    '': 'e', '': 'ai', '': 'o', '': 'au', '': 'm', '': 'h',
    # Consonants
    '': 'ka', '': 'kha', '': 'ga', '': 'gha', '': 'nga',
    '': 'cha', '': 'chha', '': 'ja', '': 'jha', '': 'nya',
    '': 'ta', '': 'tha', '': 'da', '': 'dha', '': 'na',
    '': 'ta', '': 'tha', '': 'da', '': 'dha', '': 'na',
    '': 'pa', '': 'pha', '': 'ba', '': 'bha', '': 'ma',
    '': 'ya', '': 'ra', '': 'la', '': 'va',
    '': 'sha', '': 'sha', '': 'sa', '': 'ha',
    '': '',  # Virama (removes inherent 'a')
    # Hindi-specific
    'ड़': 'da', 'ढ़': 'dha', 'क़': 'qa', 'ख़': 'kha', 'ग़': 'gha',
    'ज़': 'za', 'फ़': 'fa',
}
def transliterate_devanagari(text: str) -> str:
    """Romanize Devanagari text (simplified ISO 15919 / IAST).

    Uses indic_transliteration's DEVANAGARI -> IAST scheme when that
    package is installed; otherwise falls back to the static
    DEVANAGARI_MAP table, passing spaces and alphanumerics through and
    dropping anything unmapped.
    """
    try:
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate as indic_translit
        return indic_translit(text, sanscript.DEVANAGARI, sanscript.IAST)
    except ImportError:
        pass
    # Library-free fallback: per-character table lookup.
    pieces = []
    for ch in text:
        mapped = DEVANAGARI_MAP.get(ch)
        if mapped is not None:
            pieces.append(mapped)
        elif ch == ' ' or ch.isalnum():
            pieces.append(ch)
    return ''.join(pieces)
# =============================================================================
# THAI TRANSLITERATION (ISO 11940-2 / Royal Thai General System)
# =============================================================================
# Thai consonants with RTGS romanization
# Note: Thai consonants have inherent vowel 'o' or 'a' depending on syllable structure
# Thai character tables for the RTGS fallback in transliterate_thai.
# NOTE(review): most single-character keys in the consonant, vowel,
# tone-mark, special and numeral tables below were corrupted to empty
# strings (non-ASCII stripped in the file's history); empty keys never
# match a character, so the fallback currently relies almost entirely on
# THAI_HERITAGE_VOCAB, whose multi-character strings survived intact.
# TODO restore the original Thai keys.
THAI_CONSONANTS = {
    # Initial consonants (high, mid, low class)
    '': 'k', '': 'kh', '': 'kh', '': 'kh', '': 'kh', '': 'kh',
    '': 'ng',
    '': 'ch', '': 'ch', '': 'ch', '': 's', '': 'ch',
    '': 'y',  # Initial: y, Final: n
    '': 'd', '': 't', '': 'th', '': 'th', '': 'th',
    '': 'n',
    '': 'd', '': 't', '': 'th', '': 'th', '': 'th',
    '': 'n',
    '': 'b', '': 'p', '': 'ph', '': 'f', '': 'ph', '': 'f', '': 'ph',
    '': 'm',
    '': 'y', '': 'r', '': 'l', '': 'w',
    '': 's', '': 's', '': 's', '': 'h', '': 'l', '': '',  # อ is silent initial
    '': 'h',
}
# Thai vowels (can appear before, after, above, or below consonants)
THAI_VOWELS = {
    # Following vowels
    '': 'a', '': 'a', '': 'am',
    '': 'i', '': 'i',
    '': 'ue', '': 'ue',
    '': 'u', '': 'u',
    '': 'e',  # Leading vowel
    '': 'ae',  # Leading vowel
    '': 'o',  # Leading vowel
    '': 'ai',  # Leading vowel
    '': 'ai',  # Leading vowel
    '': 'a',  # Lakkhangyao (rare)
    # Vowel combinations are handled by position
}
# Thai tone marks (don't affect RTGS romanization - just skip)
THAI_TONE_MARKS = {'', '', '', ''}
# Thai special characters
THAI_SPECIAL = {
    '': '',  # Maitaikhu (shortens vowel)
    '': '',  # Thanthakhat (silent letter marker)
    '': '',  # Maiyamok (repetition)
    '฿': 'B',  # Baht symbol
    '': '',  # Paiyannoi (abbreviation)
    '': '',  # Fongman (obsolete)
    '': 'm',  # Nikhahit (nasalization, often 'm')
    '': '',  # Phinthu (Sanskrit virama)
    '': '',  # Mai yamok (repeat previous)
}
# Thai numerals
THAI_NUMERALS = {
    '': '0', '': '1', '': '2', '': '3', '': '4',
    '': '5', '': '6', '': '7', '': '8', '': '9',
}
# Common Thai heritage vocabulary - direct mappings for accuracy
# These handle complex syllable combinations correctly
THAI_HERITAGE_VOCAB = {
    # Common institutional terms
    'สำนัก': 'samnak',
    'หอจดหมายเหตุ': 'ho chotmaihet',
    'หอสมุด': 'ho samut',
    'แห่งชาติ': 'haeng chat',
    'พิพิธภัณฑ': 'phiphitthaphan',
    'พิพิธภัณฑสถาน': 'phiphitthaphanthasathan',
    'พระนคร': 'phra nakhon',
    'สยาม': 'sayam',
    'สมาคม': 'samakhom',
    'ใน': 'nai',
    'พระบรมราชูปถัมภ์': 'phra borommarachuppatham',
    'พระที่นั่ง': 'phra thi nang',
    'ศิวโมกข': 'siwamok',
    'พิมาน': 'phiman',
    'วัด': 'wat',
    'โพธิ์': 'pho',
    'ราม': 'ram',
    # Geographic terms
    'กรุงเทพ': 'krung thep',
    'กรุงเทพมหานคร': 'krung thep maha nakhon',
    'เชียงใหม่': 'chiang mai',
    'ภูเก็ต': 'phuket',
    # Institution types
    'มหาวิทยาลัย': 'mahawitthayalai',
    'ศูนย์': 'sun',
    'สถาบัน': 'sathaban',
    'องค์กร': 'ongkon',
    'กรม': 'krom',
    'กระทรวง': 'krasuang',
    # Cultural terms
    'วัฒนธรรม': 'watthanatham',
    'ศิลปะ': 'sinlapa',
    'ประวัติศาสตร์': 'prawattisat',
    'โบราณ': 'boran',
    'มรดก': 'moradok',
}
def transliterate_thai(text: str) -> str:
    """Romanize Thai text (Royal Thai General System).

    Prefers pythainlp's 'royin' engine when installed. The fallback
    first substitutes known vocabulary (longest phrase first so
    compounds are not broken by their substrings), then maps remaining
    characters table by table; tone marks are skipped because RTGS
    does not record tone, and unknown characters are kept verbatim.
    Whitespace is collapsed at the end.
    """
    try:
        from pythainlp.transliterate import romanize
        return romanize(text, engine='royin')  # Royal Institute standard
    except ImportError:
        pass
    # Pass 1: whole-phrase vocabulary substitution, longest match first.
    working = text
    for thai_term, latin_term in sorted(THAI_HERITAGE_VOCAB.items(),
                                        key=lambda item: -len(item[0])):
        working = working.replace(thai_term, f' {latin_term} ')
    # Pass 2: per-character table lookups for whatever Thai remains.
    pieces = []
    for ch in working:
        if ch.isascii():
            pieces.append(ch)
        elif ch in THAI_CONSONANTS:
            pieces.append(THAI_CONSONANTS[ch])
        elif ch in THAI_VOWELS:
            pieces.append(THAI_VOWELS[ch])
        elif ch in THAI_TONE_MARKS:
            continue  # RTGS omits tone
        elif ch in THAI_SPECIAL:
            pieces.append(THAI_SPECIAL[ch])
        elif ch in THAI_NUMERALS:
            pieces.append(THAI_NUMERALS[ch])
        else:
            pieces.append(ch)  # unknown: keep as-is
    return ' '.join(''.join(pieces).split())
# =============================================================================
# ARMENIAN TRANSLITERATION (ISO 9985)
# =============================================================================
# Armenian -> Latin letter table (simplified ISO 9985 style), in
# alphabet order. The Ռ/ռ/Ս/Վ/Ր entries below were corrupted in the
# file's history (keys had become ' Delays'/'' mojibake); they are
# restored here from the alphabet order and the surviving lowercase
# pairs ('ս': 's', 'վ': 'v', 'ր': 'r').
ARMENIAN_MAP = {
    'Ա': 'A', 'ա': 'a', 'Բ': 'B', 'բ': 'b', 'Գ': 'G', 'գ': 'g',
    'Դ': 'D', 'դ': 'd', 'Ե': 'E', 'ե': 'e', 'Զ': 'Z', 'զ': 'z',
    'Է': 'E', 'է': 'e', 'Ը': 'Y', 'ը': 'y', 'Թ': 'T', 'թ': 't',
    'Ժ': 'Zh', 'ժ': 'zh', 'Ի': 'I', 'ի': 'i', 'Լ': 'L', 'լ': 'l',
    'Խ': 'Kh', 'խ': 'kh', 'Ծ': 'Ts', 'ծ': 'ts', 'Կ': 'K', 'կ': 'k',
    'Հ': 'H', 'հ': 'h', 'Ձ': 'Dz', 'ձ': 'dz', 'Ղ': 'Gh', 'ղ': 'gh',
    'Ճ': 'Ch', 'ճ': 'ch', 'Մ': 'M', 'մ': 'm', 'Յ': 'Y', 'յ': 'y',
    'Ն': 'N', 'ն': 'n', 'Շ': 'Sh', 'շ': 'sh', 'Ո': 'O', 'ո': 'o',
    'Չ': 'Ch', 'չ': 'ch', 'Պ': 'P', 'պ': 'p', 'Ջ': 'J', 'ջ': 'j',
    'Ռ': 'R', 'ռ': 'r', 'Ս': 'S', 'ս': 's', 'Վ': 'V', 'վ': 'v',
    'Տ': 'T', 'տ': 't', 'Ր': 'R', 'ր': 'r', 'Ց': 'Ts', 'ց': 'ts',
    'Ւ': 'W', 'ւ': 'w', 'Փ': 'P', 'փ': 'p', 'Ք': 'K', 'ք': 'k',
    'Օ': 'O', 'օ': 'o', 'Ֆ': 'F', 'ֆ': 'f',
}
def transliterate_armenian(text: str) -> str:
    """Romanize Armenian text (simplified ISO 9985).

    Mapped letters are converted; any character not in ARMENIAN_MAP
    (e.g. the 'եւ' ligature, punctuation) passes through unchanged.
    """
    return ''.join(ARMENIAN_MAP.get(c, c) for c in text)
# =============================================================================
# GEORGIAN TRANSLITERATION (ISO 9984)
# =============================================================================
# Georgian (Mkhedruli) -> Latin letter table (simplified ISO 9984 style).
# The original keys were corrupted to empty strings in the file's
# history; they are restored here because the romanization values were
# preserved in exact Mkhedruli alphabet order (a b g d e v z t i k l m
# n o p zh r s t u p k gh q sh ch ts dz ts ch kh j h, 33 letters).
GEORGIAN_MAP = {
    'ა': 'a', 'ბ': 'b', 'გ': 'g', 'დ': 'd', 'ე': 'e', 'ვ': 'v',
    'ზ': 'z', 'თ': 't', 'ი': 'i', 'კ': 'k', 'ლ': 'l', 'მ': 'm',
    'ნ': 'n', 'ო': 'o', 'პ': 'p', 'ჟ': 'zh', 'რ': 'r', 'ს': 's',
    'ტ': 't', 'უ': 'u', 'ფ': 'p', 'ქ': 'k', 'ღ': 'gh', 'ყ': 'q',
    'შ': 'sh', 'ჩ': 'ch', 'ც': 'ts', 'ძ': 'dz', 'წ': 'ts', 'ჭ': 'ch',
    'ხ': 'kh', 'ჯ': 'j', 'ჰ': 'h',
}
def transliterate_georgian(text: str) -> str:
    """Romanize Georgian text (simplified ISO 9984).

    Mapped letters are converted; unmapped characters pass through.
    """
    return ''.join(GEORGIAN_MAP.get(c, c) for c in text)
# =============================================================================
# BENGALI TRANSLITERATION (ISO 15919)
# =============================================================================
# Bengali -> Latin table (simplified ISO 15919).
# NOTE(review): most keys below were corrupted to empty strings (non-ASCII
# stripped in the file's history); only the two-codepoint nukta forms
# ('ড়', 'ঢ়', 'য়') survived, and those can never match in the
# per-character loop of transliterate_bengali -- TODO restore keys.
BENGALI_MAP = {
    # Vowels
    '': 'a', '': 'aa', '': 'i', '': 'ii', '': 'u', '': 'uu',
    '': 'e', '': 'ai', '': 'o', '': 'au',
    # Consonants
    '': 'ka', '': 'kha', '': 'ga', '': 'gha', '': 'nga',
    '': 'cha', '': 'chha', '': 'ja', '': 'jha', '': 'nya',
    '': 'ta', '': 'tha', '': 'da', '': 'dha', '': 'na',
    '': 'ta', '': 'tha', '': 'da', '': 'dha', '': 'na',
    '': 'pa', '': 'pha', '': 'ba', '': 'bha', '': 'ma',
    '': 'ya', '': 'ra', '': 'la', '': 'sha', '': 'sha',
    '': 'sa', '': 'ha', 'ড়': 'ra', 'ঢ়': 'rha', 'য়': 'ya',
    '': '',  # Nukta
    '': '',  # Virama
    # Vowel marks
    '': 'a', 'ি': 'i', '': 'i', '': 'u', '': 'u',
    '': 'e', '': 'ai', '': 'o', '': 'au',
    '': 'ng', '': 'h', '': 'n',
}
def transliterate_bengali(text: str) -> str:
    """Romanize Bengali text (simplified ISO 15919).

    Characters found in BENGALI_MAP are replaced; spaces and
    alphanumerics pass through; everything else is dropped.
    """
    pieces = []
    for ch in text:
        mapped = BENGALI_MAP.get(ch)
        if mapped is not None:
            pieces.append(mapped)
        elif ch == ' ' or ch.isalnum():
            pieces.append(ch)
    return ''.join(pieces)
# =============================================================================
# SINHALA TRANSLITERATION (ISO 15919)
# =============================================================================
# Sinhala character map (ISO 15919 romanization)
# Sinhala -> Latin table (ISO 15919 style).
# NOTE(review): the single-character keys below were corrupted to empty
# strings (non-ASCII stripped in the file's history) and can never match
# a character; the multi-character SINHALA_HERITAGE_VOCAB entries further
# down survived intact and carry the fallback -- TODO restore keys.
SINHALA_MAP = {
    # Independent vowels
    '': 'a', '': 'aa', '': 'ae', '': 'aae',
    '': 'i', '': 'ii', '': 'u', '': 'uu',
    '': 'e', '': 'ee', '': 'ai',
    '': 'o', '': 'oo', '': 'au',
    '': 'ri', '': 'rii',
    # Consonants (with inherent 'a' vowel)
    '': 'ka', '': 'kha', '': 'ga', '': 'gha', '': 'nga', '': 'nnga',
    '': 'cha', '': 'chha', '': 'ja', '': 'jha', '': 'nya', '': 'gnya',
    '': 'ta', '': 'tha', '': 'da', '': 'dha', '': 'na', '': 'nda',
    '': 'tha', '': 'thha', '': 'da', '': 'dha', '': 'na', '': 'nda',
    '': 'pa', '': 'pha', '': 'ba', '': 'bha', '': 'ma', '': 'mba',
    '': 'ya', '': 'ra', '': 'la', '': 'va', '': 'la',
    '': 'sha', '': 'sha', '': 'sa', '': 'ha',
    '': 'fa',  # Used for foreign words
    # Dependent vowel signs (matras)
    '': 'a', '': 'ae', '': 'aae',
    '': 'i', '': 'ii', '': 'u', '': 'uu',
    '': 'e', '': 'ee', '': 'ai',
    '': 'o', '': 'oo', '': 'au',
    '': 'ri', '': 'rii',
    # Special marks
    '': '',  # Virama (hal kirima) - removes inherent vowel
    '': 'ng',  # Anusvara
    '': 'h',  # Visarga
    '': '',  # Kunddaliya (punctuation)
    # Numerals (Sinhala uses both Sinhala and Arabic numerals)
    '': '0', '': '1', '': '2', '': '3', '': '4',
    '': '5', '': '6', '': '7', '': '8', '': '9',
}
# Common Sinhala heritage vocabulary
SINHALA_HERITAGE_VOCAB = {
    # University/Education
    'විශ්වවිද්‍යාලය': 'vishvavidyalaya',
    'විශ්වවිද්': 'vishvavid',
    'යාලය': 'yalaya',
    'පේරාදෙණිය': 'peradeniya',
    # National/Government
    'ජාතික': 'jathika',
    'දෙපාර්තමේන්තුව': 'departmentuwa',
    # Museums/Archives
    'කෞතුකාගාර': 'kauthukagara',
    'කෞතුකාගාරය': 'kauthukagaaraya',
    'ලේඛනාගාරය': 'lekhanagaaraya',
    'පුස්තකාලය': 'pusthakaalaya',
    # Places
    'කොළඹ': 'colombo',
    'ශ්‍රී': 'sri',
    'ලංකාව': 'lankava',
}
def transliterate_sinhala(text: str) -> str:
    """Romanize Sinhala text (simplified ISO 15919).

    Known vocabulary is substituted first (longest match wins, so
    compound names are not split by their substrings), then the
    remaining characters go through SINHALA_MAP; unmapped
    non-alphanumerics are dropped and whitespace is collapsed.

    Args:
        text: Text in Sinhala script.

    Returns:
        Romanized text.
    """
    # Pass 1: whole-word vocabulary substitution, longest match first.
    working = text
    for sinhala_term, latin_term in sorted(SINHALA_HERITAGE_VOCAB.items(),
                                           key=lambda item: -len(item[0])):
        working = working.replace(sinhala_term, f' {latin_term} ')
    # Pass 2: per-character table lookup.
    pieces = []
    for ch in working:
        if ch.isascii():
            pieces.append(ch)
        elif ch in SINHALA_MAP:
            pieces.append(SINHALA_MAP[ch])
        elif ch == ' ' or ch.isalnum():
            pieces.append(ch)
    return ' '.join(''.join(pieces).split())
# =============================================================================
# KHMER TRANSLITERATION (UNGEGN Romanization)
# =============================================================================
# Khmer consonants (with inherent 'a' or 'o' vowel depending on register)
# Khmer character tables for the UNGEGN fallback in transliterate_khmer.
# NOTE(review): the single-character keys below were corrupted to empty
# strings (non-ASCII stripped in the file's history) and can never match
# a character; the multi-character KHMER_HERITAGE_VOCAB entries survived
# intact and carry the fallback -- TODO restore keys.
KHMER_CONSONANTS = {
    # First series (inherent 'aa' in open syllables)
    '': 'k', '': 'kh', '': 'k', '': 'kh', '': 'ng',
    '': 'ch', '': 'chh', '': 'ch', '': 'chh', '': 'nh',
    '': 'd', '': 'th', '': 'd', '': 'th', '': 'n',
    '': 't', '': 'th', '': 't', '': 'th', '': 'n',
    '': 'b', '': 'ph', '': 'p', '': 'ph', '': 'm',
    '': 'y', '': 'r', '': 'l', '': 'v', '': 'sh',
    '': 's', '': 's', '': 'h', '': 'l', '': '',
}
# Khmer dependent vowels
KHMER_VOWELS = {
    '': 'a', '': 'i', '': 'ii', '': 'eu', '': 'eu',
    '': 'o', '': 'ou', '': 'ua', '': 'ae',
    '': 'ae', '': 'ai', '': 'ao', '': 'au',
    '': '',  # Subscript consonant marker (coeng)
}
# Khmer independent vowels
KHMER_INDEP_VOWELS = {
    '': 'i', '': 'ii', '': 'u', '': 'uk',
    '': 'uu', '': 'ou', '': 'ry', '': 'ryy',
    '': 'ly', '': 'lyy', '': 'ae', '': 'ai',
    '': 'ao', '': 'ao', '': 'au',
}
# Khmer special signs
KHMER_SPECIAL = {
    '': 'm',  # Nikahit (anusvara)
    '': 'h',  # Visarga
    '': '',  # Bantoc (shortens vowel)
    '': 'r',  # Robat (repha)
    '': '',  # Toandakhiat (silent letter)
    '': '',  # Kakabat (emphasis)
    '': '',  # Ahsda (obsolete)
    '': '',  # Viriam (obsolete punctuation)
    '': ':',  # Camnuc pii kuuh (colon)
    '': '.',  # Khan (period)
    '': '.',  # Bariyoosan (end mark)
    '': '',  # Koomuut (section mark)
}
# Khmer numerals
KHMER_NUMERALS = {
    '': '0', '': '1', '': '2', '': '3', '': '4',
    '': '5', '': '6', '': '7', '': '8', '': '9',
}
# Common Khmer heritage vocabulary
KHMER_HERITAGE_VOCAB = {
    # Museums/Memorials
    'សារមន្ទីរ': 'saaramontir',
    'សារមន្ទីរទួលស្លែង': 'saaramontir tuol sleng',
    'ទួលស្លែង': 'tuol sleng',
    # Archives/Libraries
    'បណ្ណាល័យ': 'bannaalay',
    'ឯកសារដ្ឋាន': 'aeksaarathan',
    'ជាតិ': 'cheate',
    # Places
    'ភ្នំពេញ': 'phnom penh',
    'អង្គរ': 'angkor',
    'សៀមរាប': 'siem reap',
    # Cultural terms
    'វប្បធម៌': 'vabpatham',
    'បេតិកភណ្ឌ': 'betekaphon',
    'ប្រវត្តិសាស្ត្រ': 'pravattisaas',
}
def transliterate_khmer(text: str) -> str:
    """Transliterate Khmer to Latin (UNGEGN system).

    Args:
        text: Text in Khmer script

    Returns:
        Romanized text using UNGEGN standard
    """
    # Pass 1: substitute whole known phrases, longest key first, so a long
    # phrase is never shadowed by one of its shorter substrings.
    expanded = text
    for phrase, roman in sorted(KHMER_HERITAGE_VOCAB.items(),
                                key=lambda kv: -len(kv[0])):
        expanded = expanded.replace(phrase, f' {roman} ')

    # Pass 2: character-by-character lookup. Table order fixes precedence:
    # consonants, dependent vowels, independent vowels, special signs, digits.
    tables = (KHMER_CONSONANTS, KHMER_VOWELS, KHMER_INDEP_VOWELS,
              KHMER_SPECIAL, KHMER_NUMERALS)
    pieces = []
    for ch in expanded:
        # ASCII (including the spaces inserted in pass 1) passes through.
        if ch.isascii():
            pieces.append(ch)
            continue
        for table in tables:
            if ch in table:
                pieces.append(table[ch])
                break
        else:
            # Unknown non-ASCII character: keep alphanumerics verbatim,
            # silently drop everything else (punctuation, stray marks).
            if ch == ' ' or ch.isalnum():
                pieces.append(ch)

    # Collapse whitespace runs introduced by the phrase substitution.
    return ' '.join(''.join(pieces).split())
# =============================================================================
# MAIN TRANSLITERATION DISPATCHER
# =============================================================================
# ISO 639-1 language code -> script name. Script names key into
# TRANSLITERATORS; languages not listed here fall back to auto-detection
# in transliterate().
LANG_SCRIPT_MAP = {
    'ru': 'cyrillic', 'uk': 'cyrillic', 'bg': 'cyrillic',
    'sr': 'cyrillic', 'kk': 'cyrillic',
    'zh': 'chinese',
    'ja': 'japanese',
    'ko': 'korean',
    'ar': 'arabic', 'fa': 'arabic', 'ur': 'arabic',
    'he': 'hebrew',
    'el': 'greek',
    'hi': 'devanagari', 'ne': 'devanagari',
    'bn': 'bengali',
    'th': 'thai',
    'hy': 'armenian',
    'ka': 'georgian',
    'si': 'sinhala',
    'km': 'khmer',
}
# Script name -> transliteration callable. Each callable takes the input
# text and returns a romanized string; the cyrillic entry additionally
# accepts a language code (see transliterate()).
TRANSLITERATORS = {
    'cyrillic': transliterate_cyrillic,
    'chinese': transliterate_chinese,
    'japanese': transliterate_japanese,
    'korean': transliterate_korean,
    'arabic': transliterate_arabic,
    'hebrew': transliterate_hebrew,
    'greek': transliterate_greek,
    'devanagari': transliterate_devanagari,
    'bengali': transliterate_bengali,
    'thai': transliterate_thai,
    'armenian': transliterate_armenian,
    'georgian': transliterate_georgian,
    'sinhala': transliterate_sinhala,
    'khmer': transliterate_khmer,
    'latin': lambda t: t,  # No transliteration needed
}
def transliterate(text: str, lang: Optional[str] = None) -> str:
    """
    Transliterate text from non-Latin script to Latin.

    Args:
        text: Input text in any script
        lang: Optional ISO 639-1 language code (e.g., 'ru', 'zh', 'ko').
              If not provided (or unknown), the script is auto-detected.

    Returns:
        Transliterated text with combining diacritical marks removed.
    """
    if not text:
        return text

    # Resolve the script: an explicit, known language wins; otherwise
    # fall back to auto-detection from the text itself.
    script = LANG_SCRIPT_MAP.get(lang) if lang else None
    if script is None:
        script = detect_script(text)

    func = TRANSLITERATORS.get(script)
    if func is None:
        # Unrecognized script name: leave the text untouched.
        romanized = text
    elif script == 'cyrillic' and lang:
        # Cyrillic romanization differs per language (ru/uk/bg/sr/kk).
        romanized = func(text, lang)
    else:
        romanized = func(text)

    # Strip combining marks (category Mn) after canonical decomposition,
    # e.g. 'é' -> 'e'. Non-decomposable letters (ø, đ, ...) pass through.
    decomposed = unicodedata.normalize('NFD', romanized)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
def transliterate_for_abbreviation(emic_name: str, lang: str) -> str:
    """
    Transliterate emic name for GHCID abbreviation generation.

    This is the main entry point for GHCID generation scripts.

    Args:
        emic_name: Institution name in original script
        lang: ISO 639-1 language code

    Returns:
        Transliterated name ready for abbreviation extraction
    """
    # Romanize first, then sanitize for abbreviation extraction.
    romanized = transliterate(emic_name, lang)
    # Keep only ASCII letters, whitespace, hyphens and apostrophes;
    # everything else becomes a space so adjacent words do not fuse.
    cleaned = re.sub(r"[^a-zA-Z\s\-']", ' ', romanized)
    # Collapse whitespace runs into single spaces and trim the ends.
    return ' '.join(cleaned.split())
# =============================================================================
# CLI INTERFACE
# =============================================================================
def main():
    """CLI entry point for the transliteration utility.

    Modes (first matching option wins):
      --libs    report which optional transliteration libraries are importable
      --file    read emic_name / name_language from a custodian YAML file
      --text    transliterate a literal string
    With --detect, only the detected script is printed (no transliteration).
    With no options, the argparse help text is printed.
    """
    parser = argparse.ArgumentParser(
        description='Transliterate non-Latin script text to Latin characters',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
    python transliterate_emic_names.py --text "Институт" --lang ru
    python transliterate_emic_names.py --text "东巴文化博物院" --lang zh
    python transliterate_emic_names.py --file data/custodian/example.yaml

Supported languages:
    ru (Russian), uk (Ukrainian), bg (Bulgarian), sr (Serbian), kk (Kazakh)
    zh (Chinese), ja (Japanese), ko (Korean)
    ar (Arabic), fa (Persian), ur (Urdu), he (Hebrew)
    el (Greek), hi (Hindi), ne (Nepali), bn (Bengali)
    th (Thai), hy (Armenian), ka (Georgian)
'''
    )
    parser.add_argument('--text', '-t', help='Text to transliterate')
    parser.add_argument('--lang', '-l', help='ISO 639-1 language code')
    parser.add_argument('--file', '-f', help='YAML file to process')
    parser.add_argument('--detect', '-d', action='store_true',
                        help='Only detect script, do not transliterate')
    parser.add_argument('--libs', action='store_true',
                        help='Show available transliteration libraries')
    args = parser.parse_args()

    if args.libs:
        print("Available transliteration libraries:")
        for lib, available in AVAILABLE_LIBS.items():
            status = "✓ installed" if available else "✗ not installed"
            print(f"  {lib}: {status}")
        return

    if args.file:
        import yaml
        with open(args.file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # BUGFIX: guard against an empty YAML file (safe_load returns None)
        # and a null/non-dict 'custodian_name' entry, both of which used to
        # raise AttributeError on .get().
        custodian = (data or {}).get('custodian_name') or {}
        if not isinstance(custodian, dict):
            custodian = {}
        emic_name = custodian.get('emic_name')
        lang = custodian.get('name_language')
        if not emic_name:
            print(f"Error: No emic_name found in {args.file}")
            return
        print(f"Emic name: {emic_name}")
        print(f"Language: {lang or '(auto-detect)'}")
        if args.detect:
            script = detect_script(emic_name)
            print(f"Detected script: {script}")
        else:
            result = transliterate_for_abbreviation(emic_name, lang)
            print(f"Transliterated: {result}")
        return

    if args.text:
        if args.detect:
            script = detect_script(args.text)
            print(f"Input: {args.text}")
            print(f"Detected script: {script}")
        else:
            result = transliterate_for_abbreviation(args.text, args.lang)
            print(f"Input: {args.text}")
            print(f"Language: {args.lang or '(auto-detect)'}")
            print(f"Output: {result}")
        return

    # No actionable options given: show usage.
    parser.print_help()
# Run the CLI only when executed directly, not when imported as a module.
if __name__ == '__main__':
    main()