#!/usr/bin/env python3
"""
Fix GHCID abbreviations containing diacritics.

This script normalizes diacritics in GHCID abbreviation components to ASCII,
regenerates UUIDs and numeric IDs, updates GHCID history, and renames files.

Rule: ABBREV-DIACRITICS
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md

Usage:
    python scripts/fix_ghcid_diacritics.py --dry-run  # Preview changes
    python scripts/fix_ghcid_diacritics.py            # Apply changes
"""

import argparse
import hashlib
import re
import shutil
import sys
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# NOTE: PyYAML is imported lazily inside process_file() so that the pure
# string/ID helpers in this module stay importable (and unit-testable) on
# systems without PyYAML installed.

# GHCID namespace UUID for deterministic UUID generation
# (this is the RFC 4122 "URL" namespace UUID).
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")

# Regex pattern for common diacritics.
# FIX: the original pattern was missing D/N/T-caron (Ď/Ň/Ť/ď/ň/ť), so Czech
# GHCIDs containing them were never detected even though
# normalize_diacritics() would fix them; the pattern now matches the one
# used by scripts/resolve_diacritics_collisions.py.
DIACRITICS_PATTERN = re.compile(
    r'[ČŘŠŽĚŮÁÉÍÓÚÝÄÖÜßÑÇÀÈÌÒÙŁŃŚŹŻĄĘÅØÆŐŰÂÊÎÔÛĎŇŤ'
    r'čřšžěůáéíóúýäöüñçàèìòùłńśźżąęåøæőűâêîôûďňť]'
)


def normalize_diacritics(text: str) -> str:
    """
    Normalize diacritics to ASCII equivalents.

    Uses Unicode NFD decomposition to separate base characters from
    combining marks, then removes the combining marks.

    Note: characters that do not decompose into base + combining mark
    (e.g. 'ß', 'ø', 'æ', 'ł') are returned unchanged.

    Examples:
        "Č" → "C"
        "Ř" → "R"
        "Ö" → "O"
        "ñ" → "n"
    """
    decomposed = unicodedata.normalize('NFD', text)
    # Category 'Mn' = Mark, Nonspacing (the combining accents).
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


def has_diacritics_in_ghcid(ghcid: str) -> bool:
    """Check if GHCID contains any diacritics (in any component).

    Diacritics can appear in:
    - City code (e.g., 'TŘE' for Třebíč)
    - Abbreviation (e.g., 'VHSPAOČRZS')
    """
    return bool(DIACRITICS_PATTERN.search(ghcid))


def has_diacritics_in_abbreviation(ghcid: str) -> bool:
    """Check if the GHCID abbreviation component contains diacritics.

    GHCID format: CC-RR-CCC-T-ABBREV or CC-RR-CCC-T-ABBREV-suffix;
    the abbreviation is the 5th hyphen-separated component (index 4).
    Returns False when the GHCID has fewer than 5 components.
    """
    parts = ghcid.split('-')
    if len(parts) >= 5:
        return bool(DIACRITICS_PATTERN.search(parts[4]))
    return False


def fix_ghcid_diacritics(ghcid: str) -> str:
    """
    Fix diacritics in ALL GHCID components.

    Normalizes diacritics in all parts: country, region, city, type,
    abbreviation, and any suffix components.
    """
    return '-'.join(normalize_diacritics(part) for part in ghcid.split('-'))


def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate a deterministic UUID v5 (namespace + name) from a GHCID string."""
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string))


def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8 from the SHA-256 hash of a GHCID string."""
    sha256_hash = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    # First 16 hash bytes become the UUID payload.
    uuid_bytes = bytearray(sha256_hash[:16])
    # Set version nibble to 8 (custom UUID).
    uuid_bytes[6] = (uuid_bytes[6] & 0x0f) | 0x80
    # Set variant bits to RFC 4122.
    uuid_bytes[8] = (uuid_bytes[8] & 0x3f) | 0x80
    return str(uuid.UUID(bytes=bytes(uuid_bytes)))


def generate_numeric_id(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID from the first 8 bytes of SHA-256."""
    sha256_hash = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(sha256_hash[:8], byteorder='big')


def process_file(file_path: Path, dry_run: bool = True) -> Optional[dict]:
    """
    Process a single YAML file to fix GHCID diacritics.

    Rewrites the 'ghcid' section, mirrors the new values into the
    'identifiers' section, prepends a ghcid_history entry, and renames the
    file so the filename matches the new GHCID.

    Returns a dict describing the change, or None if no change is needed
    (unreadable file, missing or already-clean GHCID).
    """
    import yaml  # local import: only needed when files are actually processed

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f"  Error reading {file_path}: {e}")
        return None

    if not data or 'ghcid' not in data:
        return None

    ghcid_section = data.get('ghcid', {})
    old_ghcid = ghcid_section.get('ghcid_current', '')

    if not has_diacritics_in_ghcid(old_ghcid):
        return None

    new_ghcid = fix_ghcid_diacritics(old_ghcid)
    if new_ghcid == old_ghcid:
        # Pattern matched a character NFD cannot decompose (e.g. 'ß');
        # nothing can be fixed here.
        return None

    # Derive all identifiers from the corrected GHCID.
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()

    change_info = {
        'file': str(file_path),
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_uuid': ghcid_section.get('ghcid_uuid', ''),
        'new_uuid': new_uuid_v5,
        'old_numeric': ghcid_section.get('ghcid_numeric', 0),
        'new_numeric': new_numeric,
    }

    if dry_run:
        return change_info

    # Update ghcid section (the original GHCID field is kept as-is for
    # historical reference).
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric

    # Prepend a history entry for the fix; close the validity window of
    # the previous most-recent entry.
    ghcid_history = ghcid_section.get('ghcid_history', [])
    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': f"Normalized diacritics to ASCII per ABBREV-DIACRITICS rule (was: {old_ghcid})"
    }
    if ghcid_history and 'valid_to' not in ghcid_history[0]:
        ghcid_history[0]['valid_to'] = timestamp_now
    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section

    # Mirror the new values into the identifiers section.
    identifiers = data.get('identifiers', [])
    for ident in identifiers:
        scheme = ident.get('identifier_scheme')
        if scheme == 'GHCID':
            ident['identifier_value'] = new_ghcid
        elif scheme == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
        elif scheme == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
        elif scheme == 'GHCID_NUMERIC':
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers

    # Write updated file.
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename the file so that filename == GHCID.
    new_filename = f"{new_ghcid}.yaml"
    if file_path.name != new_filename:
        new_file_path = file_path.parent / new_filename
        if new_file_path.exists():
            # Collision: leave the file in place; handled by
            # scripts/resolve_diacritics_collisions.py.
            print(f"  Warning: Target file already exists: {new_file_path}")
        else:
            shutil.move(str(file_path), str(new_file_path))
            change_info['new_file'] = str(new_file_path)

    return change_info


def find_affected_files(custodian_dir: Path) -> list[Path]:
    """Find all YAML files with diacritics in the GHCID-derived filename.

    Uses filename-based detection for speed, since filenames match GHCID.

    FIX: uses pathlib's recursive rglob() instead of shelling out to
    find(1). This is portable (works without a Unix 'find' binary) and
    removes the inconsistency where the old fallback glob("*.yaml") was
    non-recursive while find(1) was recursive. Results are sorted for
    deterministic processing order.
    """
    return [p for p in sorted(custodian_dir.rglob("*.yaml"))
            if DIACRITICS_PATTERN.search(p.stem)]


def main() -> int:
    """CLI entry point. Returns a process exit code (0 on success)."""
    parser = argparse.ArgumentParser(
        description="Fix GHCID abbreviations containing diacritics"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Preview changes without modifying files"
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=0,
        help="Limit number of files to process (0 = no limit)"
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('data/custodian'),
        help="Path to custodian directory"
    )
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        return 1

    print(f"Scanning {custodian_dir} for files with diacritics in GHCID abbreviation...")
    affected_files = find_affected_files(custodian_dir)
    print(f"Found {len(affected_files)} affected files")

    if args.limit > 0:
        affected_files = affected_files[:args.limit]
        print(f"Limited to {args.limit} files")

    if args.dry_run:
        print("\n=== DRY RUN (no changes will be made) ===\n")
    else:
        print("\n=== APPLYING CHANGES ===\n")

    changes = []
    for i, file_path in enumerate(affected_files, 1):
        print(f"[{i}/{len(affected_files)}] Processing {file_path.name}...")
        change = process_file(file_path, dry_run=args.dry_run)
        if change:
            changes.append(change)
            print(f"  {change['old_ghcid']} → {change['new_ghcid']}")

    print(f"\n=== SUMMARY ===")
    print(f"Files processed: {len(affected_files)}")
    print(f"Files changed: {len(changes)}")

    if args.dry_run and changes:
        print("\nTo apply changes, run without --dry-run flag")

    # Per-country breakdown (country code is the first GHCID component).
    if changes:
        countries = {}
        for c in changes:
            cc = c['old_ghcid'].split('-')[0]
            countries[cc] = countries.get(cc, 0) + 1

        print("\nBy country:")
        for cc, count in sorted(countries.items(), key=lambda x: -x[1]):
            print(f"  {cc}: {count}")

    return 0


if __name__ == '__main__':
    sys.exit(main())
#!/usr/bin/env python3
"""
Resolve GHCID collisions caused by diacritics normalization.

When a file with diacritics normalizes to the same GHCID as an existing file,
the diacritics file gets a name suffix per AGENTS.md collision rules.

Usage:
    python scripts/resolve_diacritics_collisions.py --dry-run  # Preview changes
    python scripts/resolve_diacritics_collisions.py            # Apply changes
"""

import argparse
import hashlib
import re
import shutil
import sys
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# NOTE: PyYAML is imported lazily inside resolve_collision() so that the
# pure string/ID helpers stay importable without PyYAML installed.

# GHCID namespace UUID for deterministic UUID generation
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")

# Regex pattern for common diacritics (includes Ď/Ň/Ť forms).
DIACRITICS_PATTERN = re.compile(
    r'[ČŘŠŽĚŮÁÉÍÓÚÝÄÖÜßÑÇÀÈÌÒÙŁŃŚŹŻĄĘÅØÆŐŰÂÊÎÔÛĎŇŤ'
    r'čřšžěůáéíóúýäöüñçàèìòùłńśźżąęåøæőűâêîôûďňť]'
)


def normalize_diacritics(text: str) -> str:
    """Normalize diacritics to ASCII by stripping combining marks (NFD)."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


def generate_name_suffix(native_name: str) -> str:
    """Convert a native-language institution name to a snake_case suffix.

    Steps: strip diacritics → lowercase → drop punctuation (apostrophes,
    quotes, commas, brackets, dashes-as-punctuation) → spaces/hyphens to
    underscores → drop anything not [a-z0-9_] → collapse/trim underscores.
    """
    decomposed = unicodedata.normalize('NFD', native_name)
    ascii_name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    lowercase = ascii_name.lower()

    # Remove apostrophes (straight and curly), commas, and other punctuation;
    # \u2012/\u2013/\u2014 are figure dash, en dash, em dash.
    no_punct = re.sub(r"['\u2019`\",.:;!?()\[\]{}\u2012\u2013\u2014]", '', lowercase)

    # Replace spaces and hyphens with underscores.
    underscored = re.sub(r'[\s\-]+', '_', no_punct)

    # Remove any remaining non-alphanumeric characters (except underscores).
    clean = re.sub(r'[^a-z0-9_]', '', underscored)

    # Collapse multiple underscores and trim the ends.
    return re.sub(r'_+', '_', clean).strip('_')


def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate a deterministic UUID v5 from a GHCID string."""
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string))


def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8 from the SHA-256 hash of a GHCID string."""
    sha256_hash = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    uuid_bytes = bytearray(sha256_hash[:16])
    # Version nibble 8 (custom), variant RFC 4122.
    uuid_bytes[6] = (uuid_bytes[6] & 0x0f) | 0x80
    uuid_bytes[8] = (uuid_bytes[8] & 0x3f) | 0x80
    return str(uuid.UUID(bytes=bytes(uuid_bytes)))


def generate_numeric_id(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID from the first 8 bytes of SHA-256."""
    sha256_hash = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(sha256_hash[:8], byteorder='big')


def find_collision_pairs(custodian_dir: Path) -> list[tuple[Path, Path, str]]:
    """Find files with diacritics that collide with existing ASCII files.

    Returns list of (diacritics_file, ascii_file, ascii_ghcid).
    """
    collisions = []

    for yaml_file in custodian_dir.glob("*.yaml"):
        filename = yaml_file.stem  # Without .yaml

        if not DIACRITICS_PATTERN.search(filename):
            continue

        # A collision exists when the normalized filename already exists.
        ascii_filename = normalize_diacritics(filename)
        ascii_file = custodian_dir / f"{ascii_filename}.yaml"

        if ascii_file.exists():
            collisions.append((yaml_file, ascii_file, ascii_filename))

    return collisions


def resolve_collision(diacritics_file: Path, ascii_ghcid: str, dry_run: bool = True) -> Optional[dict]:
    """
    Resolve a collision by adding a name suffix to the diacritics file.

    The diacritics file gets a name suffix since it's being added later.
    Returns a dict describing the change, or None when the file is
    unreadable, empty, or carries no institution name to derive the
    suffix from.
    """
    import yaml  # local import: only needed when files are actually processed

    try:
        with open(diacritics_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f"  Error reading {diacritics_file}: {e}")
        return None

    if not data:
        return None

    # The suffix is derived from the institution's native name.
    original_entry = data.get('original_entry', {})
    inst_name = original_entry.get('name', '')

    if not inst_name:
        print(f"  Warning: No institution name found in {diacritics_file}")
        return None

    name_suffix = generate_name_suffix(inst_name)
    new_ghcid = f"{ascii_ghcid}-{name_suffix}"

    # Previous GHCID; fall back to the filename when the section is missing.
    ghcid_section = data.get('ghcid', {})
    old_ghcid = ghcid_section.get('ghcid_current', diacritics_file.stem)

    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()

    change_info = {
        'file': str(diacritics_file),
        'institution_name': inst_name,
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'name_suffix': name_suffix,
    }

    if dry_run:
        return change_info

    # Update ghcid section.
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric

    # Prepend a history entry; close the previous entry's validity window.
    ghcid_history = ghcid_section.get('ghcid_history', [])
    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': f"Name suffix added to resolve collision with {ascii_ghcid} (was: {old_ghcid})"
    }
    if ghcid_history and 'valid_to' not in ghcid_history[0]:
        ghcid_history[0]['valid_to'] = timestamp_now
    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section

    # Mirror the new values into the identifiers section.
    identifiers = data.get('identifiers', [])
    for ident in identifiers:
        scheme = ident.get('identifier_scheme')
        if scheme == 'GHCID':
            ident['identifier_value'] = new_ghcid
        elif scheme == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
        elif scheme == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
        elif scheme == 'GHCID_NUMERIC':
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers

    # Write updated file.
    with open(diacritics_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file to match the new (suffixed) GHCID.
    new_file_path = diacritics_file.parent / f"{new_ghcid}.yaml"
    if new_file_path.exists():
        print(f"  Warning: Target file already exists: {new_file_path}")
    else:
        shutil.move(str(diacritics_file), str(new_file_path))
        change_info['new_file'] = str(new_file_path)

    return change_info


def main() -> int:
    """CLI entry point. Returns a process exit code (0 on success)."""
    parser = argparse.ArgumentParser(
        description="Resolve GHCID collisions caused by diacritics normalization"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Preview changes without modifying files"
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('data/custodian'),
        help="Path to custodian directory"
    )

    args = parser.parse_args()
    custodian_dir = args.custodian_dir

    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        return 1

    print(f"Scanning {custodian_dir} for diacritics collision pairs...")
    collisions = find_collision_pairs(custodian_dir)
    print(f"Found {len(collisions)} collision pairs\n")

    if args.dry_run:
        print("=== DRY RUN (no changes will be made) ===\n")
    else:
        print("=== APPLYING CHANGES ===\n")

    changes = []
    for i, (diacritics_file, ascii_file, ascii_ghcid) in enumerate(collisions, 1):
        print(f"[{i}/{len(collisions)}] Collision:")
        print(f"  Diacritics file: {diacritics_file.name}")
        print(f"  Collides with: {ascii_file.name}")

        change = resolve_collision(diacritics_file, ascii_ghcid, dry_run=args.dry_run)

        if change:
            changes.append(change)
            print(f"  Institution: {change['institution_name']}")
            print(f"  GHCID change: {change['old_ghcid']} → {change['new_ghcid']}")
            if 'new_file' in change:
                print(f"  New file: {Path(change['new_file']).name}")
        print()

    print(f"=== SUMMARY ===")
    print(f"Collisions found: {len(collisions)}")
    print(f"Files resolved: {len(changes)}")

    if args.dry_run and changes:
        print("\nTo apply changes, run without --dry-run flag")

    return 0


if __name__ == '__main__':
    sys.exit(main())
+ +Usage: + # As a module + from scripts.transliterate_emic_names import transliterate_for_abbreviation + + latin = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru") + # Result: "Institut vostochnykh rukopisey RAN" + + # As a CLI tool + python scripts/transliterate_emic_names.py --text "東巴文化博物院" --lang zh + python scripts/transliterate_emic_names.py --file data/custodian/example.yaml + +Standards: + - Cyrillic (ru, uk, bg, sr, kk): ISO 9:1995 + - Chinese (zh): Hanyu Pinyin (ISO 7098) + - Japanese (ja): Modified Hepburn + - Korean (ko): Revised Romanization + - Arabic (ar, fa, ur): ISO 233-2/3 + - Hebrew (he): ISO 259-3 + - Greek (el): ISO 843 + - Devanagari (hi, ne, bn): ISO 15919 + - Thai (th): ISO 11940-2 + - Armenian (hy): ISO 9985 + - Georgian (ka): ISO 9984 + +Author: GLAM Project +Created: 2025-12-08 +""" + +import argparse +import re +import unicodedata +from pathlib import Path +from typing import Optional, Dict, List, Tuple + +# Try importing optional transliteration libraries +AVAILABLE_LIBS: Dict[str, bool] = {} + +try: + from pypinyin import pinyin, Style + AVAILABLE_LIBS['pypinyin'] = True +except ImportError: + AVAILABLE_LIBS['pypinyin'] = False + +try: + import pykakasi + AVAILABLE_LIBS['pykakasi'] = True +except ImportError: + AVAILABLE_LIBS['pykakasi'] = False + +try: + from transliterate import translit + AVAILABLE_LIBS['transliterate'] = True +except ImportError: + AVAILABLE_LIBS['transliterate'] = False + + +# ============================================================================= +# SCRIPT DETECTION +# ============================================================================= + +def detect_script(text: str) -> str: + """ + Detect the primary script of the input text. 
+ + Returns one of: + - 'latin': Latin alphabet + - 'cyrillic': Cyrillic script + - 'chinese': Chinese characters (Hanzi) + - 'japanese': Japanese (mixed Kanji/Kana) + - 'korean': Korean Hangul + - 'arabic': Arabic script (includes Persian, Urdu) + - 'hebrew': Hebrew script + - 'greek': Greek script + - 'devanagari': Devanagari (Hindi, Nepali, Sanskrit) + - 'bengali': Bengali script + - 'thai': Thai script + - 'armenian': Armenian script + - 'georgian': Georgian script + - 'sinhala': Sinhala script + - 'khmer': Khmer script + - 'unknown': Cannot determine + """ + script_ranges = { + 'cyrillic': (0x0400, 0x04FF), + 'arabic': (0x0600, 0x06FF), + 'persian_ext': (0x0750, 0x077F), # Arabic Supplement + 'hebrew': (0x0590, 0x05FF), + 'devanagari': (0x0900, 0x097F), + 'bengali': (0x0980, 0x09FF), + 'thai': (0x0E00, 0x0E7F), + 'greek': (0x0370, 0x03FF), + 'armenian': (0x0530, 0x058F), + 'georgian': (0x10A0, 0x10FF), + 'korean': (0xAC00, 0xD7AF), # Hangul syllables + 'korean_jamo': (0x1100, 0x11FF), # Hangul Jamo + 'japanese_hiragana': (0x3040, 0x309F), + 'japanese_katakana': (0x30A0, 0x30FF), + 'chinese': (0x4E00, 0x9FFF), # CJK Unified Ideographs + 'chinese_ext': (0x3400, 0x4DBF), # CJK Extension A + 'sinhala': (0x0D80, 0x0DFF), + 'khmer': (0x1780, 0x17FF), + } + + script_counts: Dict[str, int] = {script: 0 for script in script_ranges} + latin_count = 0 + + for char in text: + code = ord(char) + + # Check Latin + if ('a' <= char <= 'z') or ('A' <= char <= 'Z'): + latin_count += 1 + continue + + # Check other scripts + for script, (start, end) in script_ranges.items(): + if start <= code <= end: + script_counts[script] += 1 + break + + # Determine primary script + if latin_count > 0 and all(c == 0 for c in script_counts.values()): + return 'latin' + + # Merge related scripts + script_counts['arabic'] += script_counts.get('persian_ext', 0) + script_counts['korean'] += script_counts.get('korean_jamo', 0) + script_counts['chinese'] += script_counts.get('chinese_ext', 0) + + # 
Handle CJK disambiguation + cjk_count = script_counts.get('chinese', 0) + hiragana_count = script_counts.get('japanese_hiragana', 0) + katakana_count = script_counts.get('japanese_katakana', 0) + + if hiragana_count > 0 or katakana_count > 0: + return 'japanese' + + # Find max non-Latin script + primary_scripts = ['cyrillic', 'arabic', 'hebrew', 'devanagari', 'bengali', + 'thai', 'greek', 'armenian', 'georgian', 'korean', + 'chinese', 'sinhala', 'khmer'] + + max_script = max(primary_scripts, key=lambda s: script_counts.get(s, 0)) + if script_counts.get(max_script, 0) > 0: + return max_script + + return 'latin' if latin_count > 0 else 'unknown' + + +# ============================================================================= +# CYRILLIC TRANSLITERATION (ISO 9:1995) +# ============================================================================= + +CYRILLIC_MAP = { + # Russian + 'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E', + 'Ё': 'E', 'Ж': 'Zh', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K', + 'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R', + 'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'Kh', 'Ц': 'Ts', + 'Ч': 'Ch', 'Ш': 'Sh', 'Щ': 'Shch', 'Ъ': '', 'Ы': 'Y', 'Ь': '', + 'Э': 'E', 'Ю': 'Yu', 'Я': 'Ya', + 'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', + 'ё': 'e', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', + 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', + 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts', + 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', + 'э': 'e', 'ю': 'yu', 'я': 'ya', + # Ukrainian additions + 'І': 'I', 'і': 'i', 'Ї': 'Yi', 'ї': 'yi', 'Є': 'Ye', 'є': 'ye', + 'Ґ': 'G', 'ґ': 'g', "'": '', + # Bulgarian additions + 'Ъ': 'A', 'ъ': 'a', # Bulgarian hard sign = schwa + # Serbian additions + 'Ђ': 'Dj', 'ђ': 'dj', 'Ј': 'J', 'ј': 'j', 'Љ': 'Lj', 'љ': 'lj', + 'Њ': 'Nj', 'њ': 'nj', 'Ћ': 'C', 'ћ': 'c', 'Џ': 'Dz', 'џ': 'dz', + # Kazakh additions (Cyrillic-based) + 'Ә': 'A', 'ә': 
'a', 'Ғ': 'Gh', 'ғ': 'gh', 'Қ': 'Q', 'қ': 'q', + 'Ң': 'Ng', 'ң': 'ng', 'Ө': 'O', 'ө': 'o', 'Ұ': 'U', 'ұ': 'u', + 'Ү': 'U', 'ү': 'u', 'Һ': 'H', 'һ': 'h', +} + + +def transliterate_cyrillic(text: str, lang: str = 'ru') -> str: + """Transliterate Cyrillic text using ISO 9 mapping.""" + if AVAILABLE_LIBS.get('transliterate'): + try: + return translit(text, lang, reversed=True) + except Exception: + pass + + # Fallback to manual mapping + return ''.join(CYRILLIC_MAP.get(c, c) for c in text) + + +# ============================================================================= +# CHINESE TRANSLITERATION (Hanyu Pinyin) +# ============================================================================= + +# Basic Pinyin dictionary for common museum/library/archive vocabulary +# This allows basic transliteration without pypinyin library +CHINESE_PINYIN_MAP = { + # Numbers + '一': 'yi', '二': 'er', '三': 'san', '四': 'si', '五': 'wu', + '六': 'liu', '七': 'qi', '八': 'ba', '九': 'jiu', '十': 'shi', + '百': 'bai', '千': 'qian', '万': 'wan', + + # Heritage/Museum vocabulary + '博': 'bo', '物': 'wu', '館': 'guan', '馆': 'guan', '院': 'yuan', + '文': 'wen', '化': 'hua', '藝': 'yi', '艺': 'yi', '術': 'shu', '术': 'shu', + '歷': 'li', '历': 'li', '史': 'shi', '遺': 'yi', '遗': 'yi', '產': 'chan', '产': 'chan', + '國': 'guo', '国': 'guo', '立': 'li', '家': 'jia', '民': 'min', '族': 'zu', + '中': 'zhong', '央': 'yang', '省': 'sheng', '市': 'shi', '縣': 'xian', '县': 'xian', + '圖': 'tu', '图': 'tu', '書': 'shu', '书': 'shu', '檔': 'dang', '档': 'dang', '案': 'an', + '美': 'mei', '古': 'gu', '典': 'dian', '藏': 'cang', '品': 'pin', '展': 'zhan', '覽': 'lan', '览': 'lan', + '紀': 'ji', '纪': 'ji', '念': 'nian', '碑': 'bei', '塔': 'ta', '廟': 'miao', '庙': 'miao', + '寺': 'si', '宮': 'gong', '宫': 'gong', '殿': 'dian', '城': 'cheng', '堡': 'bao', + '樓': 'lou', '楼': 'lou', '閣': 'ge', '阁': 'ge', '亭': 'ting', '園': 'yuan', '园': 'yuan', + '研': 'yan', '究': 'jiu', '所': 'suo', '中': 'zhong', '心': 'xin', + '學': 'xue', '学': 'xue', '院': 'yuan', '校': 'xiao', '系': 'xi', + 
'會': 'hui', '会': 'hui', '社': 'she', '團': 'tuan', '团': 'tuan', + '東': 'dong', '东': 'dong', '西': 'xi', '南': 'nan', '北': 'bei', + '京': 'jing', '海': 'hai', '山': 'shan', '河': 'he', '江': 'jiang', + '大': 'da', '小': 'xiao', '新': 'xin', '老': 'lao', '古': 'gu', + '自': 'zi', '然': 'ran', '科': 'ke', '技': 'ji', + '巴': 'ba', '東': 'dong', '納': 'na', '纳': 'na', '西': 'xi', + '故': 'gu', '宮': 'gong', + '基': 'ji', '金': 'jin', '銀': 'yin', '银': 'yin', + '教': 'jiao', '育': 'yu', '傳': 'chuan', '传': 'chuan', '統': 'tong', '统': 'tong', + '絲': 'si', '丝': 'si', '綢': 'chou', '绸': 'chou', '路': 'lu', + '陶': 'tao', '瓷': 'ci', '玉': 'yu', '石': 'shi', '銅': 'tong', '铜': 'tong', + '畫': 'hua', '画': 'hua', '雕': 'diao', '塑': 'su', + '民': 'min', '俗': 'su', '風': 'feng', '风': 'feng', '土': 'tu', + '革': 'ge', '命': 'ming', '戰': 'zhan', '战': 'zhan', '爭': 'zheng', '争': 'zheng', + '軍': 'jun', '军': 'jun', '事': 'shi', '航': 'hang', '空': 'kong', '天': 'tian', + '宗': 'zong', '佛': 'fo', '道': 'dao', '儒': 'ru', + '絃': 'xian', '弦': 'xian', '琴': 'qin', '樂': 'yue', '乐': 'yue', + '舞': 'wu', '劇': 'ju', '剧': 'ju', '戲': 'xi', '戏': 'xi', + '茶': 'cha', '酒': 'jiu', '食': 'shi', '餐': 'can', + '衣': 'yi', '服': 'fu', '紡': 'fang', '纺': 'fang', '織': 'zhi', '织': 'zhi', + '建': 'jian', '築': 'zhu', '筑': 'zhu', '房': 'fang', '屋': 'wu', + '水': 'shui', '電': 'dian', '电': 'dian', '火': 'huo', '木': 'mu', + '農': 'nong', '农': 'nong', '業': 'ye', '业': 'ye', '工': 'gong', '商': 'shang', + '醫': 'yi', '医': 'yi', '藥': 'yao', '药': 'yao', + '人': 'ren', '物': 'wu', '生': 'sheng', '活': 'huo', + '和': 'he', '平': 'ping', '友': 'you', '誼': 'yi', '谊': 'yi', + '港': 'gang', '澳': 'ao', '台': 'tai', '灣': 'wan', '湾': 'wan', + '華': 'hua', '华': 'hua', '僑': 'qiao', '侨': 'qiao', + '海': 'hai', '外': 'wai', '交': 'jiao', '流': 'liu', + '保': 'bao', '護': 'hu', '护': 'hu', '修': 'xiu', '復': 'fu', '复': 'fu', + '鑒': 'jian', '鉴': 'jian', '定': 'ding', '評': 'ping', '评': 'ping', '估': 'gu', +} + +def transliterate_chinese(text: str) -> str: + """Transliterate Chinese to Pinyin without tone marks.""" + 
if AVAILABLE_LIBS.get('pypinyin'): + result = pinyin(text, style=Style.NORMAL) + return ' '.join([''.join(p) for p in result]) + + # Fallback: use basic vocabulary mapping + result = [] + for char in text: + if char in CHINESE_PINYIN_MAP: + result.append(CHINESE_PINYIN_MAP[char]) + elif char == ' ': + result.append(' ') + elif char.isalnum(): + result.append(char) + + # If we got no result, return warning + if not result: + return '[REQUIRES_PYPINYIN]' + + return ''.join(result) + + +# ============================================================================= +# JAPANESE TRANSLITERATION (Modified Hepburn) +# ============================================================================= + +# Basic Kanji/Kana to Romaji map for common heritage vocabulary +# This allows basic transliteration without pykakasi library +JAPANESE_ROMAJI_MAP = { + # Common heritage vocabulary Kanji + '博': 'haku', '物': 'butsu', '館': 'kan', '院': 'in', + '文': 'bun', '化': 'ka', '藝': 'gei', '術': 'jutsu', + '歷': 'reki', '史': 'shi', '遺': 'i', '產': 'san', + '國': 'koku', '国': 'koku', '立': 'ritsu', '家': 'ka', + '民': 'min', '族': 'zoku', '中': 'chuu', '央': 'ou', + '圖': 'to', '図': 'to', '書': 'sho', '檔': 'tou', '案': 'an', + '美': 'bi', '古': 'ko', '典': 'ten', '藏': 'zou', '品': 'hin', + '展': 'ten', '覽': 'ran', '紀': 'ki', '念': 'nen', + '寺': 'ji', '宮': 'kyuu', '殿': 'den', '城': 'jou', '堡': 'hou', + '樓': 'rou', '閣': 'kaku', '亭': 'tei', '園': 'en', + '研': 'ken', '究': 'kyuu', '所': 'sho', '心': 'shin', + '學': 'gaku', '学': 'gaku', '校': 'kou', '系': 'kei', + '會': 'kai', '会': 'kai', '社': 'sha', '團': 'dan', + '東': 'tou', '西': 'sei', '南': 'nan', '北': 'hoku', + '京': 'kyou', '都': 'to', '海': 'kai', '山': 'zan', '河': 'ka', '川': 'kawa', + '大': 'dai', '小': 'shou', '新': 'shin', '老': 'rou', + '自': 'ji', '然': 'nen', '科': 'ka', '技': 'gi', + '故': 'ko', '金': 'kin', '銀': 'gin', + '教': 'kyou', '育': 'iku', '傳': 'den', '統': 'tou', + '陶': 'tou', '瓷': 'ji', '玉': 'gyoku', '石': 'seki', '銅': 'dou', + '畫': 'ga', '画': 'ga', '雕': 'chou', '塑': 
'so', + '俗': 'zoku', '風': 'fuu', '土': 'do', + '革': 'kaku', '命': 'mei', '戰': 'sen', '爭': 'sou', + '軍': 'gun', '事': 'ji', '航': 'kou', '空': 'kuu', '天': 'ten', + '宗': 'shuu', '佛': 'butsu', '道': 'dou', '儒': 'ju', + '琴': 'kin', '樂': 'gaku', '舞': 'bu', '劇': 'geki', '戲': 'gi', + '茶': 'cha', '酒': 'shu', '食': 'shoku', '餐': 'san', + '衣': 'i', '服': 'fuku', '紡': 'bou', '織': 'shoku', + '建': 'ken', '築': 'chiku', '房': 'bou', '屋': 'oku', + '水': 'sui', '電': 'den', '火': 'ka', '木': 'moku', + '農': 'nou', '業': 'gyou', '工': 'kou', '商': 'shou', + '醫': 'i', '藥': 'yaku', '人': 'jin', '生': 'sei', '活': 'katsu', + '和': 'wa', '平': 'hei', '友': 'yuu', '誼': 'gi', + '港': 'kou', '灣': 'wan', '華': 'ka', '僑': 'kyou', + '外': 'gai', '交': 'kou', '流': 'ryuu', + '保': 'ho', '護': 'go', '修': 'shuu', '復': 'fuku', + '鑒': 'kan', '定': 'tei', '評': 'hyou', '估': 'ko', + '記': 'ki', '録': 'roku', '資': 'shi', '料': 'ryou', + # Hiragana + 'あ': 'a', 'い': 'i', 'う': 'u', 'え': 'e', 'お': 'o', + 'か': 'ka', 'き': 'ki', 'く': 'ku', 'け': 'ke', 'こ': 'ko', + 'さ': 'sa', 'し': 'shi', 'す': 'su', 'せ': 'se', 'そ': 'so', + 'た': 'ta', 'ち': 'chi', 'つ': 'tsu', 'て': 'te', 'と': 'to', + 'な': 'na', 'に': 'ni', 'ぬ': 'nu', 'ね': 'ne', 'の': 'no', + 'は': 'ha', 'ひ': 'hi', 'ふ': 'fu', 'へ': 'he', 'ほ': 'ho', + 'ま': 'ma', 'み': 'mi', 'む': 'mu', 'め': 'me', 'も': 'mo', + 'や': 'ya', 'ゆ': 'yu', 'よ': 'yo', + 'ら': 'ra', 'り': 'ri', 'る': 'ru', 'れ': 're', 'ろ': 'ro', + 'わ': 'wa', 'を': 'wo', 'ん': 'n', + # Katakana + 'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o', + 'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke', 'コ': 'ko', + 'サ': 'sa', 'シ': 'shi', 'ス': 'su', 'セ': 'se', 'ソ': 'so', + 'タ': 'ta', 'チ': 'chi', 'ツ': 'tsu', 'テ': 'te', 'ト': 'to', + 'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no', + 'ハ': 'ha', 'ヒ': 'hi', 'フ': 'fu', 'ヘ': 'he', 'ホ': 'ho', + 'マ': 'ma', 'ミ': 'mi', 'ム': 'mu', 'メ': 'me', 'モ': 'mo', + 'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo', + 'ラ': 'ra', 'リ': 'ri', 'ル': 'ru', 'レ': 're', 'ロ': 'ro', + 'ワ': 'wa', 'ヲ': 'wo', 'ン': 'n', +} + + +def transliterate_japanese(text: str) -> 
str: + """Transliterate Japanese to Romaji using Modified Hepburn.""" + if AVAILABLE_LIBS.get('pykakasi'): + kakasi = pykakasi.kakasi() + result = kakasi.convert(text) + return ' '.join([item['hepburn'] for item in result]) + + # Fallback: use basic vocabulary mapping + result = [] + for char in text: + if char in JAPANESE_ROMAJI_MAP: + result.append(JAPANESE_ROMAJI_MAP[char]) + elif char == ' ': + result.append(' ') + elif char.isalnum(): + result.append(char) + + # If we got no result, return warning + if not result: + return '[REQUIRES_PYKAKASI]' + + return ''.join(result) + + +# ============================================================================= +# KOREAN TRANSLITERATION (Revised Romanization) +# ============================================================================= + +# Basic Hangul syllable decomposition tables +HANGUL_INITIALS = [ + 'g', 'kk', 'n', 'd', 'tt', 'r', 'm', 'b', 'pp', 's', 'ss', '', + 'j', 'jj', 'ch', 'k', 't', 'p', 'h' +] + +HANGUL_MEDIALS = [ + 'a', 'ae', 'ya', 'yae', 'eo', 'e', 'yeo', 'ye', 'o', 'wa', 'wae', + 'oe', 'yo', 'u', 'wo', 'we', 'wi', 'yu', 'eu', 'ui', 'i' +] + +HANGUL_FINALS = [ + '', 'k', 'k', 'k', 'n', 'n', 'n', 't', 'l', 'l', 'l', 'l', 'l', + 'l', 'l', 'l', 'm', 'p', 'p', 's', 's', 'ng', 't', 't', 'k', 't', 'p', 't' +] + + +def transliterate_korean(text: str) -> str: + """Transliterate Korean Hangul to Revised Romanization.""" + try: + from korean_romanizer.romanizer import Romanizer + r = Romanizer(text) + return r.romanize() + except ImportError: + pass + + # Fallback: basic syllable decomposition + result = [] + for char in text: + code = ord(char) + if 0xAC00 <= code <= 0xD7AF: # Hangul syllable + code -= 0xAC00 + initial = code // (21 * 28) + medial = (code % (21 * 28)) // 28 + final = code % 28 + + syllable = HANGUL_INITIALS[initial] + HANGUL_MEDIALS[medial] + if final > 0: + syllable += HANGUL_FINALS[final] + result.append(syllable) + else: + result.append(char) + + return ''.join(result) + + +# 
============================================================================= +# ARABIC TRANSLITERATION (ISO 233-2) +# ============================================================================= + +ARABIC_MAP = { + 'ا': 'a', 'أ': 'a', 'إ': 'i', 'آ': 'a', 'ء': "'", + 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', + 'ح': 'h', 'خ': 'kh', 'د': 'd', 'ذ': 'dh', + 'ر': 'r', 'ز': 'z', 'س': 's', 'ش': 'sh', + 'ص': 's', 'ض': 'd', 'ط': 't', 'ظ': 'z', + 'ع': "'", 'غ': 'gh', 'ف': 'f', 'ق': 'q', + 'ك': 'k', 'ل': 'l', 'م': 'm', 'ن': 'n', + 'ه': 'h', 'و': 'w', 'ي': 'y', 'ى': 'a', + 'ة': 'a', + # Persian additions + 'پ': 'p', 'چ': 'ch', 'ژ': 'zh', 'گ': 'g', + 'ک': 'k', 'ی': 'i', + # Urdu additions + 'ٹ': 't', 'ڈ': 'd', 'ڑ': 'r', 'ں': 'n', + # Diacritics (vowel marks) + 'َ': 'a', 'ِ': 'i', 'ُ': 'u', + 'ً': 'an', 'ٍ': 'in', 'ٌ': 'un', + 'ّ': '', # Shadda (gemination) - simplified +} + + +def transliterate_arabic(text: str) -> str: + """Transliterate Arabic script to Latin (ISO 233 simplified).""" + result = [] + for c in text: + if c in ARABIC_MAP: + result.append(ARABIC_MAP[c]) + elif c == ' ' or c.isalnum(): + result.append(c) + elif c == '\u200c': # Zero-width non-joiner (Persian) + result.append('-') + return ''.join(result) + + +# ============================================================================= +# HEBREW TRANSLITERATION (ISO 259-3) +# ============================================================================= + +HEBREW_MAP = { + 'א': '', 'ב': 'v', 'ג': 'g', 'ד': 'd', 'ה': 'h', + 'ו': 'v', 'ז': 'z', 'ח': 'ch', 'ט': 't', 'י': 'y', + 'כ': 'k', 'ך': 'k', 'ל': 'l', 'מ': 'm', 'ם': 'm', + 'נ': 'n', 'ן': 'n', 'ס': 's', 'ע': '', 'פ': 'f', + 'ף': 'f', 'צ': 'ts', 'ץ': 'ts', 'ק': 'k', 'ר': 'r', + 'ש': 'sh', 'ת': 't', + # With dagesh + 'בּ': 'b', 'כּ': 'k', 'פּ': 'p', +} + + +def transliterate_hebrew(text: str) -> str: + """Transliterate Hebrew to Latin (ISO 259-3 simplified).""" + result = [] + for c in text: + if c in HEBREW_MAP: + result.append(HEBREW_MAP[c]) + elif c == ' ' or 
c.isalnum(): + result.append(c) + return ''.join(result) + + +# ============================================================================= +# GREEK TRANSLITERATION (ISO 843) +# ============================================================================= + +GREEK_MAP = { + 'Α': 'A', 'α': 'a', 'Β': 'V', 'β': 'v', 'Γ': 'G', 'γ': 'g', + 'Δ': 'D', 'δ': 'd', 'Ε': 'E', 'ε': 'e', 'Ζ': 'Z', 'ζ': 'z', + 'Η': 'I', 'η': 'i', 'Θ': 'Th', 'θ': 'th', 'Ι': 'I', 'ι': 'i', + 'Κ': 'K', 'κ': 'k', 'Λ': 'L', 'λ': 'l', 'Μ': 'M', 'μ': 'm', + 'Ν': 'N', 'ν': 'n', 'Ξ': 'X', 'ξ': 'x', 'Ο': 'O', 'ο': 'o', + 'Π': 'P', 'π': 'p', 'Ρ': 'R', 'ρ': 'r', 'Σ': 'S', 'σ': 's', + 'ς': 's', 'Τ': 'T', 'τ': 't', 'Υ': 'Y', 'υ': 'y', 'Φ': 'F', + 'φ': 'f', 'Χ': 'Ch', 'χ': 'ch', 'Ψ': 'Ps', 'ψ': 'ps', + 'Ω': 'O', 'ω': 'o', + # With accents + 'Ά': 'A', 'ά': 'a', 'Έ': 'E', 'έ': 'e', 'Ή': 'I', 'ή': 'i', + 'Ί': 'I', 'ί': 'i', 'Ό': 'O', 'ό': 'o', 'Ύ': 'Y', 'ύ': 'y', + 'Ώ': 'O', 'ώ': 'o', 'ϊ': 'i', 'ϋ': 'y', 'ΐ': 'i', 'ΰ': 'y', +} + + +def transliterate_greek(text: str) -> str: + """Transliterate Greek to Latin (ISO 843).""" + return ''.join(GREEK_MAP.get(c, c) for c in text) + + +# ============================================================================= +# DEVANAGARI TRANSLITERATION (ISO 15919) +# ============================================================================= + +DEVANAGARI_MAP = { + # Vowels + 'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ii', 'उ': 'u', 'ऊ': 'uu', + 'ऋ': 'ri', 'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au', + # Vowel marks + 'ा': 'a', 'ि': 'i', 'ी': 'i', 'ु': 'u', 'ू': 'u', + 'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au', 'ं': 'm', 'ः': 'h', + # Consonants + 'क': 'ka', 'ख': 'kha', 'ग': 'ga', 'घ': 'gha', 'ङ': 'nga', + 'च': 'cha', 'छ': 'chha', 'ज': 'ja', 'झ': 'jha', 'ञ': 'nya', + 'ट': 'ta', 'ठ': 'tha', 'ड': 'da', 'ढ': 'dha', 'ण': 'na', + 'त': 'ta', 'थ': 'tha', 'द': 'da', 'ध': 'dha', 'न': 'na', + 'प': 'pa', 'फ': 'pha', 'ब': 'ba', 'भ': 'bha', 'म': 'ma', + 'य': 'ya', 'र': 'ra', 'ल': 'la', 'व': 'va', + 
'श': 'sha', 'ष': 'sha', 'स': 'sa', 'ह': 'ha', + '्': '', # Virama (removes inherent 'a') + # Hindi-specific + 'ड़': 'da', 'ढ़': 'dha', 'क़': 'qa', 'ख़': 'kha', 'ग़': 'gha', + 'ज़': 'za', 'फ़': 'fa', +} + + +def transliterate_devanagari(text: str) -> str: + """Transliterate Devanagari to Latin (ISO 15919 simplified).""" + try: + from indic_transliteration import sanscript + from indic_transliteration.sanscript import transliterate as indic_translit + return indic_translit(text, sanscript.DEVANAGARI, sanscript.IAST) + except ImportError: + pass + + # Fallback: basic mapping + result = [] + for c in text: + if c in DEVANAGARI_MAP: + result.append(DEVANAGARI_MAP[c]) + elif c == ' ': + result.append(' ') + elif c.isalnum(): + result.append(c) + return ''.join(result) + + +# ============================================================================= +# THAI TRANSLITERATION (ISO 11940-2 / Royal Thai General System) +# ============================================================================= + +# Thai consonants with RTGS romanization +# Note: Thai consonants have inherent vowel 'o' or 'a' depending on syllable structure +THAI_CONSONANTS = { + # Initial consonants (high, mid, low class) + 'ก': 'k', 'ข': 'kh', 'ฃ': 'kh', 'ค': 'kh', 'ฅ': 'kh', 'ฆ': 'kh', + 'ง': 'ng', + 'จ': 'ch', 'ฉ': 'ch', 'ช': 'ch', 'ซ': 's', 'ฌ': 'ch', + 'ญ': 'y', # Initial: y, Final: n + 'ฎ': 'd', 'ฏ': 't', 'ฐ': 'th', 'ฑ': 'th', 'ฒ': 'th', + 'ณ': 'n', + 'ด': 'd', 'ต': 't', 'ถ': 'th', 'ท': 'th', 'ธ': 'th', + 'น': 'n', + 'บ': 'b', 'ป': 'p', 'ผ': 'ph', 'ฝ': 'f', 'พ': 'ph', 'ฟ': 'f', 'ภ': 'ph', + 'ม': 'm', + 'ย': 'y', 'ร': 'r', 'ล': 'l', 'ว': 'w', + 'ศ': 's', 'ษ': 's', 'ส': 's', 'ห': 'h', 'ฬ': 'l', 'อ': '', # อ is silent initial + 'ฮ': 'h', +} + +# Thai vowels (can appear before, after, above, or below consonants) +THAI_VOWELS = { + # Following vowels + 'ะ': 'a', 'า': 'a', 'ำ': 'am', + 'ิ': 'i', 'ี': 'i', + 'ึ': 'ue', 'ื': 'ue', + 'ุ': 'u', 'ู': 'u', + 'เ': 'e', # Leading vowel + 'แ': 'ae', # Leading 
vowel + 'โ': 'o', # Leading vowel + 'ใ': 'ai', # Leading vowel + 'ไ': 'ai', # Leading vowel + 'ๅ': 'a', # Lakkhangyao (rare) + # Vowel combinations are handled by position +} + +# Thai tone marks (don't affect RTGS romanization - just skip) +THAI_TONE_MARKS = {'่', '้', '๊', '๋'} + +# Thai special characters +THAI_SPECIAL = { + '็': '', # Maitaikhu (shortens vowel) + '์': '', # Thanthakhat (silent letter marker) + 'ๆ': '', # Maiyamok (repetition) + '฿': 'B', # Baht symbol + 'ฯ': '', # Paiyannoi (abbreviation) + '๏': '', # Fongman (obsolete) + 'ํ': 'm', # Nikhahit (nasalization, often 'm') + 'ฺ': '', # Phinthu (Sanskrit virama) + 'ๆ': '', # Mai yamok (repeat previous) +} + +# Thai numerals +THAI_NUMERALS = { + '๐': '0', '๑': '1', '๒': '2', '๓': '3', '๔': '4', + '๕': '5', '๖': '6', '๗': '7', '๘': '8', '๙': '9', +} + +# Common Thai heritage vocabulary - direct mappings for accuracy +# These handle complex syllable combinations correctly +THAI_HERITAGE_VOCAB = { + # Common institutional terms + 'สำนัก': 'samnak', + 'หอจดหมายเหตุ': 'ho chotmaihet', + 'หอสมุด': 'ho samut', + 'แห่งชาติ': 'haeng chat', + 'พิพิธภัณฑ': 'phiphitthaphan', + 'พิพิธภัณฑสถาน': 'phiphitthaphanthasathan', + 'พระนคร': 'phra nakhon', + 'สยาม': 'sayam', + 'สมาคม': 'samakhom', + 'ใน': 'nai', + 'พระบรมราชูปถัมภ์': 'phra borommarachuppatham', + 'พระที่นั่ง': 'phra thi nang', + 'ศิวโมกข': 'siwamok', + 'พิมาน': 'phiman', + 'วัด': 'wat', + 'โพธิ์': 'pho', + 'ราม': 'ram', + # Geographic terms + 'กรุงเทพ': 'krung thep', + 'กรุงเทพมหานคร': 'krung thep maha nakhon', + 'เชียงใหม่': 'chiang mai', + 'ภูเก็ต': 'phuket', + # Institution types + 'มหาวิทยาลัย': 'mahawitthayalai', + 'ศูนย์': 'sun', + 'สถาบัน': 'sathaban', + 'องค์กร': 'ongkon', + 'กรม': 'krom', + 'กระทรวง': 'krasuang', + # Cultural terms + 'วัฒนธรรม': 'watthanatham', + 'ศิลปะ': 'sinlapa', + 'ประวัติศาสตร์': 'prawattisat', + 'โบราณ': 'boran', + 'มรดก': 'moradok', +} + + +def transliterate_thai(text: str) -> str: + """Transliterate Thai to Latin (Royal 
Thai General System). + + Uses pythainlp if available, otherwise falls back to vocabulary lookup + and character-by-character transliteration. + """ + try: + from pythainlp.transliterate import romanize + return romanize(text, engine='royin') # Royal Institute standard + except ImportError: + pass + + # Fallback: vocabulary lookup + character mapping + result = text + + # First pass: replace known vocabulary items (longest match first) + for thai, latin in sorted(THAI_HERITAGE_VOCAB.items(), key=lambda x: -len(x[0])): + result = result.replace(thai, f' {latin} ') + + # Second pass: transliterate remaining Thai characters + output = [] + i = 0 + while i < len(result): + c = result[i] + + # Skip if already Latin + if c.isascii(): + output.append(c) + i += 1 + continue + + # Check consonants + if c in THAI_CONSONANTS: + output.append(THAI_CONSONANTS[c]) + i += 1 + continue + + # Check vowels + if c in THAI_VOWELS: + output.append(THAI_VOWELS[c]) + i += 1 + continue + + # Skip tone marks + if c in THAI_TONE_MARKS: + i += 1 + continue + + # Check special characters + if c in THAI_SPECIAL: + output.append(THAI_SPECIAL[c]) + i += 1 + continue + + # Check numerals + if c in THAI_NUMERALS: + output.append(THAI_NUMERALS[c]) + i += 1 + continue + + # Unknown character - keep as is + output.append(c) + i += 1 + + # Clean up spacing + result = ''.join(output) + result = ' '.join(result.split()) # Normalize whitespace + + return result + + +# ============================================================================= +# ARMENIAN TRANSLITERATION (ISO 9985) +# ============================================================================= + +ARMENIAN_MAP = { + 'Ա': 'A', 'ա': 'a', 'Բ': 'B', 'բ': 'b', 'Գ': 'G', 'գ': 'g', + 'Դ': 'D', 'դ': 'd', 'Ե': 'E', 'ե': 'e', 'Զ': 'Z', 'զ': 'z', + 'Է': 'E', 'է': 'e', 'Ը': 'Y', 'ը': 'y', 'Թ': 'T', 'թ': 't', + 'Ժ': 'Zh', 'ժ': 'zh', 'Ի': 'I', 'ի': 'i', 'Լ': 'L', 'լ': 'l', + 'Խ': 'Kh', 'խ': 'kh', 'Ծ': 'Ts', 'ծ': 'ts', 'Կ': 'K', 'կ': 'k', + 'Հ': 'H', 
'հ': 'h', 'Ձ': 'Dz', 'ձ': 'dz', 'Ղ': 'Gh', 'ղ': 'gh', + 'Ճ': 'Ch', 'ճ': 'ch', 'Մ': 'M', 'մ': 'm', 'Յ': 'Y', 'յ': 'y', + 'Ն': 'N', 'ն': 'n', 'Շ': 'Sh', 'շ': 'sh', 'Ո': 'O', 'ո': 'o', + 'Չ': 'Ch', 'չ': 'ch', 'Պ': 'P', 'պ': 'p', 'Ջ': 'J', 'ջ': 'j', + ' Delays': 'R', ' delays': 'r', ' Delays': 'S', 'ս': 's', 'Ვ': 'V', 'վ': 'v', + 'Տ': 'T', 'տ': 't', 'ร': 'R', 'ր': 'r', 'Ց': 'Ts', 'ց': 'ts', + 'Ւ': 'W', 'ւ': 'w', 'Փ': 'P', 'փ': 'p', 'Ք': 'K', 'ք': 'k', + 'Օ': 'O', 'օ': 'o', 'Ֆ': 'F', 'ֆ': 'f', +} + + +def transliterate_armenian(text: str) -> str: + """Transliterate Armenian to Latin (ISO 9985).""" + return ''.join(ARMENIAN_MAP.get(c, c) for c in text) + + +# ============================================================================= +# GEORGIAN TRANSLITERATION (ISO 9984) +# ============================================================================= + +GEORGIAN_MAP = { + 'ა': 'a', 'ბ': 'b', 'გ': 'g', 'დ': 'd', 'ე': 'e', 'ვ': 'v', + 'ზ': 'z', 'თ': 't', 'ი': 'i', 'კ': 'k', 'ლ': 'l', 'მ': 'm', + 'ნ': 'n', 'ო': 'o', 'პ': 'p', 'ჟ': 'zh', 'რ': 'r', 'ს': 's', + 'ტ': 't', 'უ': 'u', 'ფ': 'p', 'ქ': 'k', 'ღ': 'gh', 'ყ': 'q', + 'შ': 'sh', 'ჩ': 'ch', 'ც': 'ts', 'ძ': 'dz', 'წ': 'ts', 'ჭ': 'ch', + 'ხ': 'kh', 'ჯ': 'j', 'ჰ': 'h', +} + + +def transliterate_georgian(text: str) -> str: + """Transliterate Georgian to Latin (ISO 9984).""" + return ''.join(GEORGIAN_MAP.get(c, c) for c in text) + + +# ============================================================================= +# BENGALI TRANSLITERATION (ISO 15919) +# ============================================================================= + +BENGALI_MAP = { + # Vowels + 'অ': 'a', 'আ': 'aa', 'ই': 'i', 'ঈ': 'ii', 'উ': 'u', 'ঊ': 'uu', + 'এ': 'e', 'ঐ': 'ai', 'ও': 'o', 'ঔ': 'au', + # Consonants + 'ক': 'ka', 'খ': 'kha', 'গ': 'ga', 'ঘ': 'gha', 'ঙ': 'nga', + 'চ': 'cha', 'ছ': 'chha', 'জ': 'ja', 'ঝ': 'jha', 'ঞ': 'nya', + 'ট': 'ta', 'ঠ': 'tha', 'ড': 'da', 'ঢ': 'dha', 'ণ': 'na', + 'ত': 'ta', 'থ': 'tha', 'দ': 'da', 'ধ': 'dha', 'ন': 'na', + 'প': 
'pa', 'ফ': 'pha', 'ব': 'ba', 'ভ': 'bha', 'ম': 'ma', + 'য': 'ya', 'র': 'ra', 'ল': 'la', 'শ': 'sha', 'ষ': 'sha', + 'স': 'sa', 'হ': 'ha', 'ড়': 'ra', 'ঢ়': 'rha', 'য়': 'ya', + '়': '', # Nukta + '্': '', # Virama + # Vowel marks + 'া': 'a', 'ি': 'i', 'ী': 'i', 'ু': 'u', 'ূ': 'u', + 'ে': 'e', 'ৈ': 'ai', 'ো': 'o', 'ৌ': 'au', + 'ং': 'ng', 'ঃ': 'h', 'ঁ': 'n', +} + + +def transliterate_bengali(text: str) -> str: + """Transliterate Bengali to Latin (ISO 15919 simplified).""" + result = [] + for c in text: + if c in BENGALI_MAP: + result.append(BENGALI_MAP[c]) + elif c == ' ': + result.append(' ') + elif c.isalnum(): + result.append(c) + return ''.join(result) + + +# ============================================================================= +# SINHALA TRANSLITERATION (ISO 15919) +# ============================================================================= + +# Sinhala character map (ISO 15919 romanization) +SINHALA_MAP = { + # Independent vowels + 'අ': 'a', 'ආ': 'aa', 'ඇ': 'ae', 'ඈ': 'aae', + 'ඉ': 'i', 'ඊ': 'ii', 'උ': 'u', 'ඌ': 'uu', + 'එ': 'e', 'ඒ': 'ee', 'ඓ': 'ai', + 'ඔ': 'o', 'ඕ': 'oo', 'ඖ': 'au', + 'ඍ': 'ri', 'ඎ': 'rii', + + # Consonants (with inherent 'a' vowel) + 'ක': 'ka', 'ඛ': 'kha', 'ග': 'ga', 'ඝ': 'gha', 'ඞ': 'nga', 'ඟ': 'nnga', + 'ච': 'cha', 'ඡ': 'chha', 'ජ': 'ja', 'ඣ': 'jha', 'ඤ': 'nya', 'ඥ': 'gnya', + 'ට': 'ta', 'ඨ': 'tha', 'ඩ': 'da', 'ඪ': 'dha', 'ණ': 'na', 'ඬ': 'nda', + 'ත': 'tha', 'ථ': 'thha', 'ද': 'da', 'ධ': 'dha', 'න': 'na', 'ඳ': 'nda', + 'ප': 'pa', 'ඵ': 'pha', 'බ': 'ba', 'භ': 'bha', 'ම': 'ma', 'ඹ': 'mba', + 'ය': 'ya', 'ර': 'ra', 'ල': 'la', 'ව': 'va', 'ළ': 'la', + 'ශ': 'sha', 'ෂ': 'sha', 'ස': 'sa', 'හ': 'ha', + 'ෆ': 'fa', # Used for foreign words + + # Dependent vowel signs (matras) + 'ා': 'a', 'ැ': 'ae', 'ෑ': 'aae', + 'ි': 'i', 'ී': 'ii', 'ු': 'u', 'ූ': 'uu', + 'ෙ': 'e', 'ේ': 'ee', 'ෛ': 'ai', + 'ො': 'o', 'ෝ': 'oo', 'ෞ': 'au', + 'ෘ': 'ri', 'ෲ': 'rii', + + # Special marks + '්': '', # Virama (hal kirima) - removes inherent vowel + 'ං': 'ng', # Anusvara 
+ 'ඃ': 'h', # Visarga + '෴': '', # Kunddaliya (punctuation) + + # Numerals (Sinhala uses both Sinhala and Arabic numerals) + '෦': '0', '෧': '1', '෨': '2', '෩': '3', '෪': '4', + '෫': '5', '෬': '6', '෭': '7', '෮': '8', '෯': '9', +} + +# Common Sinhala heritage vocabulary +SINHALA_HERITAGE_VOCAB = { + # University/Education + 'විශ්වවිද්‍යාලය': 'vishvavidyalaya', + 'විශ්වවිද්': 'vishvavid', + 'යාලය': 'yalaya', + 'පේරාදෙණිය': 'peradeniya', + + # National/Government + 'ජාතික': 'jathika', + 'දෙපාර්තමේන්තුව': 'departmentuwa', + + # Museums/Archives + 'කෞතුකාගාර': 'kauthukagara', + 'කෞතුකාගාරය': 'kauthukagaaraya', + 'ලේඛනාගාරය': 'lekhanagaaraya', + 'පුස්තකාලය': 'pusthakaalaya', + + # Places + 'කොළඹ': 'colombo', + 'ශ්‍රී': 'sri', + 'ලංකාව': 'lankava', +} + + +def transliterate_sinhala(text: str) -> str: + """Transliterate Sinhala to Latin (ISO 15919). + + Args: + text: Text in Sinhala script + + Returns: + Romanized text using ISO 15919 standard + """ + # First pass: replace known vocabulary (longest match first) + result = text + for sinhala, latin in sorted(SINHALA_HERITAGE_VOCAB.items(), key=lambda x: -len(x[0])): + result = result.replace(sinhala, f' {latin} ') + + # Second pass: transliterate remaining characters + output = [] + i = 0 + while i < len(result): + c = result[i] + + # Skip if already Latin + if c.isascii(): + output.append(c) + i += 1 + continue + + # Check character map + if c in SINHALA_MAP: + output.append(SINHALA_MAP[c]) + i += 1 + continue + + # Unknown character - keep as is or skip + if c == ' ': + output.append(' ') + elif c.isalnum(): + output.append(c) + i += 1 + + # Clean up spacing + result = ''.join(output) + result = ' '.join(result.split()) + + return result + + +# ============================================================================= +# KHMER TRANSLITERATION (UNGEGN Romanization) +# ============================================================================= + +# Khmer consonants (with inherent 'a' or 'o' vowel depending on register) 
+KHMER_CONSONANTS = { + # First series (inherent 'aa' in open syllables) + 'ក': 'k', 'ខ': 'kh', 'គ': 'k', 'ឃ': 'kh', 'ង': 'ng', + 'ច': 'ch', 'ឆ': 'chh', 'ជ': 'ch', 'ឈ': 'chh', 'ញ': 'nh', + 'ដ': 'd', 'ឋ': 'th', 'ឌ': 'd', 'ឍ': 'th', 'ណ': 'n', + 'ត': 't', 'ថ': 'th', 'ទ': 't', 'ធ': 'th', 'ន': 'n', + 'ប': 'b', 'ផ': 'ph', 'ព': 'p', 'ភ': 'ph', 'ម': 'm', + 'យ': 'y', 'រ': 'r', 'ល': 'l', 'វ': 'v', 'ឝ': 'sh', + 'ឞ': 's', 'ស': 's', 'ហ': 'h', 'ឡ': 'l', 'អ': '', +} + +# Khmer dependent vowels +KHMER_VOWELS = { + 'ា': 'a', 'ិ': 'i', 'ី': 'ii', 'ឹ': 'eu', 'ឺ': 'eu', + 'ុ': 'o', 'ូ': 'ou', 'ួ': 'ua', 'ើ': 'ae', + 'ែ': 'ae', 'ៃ': 'ai', 'ោ': 'ao', 'ៅ': 'au', + '្': '', # Subscript consonant marker (coeng) +} + +# Khmer independent vowels +KHMER_INDEP_VOWELS = { + 'ឥ': 'i', 'ឦ': 'ii', 'ឧ': 'u', 'ឨ': 'uk', + 'ឩ': 'uu', 'ឪ': 'ou', 'ឫ': 'ry', 'ឬ': 'ryy', + 'ឭ': 'ly', 'ឮ': 'lyy', 'ឯ': 'ae', 'ឰ': 'ai', + 'ឱ': 'ao', 'ឲ': 'ao', 'ឳ': 'au', +} + +# Khmer special signs +KHMER_SPECIAL = { + 'ំ': 'm', # Nikahit (anusvara) + 'ះ': 'h', # Visarga + '់': '', # Bantoc (shortens vowel) + '៌': 'r', # Robat (repha) + '៍': '', # Toandakhiat (silent letter) + '៎': '', # Kakabat (emphasis) + '៏': '', # Ahsda (obsolete) + '៑': '', # Viriam (obsolete punctuation) + '៖': ':', # Camnuc pii kuuh (colon) + '។': '.', # Khan (period) + '៕': '.', # Bariyoosan (end mark) + '៚': '', # Koomuut (section mark) +} + +# Khmer numerals +KHMER_NUMERALS = { + '០': '0', '១': '1', '២': '2', '៣': '3', '៤': '4', + '៥': '5', '៦': '6', '៧': '7', '៨': '8', '៩': '9', +} + +# Common Khmer heritage vocabulary +KHMER_HERITAGE_VOCAB = { + # Museums/Memorials + 'សារមន្ទីរ': 'saaramontir', + 'សារមន្ទីរទួលស្លែង': 'saaramontir tuol sleng', + 'ទួលស្លែង': 'tuol sleng', + + # Archives/Libraries + 'បណ្ណាល័យ': 'bannaalay', + 'ឯកសារដ្ឋាន': 'aeksaarathan', + 'ជាតិ': 'cheate', + + # Places + 'ភ្នំពេញ': 'phnom penh', + 'អង្គរ': 'angkor', + 'សៀមរាប': 'siem reap', + + # Cultural terms + 'វប្បធម៌': 'vabpatham', + 'បេតិកភណ្ឌ': 'betekaphon', + 
'ប្រវត្តិសាស្ត្រ': 'pravattisaas', +} + + +def transliterate_khmer(text: str) -> str: + """Transliterate Khmer to Latin (UNGEGN system). + + Args: + text: Text in Khmer script + + Returns: + Romanized text using UNGEGN standard + """ + # First pass: replace known vocabulary (longest match first) + result = text + for khmer, latin in sorted(KHMER_HERITAGE_VOCAB.items(), key=lambda x: -len(x[0])): + result = result.replace(khmer, f' {latin} ') + + # Second pass: transliterate remaining characters + output = [] + i = 0 + while i < len(result): + c = result[i] + + # Skip if already Latin + if c.isascii(): + output.append(c) + i += 1 + continue + + # Check consonants + if c in KHMER_CONSONANTS: + output.append(KHMER_CONSONANTS[c]) + i += 1 + continue + + # Check vowels + if c in KHMER_VOWELS: + output.append(KHMER_VOWELS[c]) + i += 1 + continue + + # Check independent vowels + if c in KHMER_INDEP_VOWELS: + output.append(KHMER_INDEP_VOWELS[c]) + i += 1 + continue + + # Check special signs + if c in KHMER_SPECIAL: + output.append(KHMER_SPECIAL[c]) + i += 1 + continue + + # Check numerals + if c in KHMER_NUMERALS: + output.append(KHMER_NUMERALS[c]) + i += 1 + continue + + # Unknown character - keep as is or skip + if c == ' ': + output.append(' ') + elif c.isalnum(): + output.append(c) + i += 1 + + # Clean up spacing + result = ''.join(output) + result = ' '.join(result.split()) + + return result + + +# ============================================================================= +# MAIN TRANSLITERATION DISPATCHER +# ============================================================================= + +# Language to script mapping +LANG_SCRIPT_MAP = { + 'ru': 'cyrillic', 'uk': 'cyrillic', 'bg': 'cyrillic', + 'sr': 'cyrillic', 'kk': 'cyrillic', + 'zh': 'chinese', + 'ja': 'japanese', + 'ko': 'korean', + 'ar': 'arabic', 'fa': 'arabic', 'ur': 'arabic', + 'he': 'hebrew', + 'el': 'greek', + 'hi': 'devanagari', 'ne': 'devanagari', + 'bn': 'bengali', + 'th': 'thai', + 'hy': 'armenian', 
+ 'ka': 'georgian', + 'si': 'sinhala', + 'km': 'khmer', +} + +TRANSLITERATORS = { + 'cyrillic': transliterate_cyrillic, + 'chinese': transliterate_chinese, + 'japanese': transliterate_japanese, + 'korean': transliterate_korean, + 'arabic': transliterate_arabic, + 'hebrew': transliterate_hebrew, + 'greek': transliterate_greek, + 'devanagari': transliterate_devanagari, + 'bengali': transliterate_bengali, + 'thai': transliterate_thai, + 'armenian': transliterate_armenian, + 'georgian': transliterate_georgian, + 'sinhala': transliterate_sinhala, + 'khmer': transliterate_khmer, + 'latin': lambda t: t, # No transliteration needed +} + + +def transliterate(text: str, lang: Optional[str] = None) -> str: + """ + Transliterate text from non-Latin script to Latin. + + Args: + text: Input text in any script + lang: Optional ISO 639-1 language code (e.g., 'ru', 'zh', 'ko') + If not provided, script is auto-detected. + + Returns: + Transliterated text in Latin characters. + """ + if not text: + return text + + # Determine script + if lang and lang in LANG_SCRIPT_MAP: + script = LANG_SCRIPT_MAP[lang] + else: + script = detect_script(text) + + # Get transliterator + translit_func = TRANSLITERATORS.get(script, lambda t: t) + + # For Cyrillic, pass language for dialect-specific handling + if script == 'cyrillic' and lang: + result = translit_func(text, lang) + else: + result = translit_func(text) + + # Normalize diacritics to ASCII + normalized = unicodedata.normalize('NFD', result) + ascii_result = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') + + return ascii_result + + +def transliterate_for_abbreviation(emic_name: str, lang: str) -> str: + """ + Transliterate emic name for GHCID abbreviation generation. + + This is the main entry point for GHCID generation scripts. 
+ + Args: + emic_name: Institution name in original script + lang: ISO 639-1 language code + + Returns: + Transliterated name ready for abbreviation extraction + """ + # Step 1: Transliterate to Latin + latin = transliterate(emic_name, lang) + + # Step 2: Remove special characters (except spaces and hyphens) + clean = re.sub(r"[^a-zA-Z\s\-']", ' ', latin) + + # Step 3: Normalize whitespace + clean = ' '.join(clean.split()) + + return clean + + +# ============================================================================= +# CLI INTERFACE +# ============================================================================= + +def main(): + parser = argparse.ArgumentParser( + description='Transliterate non-Latin script text to Latin characters', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +Examples: + python transliterate_emic_names.py --text "Институт" --lang ru + python transliterate_emic_names.py --text "东巴文化博物院" --lang zh + python transliterate_emic_names.py --file data/custodian/example.yaml + +Supported languages: + ru (Russian), uk (Ukrainian), bg (Bulgarian), sr (Serbian), kk (Kazakh) + zh (Chinese), ja (Japanese), ko (Korean) + ar (Arabic), fa (Persian), ur (Urdu), he (Hebrew) + el (Greek), hi (Hindi), ne (Nepali), bn (Bengali) + th (Thai), hy (Armenian), ka (Georgian) + ''' + ) + + parser.add_argument('--text', '-t', help='Text to transliterate') + parser.add_argument('--lang', '-l', help='ISO 639-1 language code') + parser.add_argument('--file', '-f', help='YAML file to process') + parser.add_argument('--detect', '-d', action='store_true', + help='Only detect script, do not transliterate') + parser.add_argument('--libs', action='store_true', + help='Show available transliteration libraries') + + args = parser.parse_args() + + if args.libs: + print("Available transliteration libraries:") + for lib, available in AVAILABLE_LIBS.items(): + status = "✓ installed" if available else "✗ not installed" + print(f" {lib}: {status}") + return + + 
if args.file: + import yaml + with open(args.file, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + emic_name = data.get('custodian_name', {}).get('emic_name') + lang = data.get('custodian_name', {}).get('name_language') + + if not emic_name: + print(f"Error: No emic_name found in {args.file}") + return + + print(f"Emic name: {emic_name}") + print(f"Language: {lang or '(auto-detect)'}") + + if args.detect: + script = detect_script(emic_name) + print(f"Detected script: {script}") + else: + result = transliterate_for_abbreviation(emic_name, lang) + print(f"Transliterated: {result}") + return + + if args.text: + if args.detect: + script = detect_script(args.text) + print(f"Input: {args.text}") + print(f"Detected script: {script}") + else: + result = transliterate_for_abbreviation(args.text, args.lang) + print(f"Input: {args.text}") + print(f"Language: {args.lang or '(auto-detect)'}") + print(f"Output: {result}") + return + + parser.print_help() + + +if __name__ == '__main__': + main() diff --git a/tests/test_transliteration.py b/tests/test_transliteration.py new file mode 100644 index 0000000000..6231a747c4 --- /dev/null +++ b/tests/test_transliteration.py @@ -0,0 +1,350 @@ +""" +Unit tests for transliteration functions. + +Tests the scripts/transliterate_emic_names.py module for converting +non-Latin script institution names to Latin characters. 
+""" + +import sys +from pathlib import Path +import pytest + +# Add project root to path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.transliterate_emic_names import ( + detect_script, + transliterate, + transliterate_for_abbreviation, + transliterate_cyrillic, + transliterate_chinese, + transliterate_japanese, + transliterate_korean, + transliterate_arabic, + transliterate_hebrew, + transliterate_greek, + transliterate_devanagari, + transliterate_armenian, + transliterate_georgian, + transliterate_thai, + transliterate_sinhala, + transliterate_khmer, +) + + +class TestScriptDetection: + """Tests for script detection function.""" + + def test_detect_latin(self): + assert detect_script("Hello World") == "latin" + assert detect_script("Rijksmuseum Amsterdam") == "latin" + + def test_detect_cyrillic(self): + assert detect_script("Институт") == "cyrillic" + assert detect_script("Музей") == "cyrillic" + + def test_detect_chinese(self): + assert detect_script("故宮博物院") == "chinese" + assert detect_script("中国国家图书馆") == "chinese" + + def test_detect_japanese(self): + # Japanese with hiragana or katakana + assert detect_script("こんにちは") == "japanese" + assert detect_script("カタカナ") == "japanese" + + def test_detect_korean(self): + assert detect_script("국립중앙박물관") == "korean" + + def test_detect_arabic(self): + assert detect_script("المكتبة الوطنية") == "arabic" + + def test_detect_hebrew(self): + assert detect_script("ארכיון") == "hebrew" + + def test_detect_greek(self): + assert detect_script("Μουσείο") == "greek" + + def test_detect_devanagari(self): + assert detect_script("राजस्थान") == "devanagari" + + def test_detect_thai(self): + assert detect_script("สำนักหอจดหมายเหตุ") == "thai" + assert detect_script("กรุงเทพ") == "thai" + + def test_detect_sinhala(self): + assert detect_script("පේරාදෙණිය") == "sinhala" + assert detect_script("ජාතික කෞතුකාගාර") == "sinhala" + + def test_detect_khmer(self): + assert detect_script("សារមន្ទីរ") == "khmer" + 
assert detect_script("ភ្នំពេញ") == "khmer" + + +class TestCyrillicTransliteration: + """Tests for Cyrillic (Russian/Ukrainian/etc.) transliteration.""" + + def test_russian_basic(self): + result = transliterate_cyrillic("Музей", "ru") + assert result == "Muzey" + + def test_russian_institute(self): + result = transliterate_cyrillic("Институт восточных рукописей РАН", "ru") + assert "Institut" in result + assert "vostochnykh" in result + + def test_russian_hard_soft_signs(self): + # Hard and soft signs should be removed + result = transliterate_cyrillic("объект", "ru") + assert "ъ" not in result + assert "ь" not in result + + def test_ukrainian(self): + result = transliterate_cyrillic("Київ", "uk") + # Should handle Ukrainian-specific letters + assert "K" in result or "k" in result + + +class TestChineseTransliteration: + """Tests for Chinese (Hanzi to Pinyin) transliteration.""" + + def test_museum_vocabulary(self): + result = transliterate_chinese("博物館") + assert "bo" in result.lower() or "haku" in result.lower() + + def test_national_palace_museum(self): + result = transliterate_chinese("故宮博物院") + # Should contain pinyin for these characters + assert len(result) > 0 + assert result != "故宮博物院" # Should be transliterated + + def test_dongba_museum(self): + result = transliterate_chinese("东巴文化博物院") + assert "dong" in result.lower() + assert "wen" in result.lower() + + +class TestJapaneseTransliteration: + """Tests for Japanese (Kanji/Kana to Romaji) transliteration.""" + + def test_national_museum(self): + result = transliterate_japanese("国立博物館") + assert "koku" in result.lower() + assert "ritsu" in result.lower() + + def test_tokyo_national_museum(self): + result = transliterate_japanese("東京国立博物館") + assert "tou" in result.lower() or "to" in result.lower() + assert "kyou" in result.lower() or "kyo" in result.lower() + + def test_hiragana(self): + result = transliterate_japanese("あいうえお") + assert result == "aiueo" + + def test_katakana(self): + result = 
transliterate_japanese("アイウエオ")
        assert result == "aiueo"


class TestKoreanTransliteration:
    """Tests for Korean (Hangul to Revised Romanization) transliteration."""

    def test_national_museum(self):
        # 국립중앙박물관 = National Museum of Korea
        result = transliterate_korean("국립중앙박물관")
        # Should contain romanized syllables
        assert len(result) > 0
        # Accept both RR ("guk") and McCune-Reischauer-style ("kuk") initials
        assert "guk" in result.lower() or "kuk" in result.lower()

    def test_simple_hangul(self):
        result = transliterate_korean("한글")
        assert "han" in result.lower()


class TestArabicTransliteration:
    """Tests for Arabic script transliteration."""

    def test_national_library(self):
        # المكتبة الوطنية = "the national library"; accept either a
        # consonant-skeleton ("mktb") or vowelized ("maktab") rendering.
        result = transliterate_arabic("المكتبة الوطنية")
        assert "mktb" in result.lower() or "maktab" in result.lower()

    def test_basic_letters(self):
        # كتاب = "book"; only check that core consonants survive.
        result = transliterate_arabic("كتاب")
        assert "k" in result.lower()
        assert "t" in result.lower()


class TestHebrewTransliteration:
    """Tests for Hebrew script transliteration."""

    def test_archive(self):
        # ארכיון = "archive"
        result = transliterate_hebrew("ארכיון")
        # Should contain transliterated letters
        assert len(result) > 0

    def test_basic_letters(self):
        # שלום = "shalom"; ש is expected to map to the digraph "sh".
        result = transliterate_hebrew("שלום")
        assert "sh" in result.lower()


class TestGreekTransliteration:
    """Tests for Greek script transliteration."""

    def test_museum(self):
        # Μουσείο = "museum"; accept both letter-by-letter ("Moyseio",
        # ISO 843 type 2 style) and phonetic ("Mouseio") renderings.
        result = transliterate_greek("Μουσείο")
        assert "Moyseio" in result or "Mouseio" in result

    def test_archaeological(self):
        # Αρχαιολογικό = "archaeological"
        result = transliterate_greek("Αρχαιολογικό")
        assert "Archaiologiko" in result


class TestDevanagariTransliteration:
    """Tests for Devanagari (Hindi/Nepali) transliteration."""

    def test_rajasthan(self):
        result = transliterate_devanagari("राजस्थान")
        # ISO 15919 uses "aa" for long vowels, so "raaj" not "raj"
        assert "raaj" in result.lower() or "raj" in result.lower()

    def test_basic_consonants(self):
        # क = "ka", the first Devanagari consonant
        result = transliterate_devanagari("क")
        assert "k" in result.lower()


class TestThaiTransliteration:
    """Tests for Thai script transliteration (RTGS)."""

    def test_national_archives(self):
        # สำนักหอจดหมายเหตุแห่งชาติ = National Archives of Thailand
        result = transliterate_thai("สำนักหอจดหมายเหตุแห่งชาติ")
        assert "samnak" in result.lower()
        assert "haeng chat" in result.lower()

    def test_national_library(self):
        # สำนักหอสมุดแห่งชาติ = National Library of Thailand
        result = transliterate_thai("สำนักหอสมุดแห่งชาติ")
        assert "ho samut" in result.lower()

    def test_national_museum(self):
        # พิพิธภัณฑสถานแห่งชาติ พระนคร = Bangkok National Museum
        result = transliterate_thai("พิพิธภัณฑสถานแห่งชาติ พระนคร")
        assert "phiphitthaphan" in result.lower()
        assert "phra nakhon" in result.lower()

    def test_siam_society(self):
        # สยามสมาคมในพระบรมราชูปถัมภ์ = Siam Society
        result = transliterate_thai("สยามสมาคมในพระบรมราชูปถัมภ์")
        assert "sayam" in result.lower()
        assert "samakhom" in result.lower()

    def test_wat_temple(self):
        # วัดโพธิ์ราม = Wat Pho Ram
        result = transliterate_thai("วัดโพธิ์ราม")
        assert "wat" in result.lower()
        assert "pho" in result.lower()
        assert "ram" in result.lower()

    def test_empty_without_library(self):
        # Even without pythainlp, should return transliterated result (not empty)
        # NOTE(review): implies a built-in vocabulary fallback in
        # transliterate_thai — confirm against the implementation.
        result = transliterate_thai("กรุงเทพ")
        # Should get 'krung thep' from vocabulary lookup
        assert len(result) > 0


class TestSinhalaTransliteration:
    """Tests for Sinhala script transliteration (ISO 15919)."""

    def test_university_peradeniya(self):
        # පේරාදෙණිය විශ්වවිද් යාලය = University of Peradeniya
        result = transliterate_sinhala("පේරාදෙණිය විශ්වවිද් යාලය")
        assert "peradeniya" in result.lower()
        assert "vishvavid" in result.lower()

    def test_national_museums(self):
        # ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව = Department of National Museums
        result = transliterate_sinhala("ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව")
        assert "jathika" in result.lower()
        assert "kauthukagara" in result.lower()

    def test_basic_consonants(self):
        # Basic consonant test
        result = transliterate_sinhala("ක")  # ka
        assert "k" in result.lower()

    def test_output_not_empty(self):
        # Sinhala should never return empty string
        result = transliterate_sinhala("කොළඹ")  # Colombo
        assert len(result) > 0


class TestKhmerTransliteration:
    """Tests for Khmer script transliteration (UNGEGN)."""

    def test_tuol_sleng(self):
        # សារមន្ទីរទួលស្លែង = Tuol Sleng Genocide Museum
        result = transliterate_khmer("សារមន្ទីរទួលស្លែង")
        assert "tuol sleng" in result.lower()

    def test_phnom_penh(self):
        # ភ្នំពេញ = Phnom Penh
        result = transliterate_khmer("ភ្នំពេញ")
        assert "phnom penh" in result.lower()

    def test_angkor(self):
        # អង្គរ = Angkor
        result = transliterate_khmer("អង្គរ")
        assert "angkor" in result.lower()

    def test_output_not_empty(self):
        # Khmer should never return empty string
        result = transliterate_khmer("សារមន្ទីរ")
        assert len(result) > 0


class TestTransliterateForAbbreviation:
    """Tests for the main abbreviation function."""

    def test_russian_cleanup(self):
        result = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru")
        # Should be clean Latin text
        # Fallback check allows spaces, hyphens and apostrophes in case the
        # implementation keeps word separators.
        assert result.isascii() or all(c.isalnum() or c in " -'" for c in result)

    def test_chinese_cleanup(self):
        result = transliterate_for_abbreviation("东巴文化博物院", "zh")
        # Should be clean Latin text or warning
        # NOTE(review): "[REQUIRES" appears to be a sentinel emitted when no
        # Chinese transliterator is available — confirm against implementation.
        assert result.isascii() or "[REQUIRES" in result

    def test_korean_cleanup(self):
        result = transliterate_for_abbreviation("국립중앙박물관", "ko")
        assert result.isascii()

    def test_special_characters_removed(self):
        # Special characters should be removed for abbreviation
        result = transliterate_for_abbreviation("Test (Museum) & Gallery", "en")
        assert "&" not in result
        assert "(" not in result


class TestIntegration:
    """Integration tests using the main transliterate function."""

    def test_auto_detect_russian(self):
        # Script auto-detection: Cyrillic input, no explicit lang argument.
        result = transliterate("Музей")
        assert result.isascii()

    def test_auto_detect_korean(self):
        result = transliterate("박물관")
        assert result.isascii()

    def test_latin_passthrough(self):
        # Already-Latin text must pass through unchanged.
        result = transliterate("Rijksmuseum Amsterdam")
        assert result == "Rijksmuseum Amsterdam"

    def test_with_explicit_language(self):
        result = transliterate("故宮博物院", lang="zh")
        assert len(result) > 0
        # Should not be original Chinese
        assert "故" not in result or "[REQUIRES" in result


if __name__ == "__main__":
    pytest.main([__file__, "-v"])