feat(ghcid): add diacritics normalization and transliteration scripts

- Add fix_ghcid_diacritics.py for normalizing non-ASCII in GHCIDs
- Add resolve_diacritics_collisions.py for collision handling
- Add transliterate_emic_names.py for non-Latin script handling
- Add transliteration tests
This commit is contained in:
kempersc 2025-12-08 14:59:28 +01:00
parent 6a6557bbe8
commit 891692a4d6
4 changed files with 2212 additions and 0 deletions

View file

@@ -0,0 +1,325 @@
#!/usr/bin/env python3
"""
Fix GHCID abbreviations containing diacritics.
This script normalizes diacritics in GHCID abbreviation components to ASCII,
regenerates UUIDs and numeric IDs, updates GHCID history, and renames files.
Rule: ABBREV-DIACRITICS
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md
Usage:
python scripts/fix_ghcid_diacritics.py --dry-run # Preview changes
python scripts/fix_ghcid_diacritics.py # Apply changes
"""
import argparse
import hashlib
import os
import re
import shutil
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import yaml
# Namespace UUID used for deterministic (content-addressed) UUID v5 generation.
# NOTE(review): this value is actually uuid.NAMESPACE_DNS, not the URL
# namespace (NAMESPACE_URL is 6ba7b811-...). It is kept as-is because
# changing it would silently change every derived UUID.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
# Regex character class matching common diacritic letters that may appear
# in GHCID components.
# NOTE(review): a few listed letters (ß, Ø/ø, Æ/æ, Ł/ł) have no NFD
# decomposition, so plain NFD-based stripping cannot normalize them.
DIACRITICS_PATTERN = re.compile(r'[ČŘŠŽĚŮÁÉÍÓÚÝÄÖÜßÑÇÀÈÌÒÙŁŃŚŹŻĄĘÅØÆŐŰÂÊÎÔÛčřšžěůáéíóúýäöüñçàèìòùłńśźżąęåøæőűâêîôû]')
def normalize_diacritics(text: str) -> str:
    """
    Normalize diacritics to ASCII equivalents.

    Uses Unicode NFD decomposition to separate base characters from
    combining marks, then removes the combining marks.

    BUGFIX: letters with no NFD decomposition (ß, Ø/ø, Æ/æ, Ł/ł, Đ/đ) are
    flagged by DIACRITICS_PATTERN but survive NFD stripping untouched,
    which previously left non-ASCII characters in the "normalized" output.
    They are now mapped through an explicit substitution table first.

    Examples:
        "Č" -> "C"
        "Ř" -> "R"
        "Ö" -> "O"
        "ñ" -> "n"
        "ß" -> "ss"
    """
    # Conventional ASCII replacements for letters NFD cannot decompose.
    special = str.maketrans({
        'ß': 'ss', 'Ø': 'O', 'ø': 'o', 'Æ': 'AE', 'æ': 'ae',
        'Ł': 'L', 'ł': 'l', 'Đ': 'D', 'đ': 'd',
    })
    # NFD decomposition separates base characters from combining marks
    normalized = unicodedata.normalize('NFD', text.translate(special))
    # Remove combining marks (category 'Mn' = Mark, Nonspacing)
    ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    return ascii_text
def has_diacritics_in_ghcid(ghcid: str) -> bool:
    """Return True when any component of *ghcid* contains a diacritic.

    Diacritics can show up in any field:
    - region code (e.g. '31' is fine, but a city code like 'ČB' is not)
    - city code (e.g. 'TŘE' for Třebíč)
    - abbreviation (e.g. 'VHSPAOČRZS')
    """
    return DIACRITICS_PATTERN.search(ghcid) is not None
def has_diacritics_in_abbreviation(ghcid: str) -> bool:
    """Return True when the abbreviation component of *ghcid* has diacritics.

    GHCID format: CC-RR-CCC-T-ABBREV or CC-RR-CCC-T-ABBREV-suffix; the
    abbreviation is the fifth dash-separated field (index 4).
    """
    components = ghcid.split('-')
    # Too few fields means there is no abbreviation component to inspect.
    if len(components) < 5:
        return False
    return DIACRITICS_PATTERN.search(components[4]) is not None
def fix_ghcid_diacritics(ghcid: str) -> str:
    """Return *ghcid* with diacritics normalized in every component.

    Applies normalize_diacritics() to each dash-separated field (country,
    region, city, type, abbreviation, and any suffix) and re-joins them.
    """
    return '-'.join(normalize_diacritics(component) for component in ghcid.split('-'))
def generate_uuid_v5(ghcid_string: str) -> str:
    """Return the deterministic UUID v5 for *ghcid_string* as a string."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8 derived from the SHA-256 hash of a GHCID string.

    The first 16 digest bytes become the UUID body; the version nibble is
    forced to 8 (custom) and the variant bits to the RFC 4122 layout.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])
    raw[6] = 0x80 | (raw[6] & 0x0F)  # version nibble -> 8 (custom)
    raw[8] = 0x80 | (raw[8] & 0x3F)  # variant bits -> RFC 4122
    return str(uuid.UUID(bytes=bytes(raw)))
def generate_numeric_id(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID for *ghcid_string*.

    The value is the first 8 bytes of the SHA-256 digest interpreted as a
    big-endian unsigned integer.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big')
def process_file(file_path: Path, dry_run: bool = True) -> Optional[dict]:
    """
    Process a single YAML file to fix GHCID diacritics.

    Steps (when not a dry run): rewrite the 'ghcid' section (current value,
    UUID v5, SHA-256 UUID v8, numeric ID), prepend a history entry, mirror
    the new values into the 'identifiers' list, write the YAML back, and
    rename the file so its name matches the new GHCID.

    Args:
        file_path: Path to the custodian YAML file.
        dry_run: When True, compute and return the change info without
            modifying or renaming anything.

    Returns:
        Dict with change info, or None if the file is unreadable, has no
        'ghcid' section, or needs no change.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {file_path}: {e}")
        return None
    if not data or 'ghcid' not in data:
        return None
    ghcid_section = data.get('ghcid', {})
    old_ghcid = ghcid_section.get('ghcid_current', '')
    if not has_diacritics_in_ghcid(old_ghcid):
        return None
    # Fix the GHCID
    new_ghcid = fix_ghcid_diacritics(old_ghcid)
    # Unchanged means the diacritic could not be normalized; skip quietly.
    if new_ghcid == old_ghcid:
        return None
    # Generate new identifiers (all derived deterministically from the
    # new GHCID string, so reruns are idempotent).
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()
    change_info = {
        'file': str(file_path),
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_uuid': ghcid_section.get('ghcid_uuid', ''),
        'new_uuid': new_uuid_v5,
        'old_numeric': ghcid_section.get('ghcid_numeric', 0),
        'new_numeric': new_numeric,
    }
    if dry_run:
        return change_info
    # Update ghcid section in place.
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric
    # Keep original as-is (for historical reference)
    # Add history entry for the fix; newest entry goes first.
    ghcid_history = ghcid_section.get('ghcid_history', [])
    # Add new entry at the beginning
    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': f"Normalized diacritics to ASCII per ABBREV-DIACRITICS rule (was: {old_ghcid})"
    }
    # Mark previous entry as superseded (only if it is still open-ended).
    if ghcid_history:
        if 'valid_to' not in ghcid_history[0]:
            ghcid_history[0]['valid_to'] = timestamp_now
    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section
    # Mirror new values into the identifiers section, scheme by scheme.
    identifiers = data.get('identifiers', [])
    for ident in identifiers:
        if ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid
        elif ident.get('identifier_scheme') == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
        elif ident.get('identifier_scheme') == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
        elif ident.get('identifier_scheme') == 'GHCID_NUMERIC':
            # Numeric identifier is stored as a string in this list.
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers
    # Write updated file (allow_unicode preserves any remaining non-ASCII
    # text such as native-language names; key order is kept).
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    # Rename file to match new GHCID (filename is assumed to be the GHCID
    # plus '.yaml' -- TODO confirm this invariant holds for all callers).
    old_filename = file_path.name
    new_filename = f"{new_ghcid}.yaml"
    if old_filename != new_filename:
        new_file_path = file_path.parent / new_filename
        if new_file_path.exists():
            print(f" Warning: Target file already exists: {new_file_path}")
            # Don't rename if target exists
        else:
            shutil.move(str(file_path), str(new_file_path))
            change_info['new_file'] = str(new_file_path)
    return change_info
def find_affected_files(custodian_dir: Path) -> list[Path]:
    """Find all YAML files whose filename contains diacritics.

    Detection is filename-based for speed, since filenames mirror the
    GHCID; the YAML payload is never parsed here.

    BUGFIX: the previous implementation shelled out to `find` (recursive)
    but fell back to a non-recursive glob on failure, so the two paths
    could return different file sets. Path.rglob recurses like `find`
    did, with no subprocess and no inconsistent fallback.

    Args:
        custodian_dir: Directory to scan (searched recursively).

    Returns:
        Paths of regular .yaml files whose stem matches DIACRITICS_PATTERN.
    """
    return [
        yaml_file
        for yaml_file in custodian_dir.rglob("*.yaml")
        # rglob can yield directories with a matching name; keep files only.
        if yaml_file.is_file() and DIACRITICS_PATTERN.search(yaml_file.stem)
    ]
def main():
    """CLI entry point: scan for affected files, preview or apply fixes.

    Returns:
        Process exit code: 0 on success, 1 when the custodian directory
        does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Fix GHCID abbreviations containing diacritics"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Preview changes without modifying files"
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=0,
        help="Limit number of files to process (0 = no limit)"
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('data/custodian'),
        help="Path to custodian directory"
    )
    args = parser.parse_args()
    custodian_dir = args.custodian_dir
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        return 1
    print(f"Scanning {custodian_dir} for files with diacritics in GHCID abbreviation...")
    affected_files = find_affected_files(custodian_dir)
    print(f"Found {len(affected_files)} affected files")
    if args.limit > 0:
        affected_files = affected_files[:args.limit]
        print(f"Limited to {args.limit} files")
    if args.dry_run:
        print("\n=== DRY RUN (no changes will be made) ===\n")
    else:
        print("\n=== APPLYING CHANGES ===\n")
    changes = []
    for i, file_path in enumerate(affected_files, 1):
        print(f"[{i}/{len(affected_files)}] Processing {file_path.name}...")
        change = process_file(file_path, dry_run=args.dry_run)
        if change:
            changes.append(change)
            # BUGFIX: old and new GHCID were printed with no separator
            # between them, producing one unreadable concatenated token.
            print(f"  {change['old_ghcid']} -> {change['new_ghcid']}")
    print("\n=== SUMMARY ===")
    print(f"Files processed: {len(affected_files)}")
    print(f"Files changed: {len(changes)}")
    if args.dry_run and changes:
        print("\nTo apply changes, run without --dry-run flag")
    # Show how the changes are distributed across country codes
    # (first dash-separated field of the GHCID), most affected first.
    if changes:
        countries = {}
        for c in changes:
            cc = c['old_ghcid'].split('-')[0]
            countries[cc] = countries.get(cc, 0) + 1
        print("\nBy country:")
        for cc, count in sorted(countries.items(), key=lambda x: -x[1]):
            print(f"  {cc}: {count}")
    return 0
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit code.
    exit(main())

View file

@@ -0,0 +1,270 @@
#!/usr/bin/env python3
"""
Resolve GHCID collisions caused by diacritics normalization.
When a file with diacritics normalizes to the same GHCID as an existing file,
the diacritics file gets a name suffix per AGENTS.md collision rules.
Usage:
python scripts/resolve_diacritics_collisions.py --dry-run # Preview changes
python scripts/resolve_diacritics_collisions.py # Apply changes
"""
import argparse
import hashlib
import os
import re
import shutil
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import yaml
# Namespace UUID used for deterministic UUID v5 generation.
# NOTE(review): this value equals uuid.NAMESPACE_DNS; kept as-is because
# changing it would change every derived UUID.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
# Regex character class matching common diacritic letters.
# NOTE(review): ß, Ø/ø, Æ/æ and Ł/ł have no NFD decomposition, so plain
# NFD stripping cannot normalize them.
DIACRITICS_PATTERN = re.compile(r'[ČŘŠŽĚŮÁÉÍÓÚÝÄÖÜßÑÇÀÈÌÒÙŁŃŚŹŻĄĘÅØÆŐŰÂÊÎÔÛčřšžěůáéíóúýäöüñçàèìòùłńśźżąęåøæőűâêîôûĎŇŤďňť]')
def normalize_diacritics(text: str) -> str:
    """Normalize diacritics to ASCII equivalents.

    BUGFIX: letters without an NFD decomposition (ß, Ø/ø, Æ/æ, Ł/ł, Đ/đ)
    previously survived normalization untouched even though the diacritics
    pattern flags them; they are now mapped explicitly before NFD
    stripping removes the combining marks.
    """
    # Conventional ASCII replacements for letters NFD cannot decompose.
    special = str.maketrans({
        'ß': 'ss', 'Ø': 'O', 'ø': 'o', 'Æ': 'AE', 'æ': 'ae',
        'Ł': 'L', 'ł': 'l', 'Đ': 'D', 'đ': 'd',
    })
    normalized = unicodedata.normalize('NFD', text.translate(special))
    ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    return ascii_text
def generate_name_suffix(native_name: str) -> str:
    """Convert a native-language institution name to a snake_case suffix.

    Pipeline: strip diacritics, lowercase, drop punctuation, turn runs of
    whitespace/hyphens into underscores, remove anything outside
    [a-z0-9_], and collapse repeated underscores.

    BUGFIX: letters with no NFD decomposition (ß, ø, æ, ł, đ) were
    previously dropped entirely by the final [^a-z0-9_] cleanup
    ("Straße" -> "strae"); they now get explicit ASCII replacements.

    Returns '' when the name yields no usable ASCII characters.
    """
    # Explicit replacements for letters NFD cannot decompose.
    special = str.maketrans({
        'ß': 'ss', 'Ø': 'O', 'ø': 'o', 'Æ': 'AE', 'æ': 'ae',
        'Ł': 'L', 'ł': 'l', 'Đ': 'D', 'đ': 'd',
    })
    # Normalize unicode (NFD decomposition) and remove diacritics
    normalized = unicodedata.normalize('NFD', native_name.translate(special))
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    # Convert to lowercase
    lowercase = ascii_name.lower()
    # Remove apostrophes, commas, and other punctuation
    no_punct = re.sub(r"[''`\",.:;!?()[\]{}‒–—]", '', lowercase)
    # Replace spaces and hyphens with underscores
    underscored = re.sub(r'[\s\-]+', '_', no_punct)
    # Remove any remaining non-alphanumeric characters (except underscores)
    clean = re.sub(r'[^a-z0-9_]', '', underscored)
    # Collapse multiple underscores and trim the ends
    final = re.sub(r'_+', '_', clean).strip('_')
    return final
def generate_uuid_v5(ghcid_string: str) -> str:
    """Return the deterministic UUID v5 for *ghcid_string* as a string."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8 from the SHA-256 hash of a GHCID string.

    First 16 digest bytes form the UUID; version nibble is forced to 8
    (custom) and the variant bits to the RFC 4122 layout.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])
    raw[6] = 0x80 | (raw[6] & 0x0F)  # version -> 8
    raw[8] = 0x80 | (raw[8] & 0x3F)  # variant -> RFC 4122
    return str(uuid.UUID(bytes=bytes(raw)))
def generate_numeric_id(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID: first 8 SHA-256 bytes, big-endian."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big')
def find_collision_pairs(custodian_dir: Path) -> list[tuple[Path, Path, str]]:
    """Find files with diacritics that collide with existing ASCII files.

    A collision exists when a diacritics filename, once normalized to
    ASCII, matches another .yaml file already present in the directory.

    Returns list of (diacritics_file, ascii_file, ascii_ghcid).
    """
    pairs: list[tuple[Path, Path, str]] = []
    for candidate in custodian_dir.glob("*.yaml"):
        stem = candidate.stem  # filename without the .yaml extension
        if DIACRITICS_PATTERN.search(stem) is None:
            continue
        ascii_ghcid = normalize_diacritics(stem)
        counterpart = custodian_dir / f"{ascii_ghcid}.yaml"
        if counterpart.exists():
            pairs.append((candidate, counterpart, ascii_ghcid))
    return pairs
def resolve_collision(diacritics_file: Path, ascii_ghcid: str, dry_run: bool = True) -> Optional[dict]:
    """
    Resolve a collision by adding a name suffix to the diacritics file.

    The diacritics file gets a name suffix since it's being added later.
    The suffixed GHCID gets fresh UUIDs/numeric ID, a history entry, and
    the file is rewritten and renamed to match.

    Args:
        diacritics_file: YAML file whose normalized GHCID collides.
        ascii_ghcid: The already-taken ASCII GHCID it collides with.
        dry_run: When True, compute and return the change info only.

    Returns:
        Dict with change info, or None if the file is unreadable, empty,
        or has no institution name to build a suffix from.
    """
    try:
        with open(diacritics_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {diacritics_file}: {e}")
        return None
    if not data:
        return None
    # Get institution name for suffix (source of the snake_case suffix).
    original_entry = data.get('original_entry', {})
    inst_name = original_entry.get('name', '')
    if not inst_name:
        print(f" Warning: No institution name found in {diacritics_file}")
        return None
    # Generate name suffix
    name_suffix = generate_name_suffix(inst_name)
    # Create new GHCID with name suffix
    new_ghcid = f"{ascii_ghcid}-{name_suffix}"
    # Get old GHCID from file; fall back to the filename stem, which is
    # assumed to mirror the GHCID -- TODO confirm that invariant.
    ghcid_section = data.get('ghcid', {})
    old_ghcid = ghcid_section.get('ghcid_current', diacritics_file.stem)
    # Generate new identifiers (deterministic, so reruns are idempotent).
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()
    change_info = {
        'file': str(diacritics_file),
        'institution_name': inst_name,
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'name_suffix': name_suffix,
    }
    if dry_run:
        return change_info
    # Update ghcid section in place.
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric
    # Add history entry; newest entry goes first, and the previously
    # current entry is closed with a valid_to timestamp.
    ghcid_history = ghcid_section.get('ghcid_history', [])
    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': f"Name suffix added to resolve collision with {ascii_ghcid} (was: {old_ghcid})"
    }
    if ghcid_history and 'valid_to' not in ghcid_history[0]:
        ghcid_history[0]['valid_to'] = timestamp_now
    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section
    # Mirror new values into the identifiers section, scheme by scheme.
    identifiers = data.get('identifiers', [])
    for ident in identifiers:
        if ident.get('identifier_scheme') == 'GHCID':
            ident['identifier_value'] = new_ghcid
        elif ident.get('identifier_scheme') == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
        elif ident.get('identifier_scheme') == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
        elif ident.get('identifier_scheme') == 'GHCID_NUMERIC':
            # Numeric identifier is stored as a string in this list.
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers
    # Write updated file (allow_unicode preserves native-language text).
    with open(diacritics_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    # Rename file to match new GHCID; never clobber an existing target.
    new_filename = f"{new_ghcid}.yaml"
    new_file_path = diacritics_file.parent / new_filename
    if new_file_path.exists():
        print(f" Warning: Target file already exists: {new_file_path}")
    else:
        shutil.move(str(diacritics_file), str(new_file_path))
        change_info['new_file'] = str(new_file_path)
    return change_info
def main():
    """CLI entry point: find collision pairs, preview or resolve them.

    Returns:
        Process exit code: 0 on success, 1 when the custodian directory
        does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Resolve GHCID collisions caused by diacritics normalization"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Preview changes without modifying files"
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('data/custodian'),
        help="Path to custodian directory"
    )
    args = parser.parse_args()
    custodian_dir = args.custodian_dir
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        return 1
    print(f"Scanning {custodian_dir} for diacritics collision pairs...")
    collisions = find_collision_pairs(custodian_dir)
    print(f"Found {len(collisions)} collision pairs\n")
    if args.dry_run:
        print("=== DRY RUN (no changes will be made) ===\n")
    else:
        print("=== APPLYING CHANGES ===\n")
    changes = []
    for i, (diacritics_file, ascii_file, ascii_ghcid) in enumerate(collisions, 1):
        print(f"[{i}/{len(collisions)}] Collision:")
        print(f" Diacritics file: {diacritics_file.name}")
        print(f" Collides with: {ascii_file.name}")
        change = resolve_collision(diacritics_file, ascii_ghcid, dry_run=args.dry_run)
        if change:
            changes.append(change)
            print(f" Institution: {change['institution_name']}")
            # BUGFIX: old and new GHCID were printed with no separator
            # between them, producing one unreadable concatenated token.
            print(f" GHCID change: {change['old_ghcid']} -> {change['new_ghcid']}")
            if 'new_file' in change:
                print(f" New file: {Path(change['new_file']).name}")
        print()
    print("=== SUMMARY ===")
    print(f"Collisions found: {len(collisions)}")
    print(f"Files resolved: {len(changes)}")
    if args.dry_run and changes:
        print("\nTo apply changes, run without --dry-run flag")
    return 0
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit code.
    exit(main())

File diff suppressed because it is too large Load diff

View file

@@ -0,0 +1,350 @@
"""
Unit tests for transliteration functions.
Tests the scripts/transliterate_emic_names.py module for converting
non-Latin script institution names to Latin characters.
"""
import sys
from pathlib import Path
import pytest
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.transliterate_emic_names import (
detect_script,
transliterate,
transliterate_for_abbreviation,
transliterate_cyrillic,
transliterate_chinese,
transliterate_japanese,
transliterate_korean,
transliterate_arabic,
transliterate_hebrew,
transliterate_greek,
transliterate_devanagari,
transliterate_armenian,
transliterate_georgian,
transliterate_thai,
transliterate_sinhala,
transliterate_khmer,
)
class TestScriptDetection:
    """Tests for script detection function.

    Each test feeds a short sample in one writing system and checks the
    detector's returned label. Expected labels must stay byte-exact.
    """

    def test_detect_latin(self):
        assert detect_script("Hello World") == "latin"
        assert detect_script("Rijksmuseum Amsterdam") == "latin"

    def test_detect_cyrillic(self):
        assert detect_script("Институт") == "cyrillic"
        assert detect_script("Музей") == "cyrillic"

    def test_detect_chinese(self):
        assert detect_script("故宮博物院") == "chinese"
        assert detect_script("中国国家图书馆") == "chinese"

    def test_detect_japanese(self):
        # Japanese with hiragana or katakana (kana distinguishes Japanese
        # from Chinese, which shares Han characters).
        assert detect_script("こんにちは") == "japanese"
        assert detect_script("カタカナ") == "japanese"

    def test_detect_korean(self):
        assert detect_script("국립중앙박물관") == "korean"

    def test_detect_arabic(self):
        assert detect_script("المكتبة الوطنية") == "arabic"

    def test_detect_hebrew(self):
        assert detect_script("ארכיון") == "hebrew"

    def test_detect_greek(self):
        assert detect_script("Μουσείο") == "greek"

    def test_detect_devanagari(self):
        assert detect_script("राजस्थान") == "devanagari"

    def test_detect_thai(self):
        assert detect_script("สำนักหอจดหมายเหตุ") == "thai"
        assert detect_script("กรุงเทพ") == "thai"

    def test_detect_sinhala(self):
        assert detect_script("පේරාදෙණිය") == "sinhala"
        assert detect_script("ජාතික කෞතුකාගාර") == "sinhala"

    def test_detect_khmer(self):
        assert detect_script("សារមន្ទីរ") == "khmer"
        assert detect_script("ភ្នំពេញ") == "khmer"
class TestCyrillicTransliteration:
    """Tests for Cyrillic (Russian/Ukrainian/etc.) transliteration.

    The second argument to transliterate_cyrillic is a language code that
    selects language-specific letter mappings.
    """

    def test_russian_basic(self):
        result = transliterate_cyrillic("Музей", "ru")
        assert result == "Muzey"

    def test_russian_institute(self):
        result = transliterate_cyrillic("Институт восточных рукописей РАН", "ru")
        assert "Institut" in result
        assert "vostochnykh" in result

    def test_russian_hard_soft_signs(self):
        # Hard and soft signs should be removed (they carry no sound of
        # their own in transliteration).
        result = transliterate_cyrillic("объект", "ru")
        assert "ъ" not in result
        assert "ь" not in result

    def test_ukrainian(self):
        result = transliterate_cyrillic("Київ", "uk")
        # Should handle Ukrainian-specific letters
        assert "K" in result or "k" in result
class TestChineseTransliteration:
    """Tests for Chinese (Hanzi to Pinyin) transliteration."""

    def test_museum_vocabulary(self):
        # 博物館 = museum; accepts pinyin "bo..." or a "haku" reading
        # (presumably a fallback path -- verify against the module).
        result = transliterate_chinese("博物館")
        assert "bo" in result.lower() or "haku" in result.lower()

    def test_national_palace_museum(self):
        result = transliterate_chinese("故宮博物院")
        # Should contain pinyin for these characters
        assert len(result) > 0
        assert result != "故宮博物院"  # Should be transliterated

    def test_dongba_museum(self):
        result = transliterate_chinese("东巴文化博物院")
        assert "dong" in result.lower()
        assert "wen" in result.lower()
class TestJapaneseTransliteration:
    """Tests for Japanese (Kanji/Kana to Romaji) transliteration."""

    def test_national_museum(self):
        # 国立博物館 = national museum
        result = transliterate_japanese("国立博物館")
        assert "koku" in result.lower()
        assert "ritsu" in result.lower()

    def test_tokyo_national_museum(self):
        # Both long-vowel romanizations ("tou"/"kyou") and the short
        # forms ("to"/"kyo") are accepted.
        result = transliterate_japanese("東京国立博物館")
        assert "tou" in result.lower() or "to" in result.lower()
        assert "kyou" in result.lower() or "kyo" in result.lower()

    def test_hiragana(self):
        result = transliterate_japanese("あいうえお")
        assert result == "aiueo"

    def test_katakana(self):
        result = transliterate_japanese("アイウエオ")
        assert result == "aiueo"
class TestKoreanTransliteration:
    """Tests for Korean (Hangul to Revised Romanization) transliteration."""

    def test_national_museum(self):
        # 국립중앙박물관 = National Museum of Korea
        result = transliterate_korean("국립중앙박물관")
        # Should contain romanized syllables (RR "guk" or MR-style "kuk").
        assert len(result) > 0
        assert "guk" in result.lower() or "kuk" in result.lower()

    def test_simple_hangul(self):
        result = transliterate_korean("한글")
        assert "han" in result.lower()
class TestArabicTransliteration:
    """Tests for Arabic script transliteration.

    Assertions accept either consonant-skeleton ("mktb") or vowelled
    ("maktab") output, since Arabic short vowels are unwritten.
    """

    def test_national_library(self):
        result = transliterate_arabic("المكتبة الوطنية")
        assert "mktb" in result.lower() or "maktab" in result.lower()

    def test_basic_letters(self):
        # كتاب = book; checks the basic k/t consonant mappings.
        result = transliterate_arabic("كتاب")
        assert "k" in result.lower()
        assert "t" in result.lower()
class TestHebrewTransliteration:
    """Tests for Hebrew script transliteration."""

    def test_archive(self):
        # ארכיון = archive
        result = transliterate_hebrew("ארכיון")
        # Should contain transliterated letters
        assert len(result) > 0

    def test_basic_letters(self):
        # שלום = shalom; shin should map to the "sh" digraph.
        result = transliterate_hebrew("שלום")
        assert "sh" in result.lower()
class TestGreekTransliteration:
    """Tests for Greek script transliteration."""

    def test_museum(self):
        # Accepts either the ELOT-style "Moyseio" or phonetic "Mouseio".
        result = transliterate_greek("Μουσείο")
        assert "Moyseio" in result or "Mouseio" in result

    def test_archaeological(self):
        result = transliterate_greek("Αρχαιολογικό")
        assert "Archaiologiko" in result
class TestDevanagariTransliteration:
    """Tests for Devanagari (Hindi/Nepali) transliteration."""

    def test_rajasthan(self):
        result = transliterate_devanagari("राजस्थान")
        # ISO 15919 uses "aa" for long vowels, so "raaj" not "raj"
        assert "raaj" in result.lower() or "raj" in result.lower()

    def test_basic_consonants(self):
        # BUGFIX: the input string was empty (the Devanagari letter was
        # lost/garbled), so the "k" assertion could never pass. Use the
        # letter KA (क), matching the test's intent.
        result = transliterate_devanagari("क")
        assert "k" in result.lower()
class TestThaiTransliteration:
    """Tests for Thai script transliteration (RTGS).

    Expected substrings follow the Royal Thai General System of
    Transcription spacing (one space between syllable groups).
    """

    def test_national_archives(self):
        # สำนักหอจดหมายเหตุแห่งชาติ = National Archives of Thailand
        result = transliterate_thai("สำนักหอจดหมายเหตุแห่งชาติ")
        assert "samnak" in result.lower()
        assert "haeng chat" in result.lower()

    def test_national_library(self):
        # สำนักหอสมุดแห่งชาติ = National Library of Thailand
        result = transliterate_thai("สำนักหอสมุดแห่งชาติ")
        assert "ho samut" in result.lower()

    def test_national_museum(self):
        # พิพิธภัณฑสถานแห่งชาติ พระนคร = Bangkok National Museum
        result = transliterate_thai("พิพิธภัณฑสถานแห่งชาติ พระนคร")
        assert "phiphitthaphan" in result.lower()
        assert "phra nakhon" in result.lower()

    def test_siam_society(self):
        # สยามสมาคมในพระบรมราชูปถัมภ์ = Siam Society
        result = transliterate_thai("สยามสมาคมในพระบรมราชูปถัมภ์")
        assert "sayam" in result.lower()
        assert "samakhom" in result.lower()

    def test_wat_temple(self):
        # วัดโพธิ์ราม = Wat Pho Ram
        result = transliterate_thai("วัดโพธิ์ราม")
        assert "wat" in result.lower()
        assert "pho" in result.lower()
        assert "ram" in result.lower()

    def test_empty_without_library(self):
        # Even without pythainlp, should return transliterated result (not empty)
        result = transliterate_thai("กรุงเทพ")
        # Should get 'krung thep' from vocabulary lookup
        assert len(result) > 0
class TestSinhalaTransliteration:
    """Tests for Sinhala script transliteration (ISO 15919)."""

    def test_university_peradeniya(self):
        # පේරාදෙණිය විශ්වවිද් යාලය = University of Peradeniya
        result = transliterate_sinhala("පේරාදෙණිය විශ්වවිද් යාලය")
        assert "peradeniya" in result.lower()
        assert "vishvavid" in result.lower()

    def test_national_museums(self):
        # ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව = Department of National Museums
        result = transliterate_sinhala("ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව")
        assert "jathika" in result.lower()
        assert "kauthukagara" in result.lower()

    def test_basic_consonants(self):
        # BUGFIX: the input string was empty (the Sinhala letter was
        # lost/garbled), so the "k" assertion could never pass. Use the
        # letter KA (ක), matching the original "# ka" comment.
        result = transliterate_sinhala("ක")  # ka
        assert "k" in result.lower()

    def test_output_not_empty(self):
        # Sinhala should never return empty string
        result = transliterate_sinhala("කොළඹ")  # Colombo
        assert len(result) > 0
class TestKhmerTransliteration:
    """Tests for Khmer script transliteration (UNGEGN)."""

    def test_tuol_sleng(self):
        # សារមន្ទីរទួលស្លែង = Tuol Sleng Genocide Museum
        result = transliterate_khmer("សារមន្ទីរទួលស្លែង")
        assert "tuol sleng" in result.lower()

    def test_phnom_penh(self):
        # ភ្នំពេញ = Phnom Penh
        result = transliterate_khmer("ភ្នំពេញ")
        assert "phnom penh" in result.lower()

    def test_angkor(self):
        # អង្គរ = Angkor
        result = transliterate_khmer("អង្គរ")
        assert "angkor" in result.lower()

    def test_output_not_empty(self):
        # Khmer should never return empty string
        result = transliterate_khmer("សារមន្ទីរ")
        assert len(result) > 0
class TestTransliterateForAbbreviation:
    """Tests for the main abbreviation function.

    transliterate_for_abbreviation(text, lang) is expected to produce
    text clean enough to build a GHCID abbreviation from (Latin letters,
    digits, and a few separators).
    """

    def test_russian_cleanup(self):
        result = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru")
        # Should be clean Latin text
        assert result.isascii() or all(c.isalnum() or c in " -'" for c in result)

    def test_chinese_cleanup(self):
        result = transliterate_for_abbreviation("东巴文化博物院", "zh")
        # Should be clean Latin text or warning ("[REQUIRES..." marks
        # input the module could not transliterate automatically).
        assert result.isascii() or "[REQUIRES" in result

    def test_korean_cleanup(self):
        result = transliterate_for_abbreviation("국립중앙박물관", "ko")
        assert result.isascii()

    def test_special_characters_removed(self):
        # Special characters should be removed for abbreviation
        result = transliterate_for_abbreviation("Test (Museum) & Gallery", "en")
        assert "&" not in result
        assert "(" not in result
class TestIntegration:
    """Integration tests using the main transliterate function."""

    def test_auto_detect_russian(self):
        result = transliterate("Музей")
        assert result.isascii()

    def test_auto_detect_korean(self):
        result = transliterate("박물관")
        assert result.isascii()

    def test_latin_passthrough(self):
        result = transliterate("Rijksmuseum Amsterdam")
        assert result == "Rijksmuseum Amsterdam"

    def test_with_explicit_language(self):
        result = transliterate("故宮博物院", lang="zh")
        assert len(result) > 0
        # Should not contain the original Chinese characters.
        # BUGFIX: the previous check was `"" not in result`, which is
        # always False (the empty string is a substring of every string),
        # so the assert degenerated into requiring the "[REQUIRES" marker
        # and failed whenever transliteration actually succeeded.
        assert all(ch not in result for ch in "故宮博物院") or "[REQUIRES" in result
if __name__ == "__main__":
pytest.main([__file__, "-v"])