feat(ghcid): add diacritics normalization and transliteration scripts
- Add fix_ghcid_diacritics.py for normalizing non-ASCII in GHCIDs - Add resolve_diacritics_collisions.py for collision handling - Add transliterate_emic_names.py for non-Latin script handling - Add transliteration tests
This commit is contained in:
parent
6a6557bbe8
commit
891692a4d6
4 changed files with 2212 additions and 0 deletions
325
scripts/fix_ghcid_diacritics.py
Normal file
325
scripts/fix_ghcid_diacritics.py
Normal file
|
|
@@ -0,0 +1,325 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix GHCID abbreviations containing diacritics.
|
||||
|
||||
This script normalizes diacritics in GHCID abbreviation components to ASCII,
|
||||
regenerates UUIDs and numeric IDs, updates GHCID history, and renames files.
|
||||
|
||||
Rule: ABBREV-DIACRITICS
|
||||
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md
|
||||
|
||||
Usage:
|
||||
python scripts/fix_ghcid_diacritics.py --dry-run # Preview changes
|
||||
python scripts/fix_ghcid_diacritics.py # Apply changes
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import unicodedata
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
# GHCID namespace UUID for deterministic UUID generation
|
||||
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8") # URL namespace
|
||||
|
||||
# Regex pattern for common diacritics
|
||||
DIACRITICS_PATTERN = re.compile(r'[ČŘŠŽĚŮÁÉÍÓÚÝÄÖÜßÑÇÀÈÌÒÙŁŃŚŹŻĄĘÅØÆŐŰÂÊÎÔÛčřšžěůáéíóúýäöüñçàèìòùłńśźżąęåøæőűâêîôû]')
|
||||
|
||||
|
||||
# Letters with NO NFD decomposition (they carry no combining mark), mapped to
# conventional ASCII replacements.  Without this table, names containing e.g.
# 'Ł', 'Ø', 'Æ' or 'ß' match DIACRITICS_PATTERN but pass through NFD
# stripping unchanged, so the affected GHCIDs would never be fixed.
_NON_DECOMPOSABLE = str.maketrans({
    'ß': 'ss', 'ẞ': 'SS',
    'Ø': 'O', 'ø': 'o',
    'Æ': 'AE', 'æ': 'ae',
    'Ł': 'L', 'ł': 'l',
    'Đ': 'D', 'đ': 'd',
})


def normalize_diacritics(text: str) -> str:
    """
    Normalize diacritics to ASCII equivalents.

    Uses Unicode NFD decomposition to separate base characters from
    combining marks, then removes the combining marks.  Letters that do
    not decompose under NFD (ß, Ø, Æ, Ł, Đ) are mapped explicitly first.

    Examples:
        "Č" → "C"
        "Ř" → "R"
        "Ö" → "O"
        "ñ" → "n"
        "Ł" → "L"
    """
    # Map the non-decomposable letters first; NFD cannot touch them.
    text = text.translate(_NON_DECOMPOSABLE)
    # NFD decomposition separates base characters from combining marks
    normalized = unicodedata.normalize('NFD', text)
    # Remove combining marks (category 'Mn' = Mark, Nonspacing)
    ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    return ascii_text
|
||||
|
||||
|
||||
def has_diacritics_in_ghcid(ghcid: str) -> bool:
    """Check if GHCID contains any diacritics (in any component).

    Diacritics can appear in:
    - Region code (e.g., '31' is fine, but city code 'ČB' has diacritics)
    - City code (e.g., 'TŘE' for Třebíč)
    - Abbreviation (e.g., 'VHSPAOČRZS')
    """
    return DIACRITICS_PATTERN.search(ghcid) is not None
|
||||
|
||||
|
||||
def has_diacritics_in_abbreviation(ghcid: str) -> bool:
    """Check if GHCID abbreviation component contains diacritics."""
    # GHCID format: CC-RR-CCC-T-ABBREV or CC-RR-CCC-T-ABBREV-suffix;
    # the abbreviation is the fifth dash-separated component (index 4).
    components = ghcid.split('-')
    if len(components) < 5:
        return False
    return DIACRITICS_PATTERN.search(components[4]) is not None
|
||||
|
||||
|
||||
def fix_ghcid_diacritics(ghcid: str) -> str:
    """
    Fix diacritics in ALL GHCID components.

    Normalizes diacritics in all parts: country, region, city, type,
    abbreviation, and any suffix components.
    """
    # Normalize each dash-separated component and reassemble.
    return '-'.join(normalize_diacritics(component) for component in ghcid.split('-'))
|
||||
|
||||
|
||||
def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate deterministic UUID v5 from GHCID string."""
    # uuid5 is name-based (SHA-1 over namespace + name), so the same GHCID
    # always yields the same UUID.
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
||||
|
||||
|
||||
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 from SHA-256 hash of GHCID string."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])  # a UUID is exactly 16 bytes
    # High nibble of byte 6 is the version field: force it to 8 (custom).
    raw[6] = (raw[6] & 0x0F) | 0x80
    # Top two bits of byte 8 are the variant field: force 10xx (RFC 4122).
    raw[8] = (raw[8] & 0x3F) | 0x80
    return str(uuid.UUID(bytes=bytes(raw)))
|
||||
|
||||
|
||||
def generate_numeric_id(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from SHA-256 hash."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    # First 8 bytes, big-endian, interpreted as an unsigned 64-bit integer.
    return int.from_bytes(digest[:8], byteorder='big')
|
||||
|
||||
|
||||
def process_file(file_path: Path, dry_run: bool = True) -> Optional[dict]:
    """
    Process a single YAML file to fix GHCID diacritics.

    Normalizes the GHCID, regenerates all derived identifiers (UUID v5,
    SHA-256-based UUID v8, 64-bit numeric ID), prepends a GHCID history
    entry, updates the identifiers section, rewrites the file, and renames
    it to match the new GHCID.

    Args:
        file_path: Path to the custodian YAML file.
        dry_run: When True, compute and report the change without touching
            the file.

    Returns:
        Dict with change info (old/new GHCID, UUIDs, numeric IDs and, after
        a rename, 'new_file'), or None if no change is needed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {file_path}: {e}")
        return None

    if not data or 'ghcid' not in data:
        return None

    # 'or' guards: YAML can contain an explicit null for a present key, in
    # which case .get(key, default) returns None and would crash below.
    ghcid_section = data.get('ghcid') or {}
    old_ghcid = ghcid_section.get('ghcid_current') or ''

    if not has_diacritics_in_ghcid(old_ghcid):
        return None

    # Fix the GHCID
    new_ghcid = fix_ghcid_diacritics(old_ghcid)

    # Normalization can be a no-op (e.g. letters the normalizer cannot map);
    # nothing to do in that case.
    if new_ghcid == old_ghcid:
        return None

    # Regenerate every identifier that is derived from the GHCID string.
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()

    change_info = {
        'file': str(file_path),
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_uuid': ghcid_section.get('ghcid_uuid', ''),
        'new_uuid': new_uuid_v5,
        'old_numeric': ghcid_section.get('ghcid_numeric', 0),
        'new_numeric': new_numeric,
    }

    if dry_run:
        return change_info

    # Update ghcid section (the original GHCID is preserved in the history).
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric

    # Add history entry for the fix ('or []' guards an explicit YAML null).
    ghcid_history = ghcid_section.get('ghcid_history') or []

    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': f"Normalized diacritics to ASCII per ABBREV-DIACRITICS rule (was: {old_ghcid})"
    }

    # Mark the previous (most recent) entry as superseded.
    if ghcid_history and 'valid_to' not in ghcid_history[0]:
        ghcid_history[0]['valid_to'] = timestamp_now

    # New entry goes first: history is newest-first.
    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section

    # Update identifiers section: every GHCID-derived scheme gets its new value.
    identifiers = data.get('identifiers') or []
    for ident in identifiers:
        scheme = ident.get('identifier_scheme')
        if scheme == 'GHCID':
            ident['identifier_value'] = new_ghcid
        elif scheme == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
        elif scheme == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
        elif scheme == 'GHCID_NUMERIC':
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers

    # Write updated file
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file to match new GHCID
    new_filename = f"{new_ghcid}.yaml"
    if file_path.name != new_filename:
        new_file_path = file_path.parent / new_filename
        if new_file_path.exists():
            # Don't rename if target exists
            print(f" Warning: Target file already exists: {new_file_path}")
        else:
            shutil.move(str(file_path), str(new_file_path))
            change_info['new_file'] = str(new_file_path)

    return change_info
|
||||
|
||||
|
||||
def find_affected_files(custodian_dir: Path) -> list[Path]:
    """Find all YAML files with diacritics in their GHCID.

    Uses filename-based detection for speed, since filenames match GHCID
    (no need to parse the YAML just to read the GHCID back out).

    Args:
        custodian_dir: Directory containing custodian YAML files.

    Returns:
        Paths whose stem (the GHCID) matches DIACRITICS_PATTERN.
    """
    # Path.rglob replaces the previous subprocess call to the external
    # 'find' binary: it is portable (no POSIX tool dependency) and, unlike
    # the old glob("*.yaml") fallback, searches subdirectories recursively
    # exactly as 'find' did — the two code paths previously disagreed.
    return [
        yaml_file
        for yaml_file in custodian_dir.rglob("*.yaml")
        if DIACRITICS_PATTERN.search(yaml_file.stem)
    ]
|
||||
|
||||
|
||||
def main():
    """CLI entry point: scan for affected files, then fix or preview them."""
    parser = argparse.ArgumentParser(
        description="Fix GHCID abbreviations containing diacritics"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Preview changes without modifying files"
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=0,
        help="Limit number of files to process (0 = no limit)"
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('data/custodian'),
        help="Path to custodian directory"
    )

    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        # Non-zero exit status for the shell (returned via exit(main())).
        return 1

    print(f"Scanning {custodian_dir} for files with diacritics in GHCID abbreviation...")
    affected_files = find_affected_files(custodian_dir)

    print(f"Found {len(affected_files)} affected files")

    # Optional cap on how many files are touched in one run.
    if args.limit > 0:
        affected_files = affected_files[:args.limit]
        print(f"Limited to {args.limit} files")

    if args.dry_run:
        print("\n=== DRY RUN (no changes will be made) ===\n")
    else:
        print("\n=== APPLYING CHANGES ===\n")

    changes = []
    for i, file_path in enumerate(affected_files, 1):
        print(f"[{i}/{len(affected_files)}] Processing {file_path.name}...")
        # process_file returns None when nothing needed changing.
        change = process_file(file_path, dry_run=args.dry_run)
        if change:
            changes.append(change)
            print(f" {change['old_ghcid']} → {change['new_ghcid']}")

    print(f"\n=== SUMMARY ===")
    print(f"Files processed: {len(affected_files)}")
    print(f"Files changed: {len(changes)}")

    if args.dry_run and changes:
        print("\nTo apply changes, run without --dry-run flag")

    # Show country distribution
    if changes:
        countries = {}
        for c in changes:
            # Country code is the first dash-separated GHCID component.
            cc = c['old_ghcid'].split('-')[0]
            countries[cc] = countries.get(cc, 0) + 1

        print("\nBy country:")
        # Most-affected countries first (descending count).
        for cc, count in sorted(countries.items(), key=lambda x: -x[1]):
            print(f" {cc}: {count}")

    return 0


if __name__ == '__main__':
    exit(main())
|
||||
270
scripts/resolve_diacritics_collisions.py
Normal file
270
scripts/resolve_diacritics_collisions.py
Normal file
|
|
@@ -0,0 +1,270 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Resolve GHCID collisions caused by diacritics normalization.
|
||||
|
||||
When a file with diacritics normalizes to the same GHCID as an existing file,
|
||||
the diacritics file gets a name suffix per AGENTS.md collision rules.
|
||||
|
||||
Usage:
|
||||
python scripts/resolve_diacritics_collisions.py --dry-run # Preview changes
|
||||
python scripts/resolve_diacritics_collisions.py # Apply changes
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import unicodedata
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
# GHCID namespace UUID for deterministic UUID generation
|
||||
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
|
||||
|
||||
# Regex pattern for common diacritics
|
||||
DIACRITICS_PATTERN = re.compile(r'[ČŘŠŽĚŮÁÉÍÓÚÝÄÖÜßÑÇÀÈÌÒÙŁŃŚŹŻĄĘÅØÆŐŰÂÊÎÔÛčřšžěůáéíóúýäöüñçàèìòùłńśźżąęåøæőűâêîôûĎŇŤďňť]')
|
||||
|
||||
|
||||
# Letters with NO NFD decomposition (they carry no combining mark), mapped to
# conventional ASCII replacements.  Without this table, names containing e.g.
# 'Ł', 'Ø', 'Æ' or 'ß' match DIACRITICS_PATTERN but pass through unchanged.
_NON_DECOMPOSABLE = str.maketrans({
    'ß': 'ss', 'ẞ': 'SS',
    'Ø': 'O', 'ø': 'o',
    'Æ': 'AE', 'æ': 'ae',
    'Ł': 'L', 'ł': 'l',
    'Đ': 'D', 'đ': 'd',
})


def normalize_diacritics(text: str) -> str:
    """Normalize diacritics to ASCII equivalents.

    NFD-decomposes the text and strips combining marks; letters that do not
    decompose under NFD (ß, Ø, Æ, Ł, Đ) are mapped explicitly first.
    """
    text = text.translate(_NON_DECOMPOSABLE)
    normalized = unicodedata.normalize('NFD', text)
    ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    return ascii_text
|
||||
|
||||
|
||||
def generate_name_suffix(native_name: str) -> str:
    """Convert native language institution name to snake_case suffix."""
    # Strip diacritics: NFD-decompose, then drop nonspacing combining marks.
    decomposed = unicodedata.normalize('NFD', native_name)
    ascii_only = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Lowercase, drop punctuation, turn whitespace/hyphen runs into
    # underscores, drop anything non-alphanumeric that remains, then
    # collapse repeated underscores and trim them from the ends.
    text = ascii_only.lower()
    text = re.sub(r"[''`\",.:;!?()[\]{}‒–—]", '', text)
    text = re.sub(r'[\s\-]+', '_', text)
    text = re.sub(r'[^a-z0-9_]', '', text)
    return re.sub(r'_+', '_', text).strip('_')
|
||||
|
||||
|
||||
def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate deterministic UUID v5 from GHCID string."""
    # Name-based UUID: same GHCID in, same UUID out.
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
||||
|
||||
|
||||
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 from SHA-256 hash of GHCID string."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])  # a UUID is exactly 16 bytes
    raw[6] = (raw[6] & 0x0F) | 0x80  # version nibble -> 8 (custom)
    raw[8] = (raw[8] & 0x3F) | 0x80  # variant bits -> 10xx (RFC 4122)
    return str(uuid.UUID(bytes=bytes(raw)))
|
||||
|
||||
|
||||
def generate_numeric_id(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from SHA-256 hash."""
    # First 8 digest bytes, big-endian, as an unsigned 64-bit integer.
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], byteorder='big')
|
||||
|
||||
|
||||
def find_collision_pairs(custodian_dir: Path) -> list[tuple[Path, Path, str]]:
    """Find files with diacritics that collide with existing ASCII files.

    Returns list of (diacritics_file, ascii_file, ascii_ghcid).
    """
    pairs = []
    for candidate in custodian_dir.glob("*.yaml"):
        stem = candidate.stem  # filename without .yaml — this is the GHCID
        if not DIACRITICS_PATTERN.search(stem):
            continue
        # Would this GHCID, once normalized to ASCII, clash with a file
        # that already exists under the normalized name?
        normalized_stem = normalize_diacritics(stem)
        target = custodian_dir / f"{normalized_stem}.yaml"
        if target.exists():
            pairs.append((candidate, target, normalized_stem))
    return pairs
|
||||
|
||||
|
||||
def resolve_collision(diacritics_file: Path, ascii_ghcid: str, dry_run: bool = True) -> Optional[dict]:
    """
    Resolve a collision by adding a name suffix to the diacritics file.

    The diacritics file gets a name suffix since it's being added later:
    its new GHCID is "<ascii_ghcid>-<snake_case institution name>".  All
    derived identifiers are regenerated, a history entry is prepended, and
    the file is rewritten and renamed.

    Args:
        diacritics_file: YAML file whose normalized GHCID collides.
        ascii_ghcid: The already-taken ASCII GHCID it collides with.
        dry_run: When True, compute and report the change without touching
            the file.

    Returns:
        Dict with change info, or None when the file is unreadable, empty,
        or has no institution name to build a suffix from.
    """
    try:
        with open(diacritics_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {diacritics_file}: {e}")
        return None

    if not data:
        return None

    # Get institution name for suffix.  'or' guards: YAML can hold an
    # explicit null for a present key, making .get(key, default) return None.
    original_entry = data.get('original_entry') or {}
    inst_name = original_entry.get('name') or ''

    if not inst_name:
        print(f" Warning: No institution name found in {diacritics_file}")
        return None

    # Generate name suffix
    name_suffix = generate_name_suffix(inst_name)

    # Create new GHCID with name suffix
    new_ghcid = f"{ascii_ghcid}-{name_suffix}"

    # Get old GHCID from file; fall back to the filename stem.
    ghcid_section = data.get('ghcid') or {}
    old_ghcid = ghcid_section.get('ghcid_current') or diacritics_file.stem

    # Regenerate every identifier derived from the GHCID string.
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()

    change_info = {
        'file': str(diacritics_file),
        'institution_name': inst_name,
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'name_suffix': name_suffix,
    }

    if dry_run:
        return change_info

    # Update ghcid section
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric

    # Add history entry ('or []' guards an explicit YAML null).
    ghcid_history = ghcid_section.get('ghcid_history') or []

    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': f"Name suffix added to resolve collision with {ascii_ghcid} (was: {old_ghcid})"
    }

    # Mark the previous (most recent) entry as superseded.
    if ghcid_history and 'valid_to' not in ghcid_history[0]:
        ghcid_history[0]['valid_to'] = timestamp_now

    # New entry goes first: history is newest-first.
    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section

    # Update identifiers section: every GHCID-derived scheme gets its new value.
    identifiers = data.get('identifiers') or []
    for ident in identifiers:
        scheme = ident.get('identifier_scheme')
        if scheme == 'GHCID':
            ident['identifier_value'] = new_ghcid
        elif scheme == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
        elif scheme == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
        elif scheme == 'GHCID_NUMERIC':
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers

    # Write updated file
    with open(diacritics_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file to match new GHCID
    new_file_path = diacritics_file.parent / f"{new_ghcid}.yaml"

    if new_file_path.exists():
        print(f" Warning: Target file already exists: {new_file_path}")
    else:
        shutil.move(str(diacritics_file), str(new_file_path))
        change_info['new_file'] = str(new_file_path)

    return change_info
|
||||
|
||||
|
||||
def main():
    """CLI entry point: find collision pairs, then resolve or preview them."""
    parser = argparse.ArgumentParser(
        description="Resolve GHCID collisions caused by diacritics normalization"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Preview changes without modifying files"
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('data/custodian'),
        help="Path to custodian directory"
    )

    args = parser.parse_args()
    custodian_dir = args.custodian_dir

    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        # Non-zero exit status for the shell (returned via exit(main())).
        return 1

    print(f"Scanning {custodian_dir} for diacritics collision pairs...")
    collisions = find_collision_pairs(custodian_dir)

    print(f"Found {len(collisions)} collision pairs\n")

    if args.dry_run:
        print("=== DRY RUN (no changes will be made) ===\n")
    else:
        print("=== APPLYING CHANGES ===\n")

    changes = []
    for i, (diacritics_file, ascii_file, ascii_ghcid) in enumerate(collisions, 1):
        print(f"[{i}/{len(collisions)}] Collision:")
        print(f" Diacritics file: {diacritics_file.name}")
        print(f" Collides with: {ascii_file.name}")

        # resolve_collision returns None when the file can't be processed.
        change = resolve_collision(diacritics_file, ascii_ghcid, dry_run=args.dry_run)

        if change:
            changes.append(change)
            print(f" Institution: {change['institution_name']}")
            print(f" GHCID change: {change['old_ghcid']} → {change['new_ghcid']}")
            # 'new_file' is only present when the rename actually happened.
            if 'new_file' in change:
                print(f" New file: {Path(change['new_file']).name}")
        print()

    print(f"=== SUMMARY ===")
    print(f"Collisions found: {len(collisions)}")
    print(f"Files resolved: {len(changes)}")

    if args.dry_run and changes:
        print("\nTo apply changes, run without --dry-run flag")

    return 0


if __name__ == '__main__':
    exit(main())
|
||||
1267
scripts/transliterate_emic_names.py
Normal file
1267
scripts/transliterate_emic_names.py
Normal file
File diff suppressed because it is too large
Load diff
350
tests/test_transliteration.py
Normal file
350
tests/test_transliteration.py
Normal file
|
|
@@ -0,0 +1,350 @@
|
|||
"""
|
||||
Unit tests for transliteration functions.
|
||||
|
||||
Tests the scripts/transliterate_emic_names.py module for converting
|
||||
non-Latin script institution names to Latin characters.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from scripts.transliterate_emic_names import (
|
||||
detect_script,
|
||||
transliterate,
|
||||
transliterate_for_abbreviation,
|
||||
transliterate_cyrillic,
|
||||
transliterate_chinese,
|
||||
transliterate_japanese,
|
||||
transliterate_korean,
|
||||
transliterate_arabic,
|
||||
transliterate_hebrew,
|
||||
transliterate_greek,
|
||||
transliterate_devanagari,
|
||||
transliterate_armenian,
|
||||
transliterate_georgian,
|
||||
transliterate_thai,
|
||||
transliterate_sinhala,
|
||||
transliterate_khmer,
|
||||
)
|
||||
|
||||
|
||||
class TestScriptDetection:
    """Tests for script detection function.

    Each test asserts that detect_script returns the expected lowercase
    script name for a sample string in that script.
    """

    def test_detect_latin(self):
        assert detect_script("Hello World") == "latin"
        assert detect_script("Rijksmuseum Amsterdam") == "latin"

    def test_detect_cyrillic(self):
        assert detect_script("Институт") == "cyrillic"
        assert detect_script("Музей") == "cyrillic"

    def test_detect_chinese(self):
        assert detect_script("故宮博物院") == "chinese"
        assert detect_script("中国国家图书馆") == "chinese"

    def test_detect_japanese(self):
        # Japanese with hiragana or katakana
        assert detect_script("こんにちは") == "japanese"
        assert detect_script("カタカナ") == "japanese"

    def test_detect_korean(self):
        assert detect_script("국립중앙박물관") == "korean"

    def test_detect_arabic(self):
        assert detect_script("المكتبة الوطنية") == "arabic"

    def test_detect_hebrew(self):
        assert detect_script("ארכיון") == "hebrew"

    def test_detect_greek(self):
        assert detect_script("Μουσείο") == "greek"

    def test_detect_devanagari(self):
        assert detect_script("राजस्थान") == "devanagari"

    def test_detect_thai(self):
        assert detect_script("สำนักหอจดหมายเหตุ") == "thai"
        assert detect_script("กรุงเทพ") == "thai"

    def test_detect_sinhala(self):
        assert detect_script("පේරාදෙණිය") == "sinhala"
        assert detect_script("ජාතික කෞතුකාගාර") == "sinhala"

    def test_detect_khmer(self):
        assert detect_script("សារមន្ទីរ") == "khmer"
        assert detect_script("ភ្នំពេញ") == "khmer"
|
||||
|
||||
|
||||
class TestCyrillicTransliteration:
    """Tests for Cyrillic (Russian/Ukrainian/etc.) transliteration."""

    def test_russian_basic(self):
        assert transliterate_cyrillic("Музей", "ru") == "Muzey"

    def test_russian_institute(self):
        result = transliterate_cyrillic("Институт восточных рукописей РАН", "ru")
        assert "Institut" in result
        assert "vostochnykh" in result

    def test_russian_hard_soft_signs(self):
        # Hard and soft signs should be removed, not transliterated.
        result = transliterate_cyrillic("объект", "ru")
        assert "ъ" not in result
        assert "ь" not in result

    def test_ukrainian(self):
        result = transliterate_cyrillic("Київ", "uk")
        # Should handle Ukrainian-specific letters; only the initial letter
        # is pinned here since romanization schemes vary.
        assert "K" in result or "k" in result
|
||||
|
||||
|
||||
class TestChineseTransliteration:
    """Tests for Chinese (Hanzi to Pinyin) transliteration."""

    def test_museum_vocabulary(self):
        result = transliterate_chinese("博物館")
        # Accepts either pinyin ("bo...") or a Japanese-style reading
        # ("haku...") for this shared CJK vocabulary.
        assert "bo" in result.lower() or "haku" in result.lower()

    def test_national_palace_museum(self):
        result = transliterate_chinese("故宮博物院")
        # Should contain pinyin for these characters
        assert len(result) > 0
        assert result != "故宮博物院"  # Should be transliterated

    def test_dongba_museum(self):
        result = transliterate_chinese("东巴文化博物院")
        assert "dong" in result.lower()
        assert "wen" in result.lower()
|
||||
|
||||
|
||||
class TestJapaneseTransliteration:
    """Tests for Japanese (Kanji/Kana to Romaji) transliteration."""

    def test_national_museum(self):
        result = transliterate_japanese("国立博物館")
        assert "koku" in result.lower()
        assert "ritsu" in result.lower()

    def test_tokyo_national_museum(self):
        result = transliterate_japanese("東京国立博物館")
        # Accept both long-vowel styles ("tou"/"kyou" vs "to"/"kyo").
        assert "tou" in result.lower() or "to" in result.lower()
        assert "kyou" in result.lower() or "kyo" in result.lower()

    def test_hiragana(self):
        assert transliterate_japanese("あいうえお") == "aiueo"

    def test_katakana(self):
        assert transliterate_japanese("アイウエオ") == "aiueo"
|
||||
|
||||
|
||||
class TestKoreanTransliteration:
    """Tests for Korean (Hangul to Revised Romanization) transliteration."""

    def test_national_museum(self):
        result = transliterate_korean("국립중앙박물관")
        # Should contain romanized syllables; both RR ("guk") and
        # McCune-Reischauer-style ("kuk") initials are accepted.
        assert len(result) > 0
        assert "guk" in result.lower() or "kuk" in result.lower()

    def test_simple_hangul(self):
        result = transliterate_korean("한글")
        assert "han" in result.lower()
|
||||
|
||||
|
||||
class TestArabicTransliteration:
    """Tests for Arabic script transliteration."""

    def test_national_library(self):
        result = transliterate_arabic("المكتبة الوطنية")
        # Accept both consonant-only ("mktb") and vowelized ("maktab") forms.
        assert "mktb" in result.lower() or "maktab" in result.lower()

    def test_basic_letters(self):
        result = transliterate_arabic("كتاب")
        assert "k" in result.lower()
        assert "t" in result.lower()
|
||||
|
||||
|
||||
class TestHebrewTransliteration:
    """Tests for Hebrew script transliteration."""

    def test_archive(self):
        result = transliterate_hebrew("ארכיון")
        # Should contain transliterated letters (exact scheme not pinned).
        assert len(result) > 0

    def test_basic_letters(self):
        result = transliterate_hebrew("שלום")
        assert "sh" in result.lower()
|
||||
|
||||
|
||||
class TestGreekTransliteration:
    """Tests for Greek script transliteration."""

    def test_museum(self):
        result = transliterate_greek("Μουσείο")
        # Accept both letter-for-letter ("Moyseio") and phonetic ("Mouseio").
        assert "Moyseio" in result or "Mouseio" in result

    def test_archaeological(self):
        result = transliterate_greek("Αρχαιολογικό")
        assert "Archaiologiko" in result
|
||||
|
||||
|
||||
class TestDevanagariTransliteration:
    """Tests for Devanagari (Hindi/Nepali) transliteration."""

    def test_rajasthan(self):
        result = transliterate_devanagari("राजस्थान")
        # ISO 15919 uses "aa" for long vowels, so "raaj" not "raj"
        assert "raaj" in result.lower() or "raj" in result.lower()

    def test_basic_consonants(self):
        result = transliterate_devanagari("क")
        assert "k" in result.lower()
|
||||
|
||||
|
||||
class TestThaiTransliteration:
    """Tests for Thai script transliteration (RTGS)."""

    def test_national_archives(self):
        # สำนักหอจดหมายเหตุแห่งชาติ = National Archives of Thailand
        result = transliterate_thai("สำนักหอจดหมายเหตุแห่งชาติ")
        assert "samnak" in result.lower()
        assert "haeng chat" in result.lower()

    def test_national_library(self):
        # สำนักหอสมุดแห่งชาติ = National Library of Thailand
        result = transliterate_thai("สำนักหอสมุดแห่งชาติ")
        assert "ho samut" in result.lower()

    def test_national_museum(self):
        # พิพิธภัณฑสถานแห่งชาติ พระนคร = Bangkok National Museum
        result = transliterate_thai("พิพิธภัณฑสถานแห่งชาติ พระนคร")
        assert "phiphitthaphan" in result.lower()
        assert "phra nakhon" in result.lower()

    def test_siam_society(self):
        # สยามสมาคมในพระบรมราชูปถัมภ์ = Siam Society
        result = transliterate_thai("สยามสมาคมในพระบรมราชูปถัมภ์")
        assert "sayam" in result.lower()
        assert "samakhom" in result.lower()

    def test_wat_temple(self):
        # วัดโพธิ์ราม = Wat Pho Ram
        result = transliterate_thai("วัดโพธิ์ราม")
        assert "wat" in result.lower()
        assert "pho" in result.lower()
        assert "ram" in result.lower()

    def test_empty_without_library(self):
        # Even without pythainlp, should return transliterated result (not empty)
        result = transliterate_thai("กรุงเทพ")
        # Should get 'krung thep' from vocabulary lookup
        assert len(result) > 0
|
||||
|
||||
|
||||
class TestSinhalaTransliteration:
    """Tests for Sinhala script transliteration (ISO 15919)."""

    def test_university_peradeniya(self):
        # පේරාදෙණිය විශ්වවිද් යාලය = University of Peradeniya
        result = transliterate_sinhala("පේරාදෙණිය විශ්වවිද් යාලය")
        assert "peradeniya" in result.lower()
        assert "vishvavid" in result.lower()

    def test_national_museums(self):
        # ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව = Department of National Museums
        result = transliterate_sinhala("ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව")
        assert "jathika" in result.lower()
        assert "kauthukagara" in result.lower()

    def test_basic_consonants(self):
        # Basic consonant test
        result = transliterate_sinhala("ක")  # ka
        assert "k" in result.lower()

    def test_output_not_empty(self):
        # Sinhala should never return empty string
        result = transliterate_sinhala("කොළඹ")  # Colombo
        assert len(result) > 0
|
||||
|
||||
|
||||
class TestKhmerTransliteration:
    """Tests for Khmer script transliteration (UNGEGN)."""

    def test_tuol_sleng(self):
        # សារមន្ទីរទួលស្លែង = Tuol Sleng Genocide Museum
        result = transliterate_khmer("សារមន្ទីរទួលស្លែង")
        assert "tuol sleng" in result.lower()

    def test_phnom_penh(self):
        # ភ្នំពេញ = Phnom Penh
        result = transliterate_khmer("ភ្នំពេញ")
        assert "phnom penh" in result.lower()

    def test_angkor(self):
        # អង្គរ = Angkor
        result = transliterate_khmer("អង្គរ")
        assert "angkor" in result.lower()

    def test_output_not_empty(self):
        # Khmer should never return empty string
        result = transliterate_khmer("សារមន្ទីរ")
        assert len(result) > 0
|
||||
|
||||
|
||||
class TestTransliterateForAbbreviation:
    """Tests for the main abbreviation function."""

    def test_russian_cleanup(self):
        result = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru")
        # Should be clean Latin text (ASCII, or at most spaces/hyphens/apostrophes).
        assert result.isascii() or all(c.isalnum() or c in " -'" for c in result)

    def test_chinese_cleanup(self):
        result = transliterate_for_abbreviation("东巴文化博物院", "zh")
        # Should be clean Latin text, or the "[REQUIRES ...]" review marker
        # when automatic transliteration is not possible.
        assert result.isascii() or "[REQUIRES" in result

    def test_korean_cleanup(self):
        result = transliterate_for_abbreviation("국립중앙박물관", "ko")
        assert result.isascii()

    def test_special_characters_removed(self):
        # Special characters should be removed for abbreviation
        result = transliterate_for_abbreviation("Test (Museum) & Gallery", "en")
        assert "&" not in result
        assert "(" not in result
|
||||
|
||||
|
||||
class TestIntegration:
    """Integration tests using the main transliterate function."""

    def test_auto_detect_russian(self):
        # No language argument: script must be auto-detected.
        result = transliterate("Музей")
        assert result.isascii()

    def test_auto_detect_korean(self):
        result = transliterate("박물관")
        assert result.isascii()

    def test_latin_passthrough(self):
        # Latin input must come back completely unchanged.
        result = transliterate("Rijksmuseum Amsterdam")
        assert result == "Rijksmuseum Amsterdam"

    def test_with_explicit_language(self):
        result = transliterate("故宮博物院", lang="zh")
        assert len(result) > 0
        # Should not be original Chinese (unless flagged for manual review).
        assert "故" not in result or "[REQUIRES" in result
|
||||
|
||||
|
||||
# Allow running this test module directly: python tests/test_transliteration.py
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|
||||
Loading…
Reference in a new issue