feat(ppid): add unidecode support for non-Latin script transliteration

Add optional unidecode dependency to handle Hebrew, Arabic, Chinese, and other non-Latin scripts when generating Person Persistent IDs.
2026-01-09 18:28:41 +01:00 · 2026-01-09 18:28:41 +01:00 · abe30cb302
commit abe30cb302
parent 932ec5438c
1 changed files with 31 additions and 3 deletions
--- a/scripts/generate_ppids.py
+++ b/scripts/generate_ppids.py
@ -26,6 +26,14 @@ from urllib.parse import unquote
 from typing import Optional
 import shutil
 try:
    from unidecode import unidecode as _unidecode
    HAS_UNIDECODE = True
 except ImportError:
    HAS_UNIDECODE = False
    _unidecode = None
    print("WARNING: unidecode not installed. Non-Latin names may not be transliterated correctly.")
 # Dutch tussenvoegsels (particles) to skip in last name token
 DUTCH_PARTICLES = {
@ -54,9 +62,29 @@ INTERNATIONAL_PARTICLES = {
 def normalize_name(name: str) -> str:
     """Normalize a name to its ASCII equivalent.

     Names containing letters above U+024F (e.g. Hebrew, Arabic, Chinese)
     are transliterated with ``unidecode`` when that optional dependency
     was imported successfully. All other names -- and every name when
     ``unidecode`` is unavailable -- are NFD-decomposed and stripped of
     combining marks; this removes diacritics from Latin letters but does
     not transliterate non-Latin letters.

     Args:
         name: The name to normalize; may be empty or ``None``.

     Returns:
         The normalized name, or ``""`` when ``name`` is falsy.
     """
     if not name:
         return ""

     # Only letters (Unicode category L*) above U+024F count as non-Latin,
     # so punctuation or symbols alone never trigger transliteration.
     has_non_latin = any(
         ord(c) > 0x024F and unicodedata.category(c).startswith('L')
         for c in name
     )

     if has_non_latin and HAS_UNIDECODE:
         # The module imports the function under the alias ``_unidecode``;
         # the bare name ``unidecode`` is not defined and would raise
         # NameError on exactly the inputs this branch exists for.
         return _unidecode(name)

     # Decompose accented characters, then drop the combining marks (Mn).
     decomposed = unicodedata.normalize('NFD', name)
     return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')