feat(ppid): add unidecode support for non-Latin script transliteration

Add optional unidecode dependency to handle Hebrew, Arabic, Chinese, and other non-Latin scripts when generating Person Persistent IDs.
2026-01-09 18:28:41 +01:00 · 2026-01-09 18:28:41 +01:00 · abe30cb302
commit abe30cb302
parent 932ec5438c
1 changed files with 31 additions and 3 deletions
--- a/scripts/generate_ppids.py
+++ b/scripts/generate_ppids.py
@@ -26,6 +26,14 @@ from urllib.parse import unquote
 from typing import Optional
 import shutil

+try:
+    from unidecode import unidecode as _unidecode
+    HAS_UNIDECODE = True
+except ImportError:
+    HAS_UNIDECODE = False
+    _unidecode = None
+    print("WARNING: unidecode not installed. Non-Latin names may not be transliterated correctly.")
+

 # Dutch tussenvoegsels (particles) to skip in last name token
 DUTCH_PARTICLES = {
@@ -54,9 +62,29 @@ INTERNATIONAL_PARTICLES = {


 def normalize_name(name: str) -> str:
-    """Normalize diacritics to ASCII equivalents."""
-    normalized = unicodedata.normalize('NFD', name)
-    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
+    """Normalize name to ASCII equivalents.
+
+    Uses NFD decomposition for Latin scripts with diacritics, and the optional
+    unidecode package for non-Latin scripts (Hebrew, Arabic, Chinese, etc.).
+    """
+    if not name:
+        return ""
+
+    # A letter above U+024F (end of Latin Extended-B) marks the name as
+    # needing transliteration; non-letters (digits, punctuation) are ignored.
+    has_non_latin = any(
+        ord(c) > 0x024F and unicodedata.category(c).startswith('L')
+        for c in name
+    )
+
+    if has_non_latin and HAS_UNIDECODE:
+        # unidecode is imported at module level under the alias _unidecode.
+        ascii_name = _unidecode(name)
+    else:
+        # Strip combining marks (category Mn) exposed by NFD decomposition.
+        normalized = unicodedata.normalize('NFD', name)
+        ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
+
     return ascii_name