From abe30cb30226a780cc546fc6366675c0fb459c37 Mon Sep 17 00:00:00 2001
From: kempersc <sckemper@mailfence.com>
Date: Fri, 9 Jan 2026 18:28:41 +0100
Subject: [PATCH] feat(ppid): add unidecode support for non-Latin script
 transliteration

Add optional unidecode dependency to handle Hebrew, Arabic, Chinese,
and other non-Latin scripts when generating Person Persistent IDs.
---
 scripts/generate_ppids.py | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/scripts/generate_ppids.py b/scripts/generate_ppids.py
index 283a1a3658..628f3be648 100644
--- a/scripts/generate_ppids.py
+++ b/scripts/generate_ppids.py
@@ -26,6 +26,14 @@ from urllib.parse import unquote
 from typing import Optional
 import shutil
 
+try:
+    from unidecode import unidecode as _unidecode
+    HAS_UNIDECODE = True
+except ImportError:
+    HAS_UNIDECODE = False
+    _unidecode = None
+    print("WARNING: unidecode not installed. Non-Latin names may not be transliterated correctly.")
+
 
 # Dutch tussenvoegsels (particles) to skip in last name token
 DUTCH_PARTICLES = {
@@ -54,9 +62,29 @@ INTERNATIONAL_PARTICLES = {
 
 
 def normalize_name(name: str) -> str:
-    """Normalize diacritics to ASCII equivalents."""
-    normalized = unicodedata.normalize('NFD', name)
-    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
+    """Normalize a name towards its ASCII equivalent.
+
+    Names with a letter above U+024F (past Latin Extended-B) go through unidecode
+    when it is installed; otherwise NFD combining marks (category Mn) are stripped.
+    """
+    if not name:
+        return ""
+
+    # A letter (category L*) with a code point above U+024F is treated as
+    # non-Latin; such names need real transliteration, not mark stripping.
+    has_non_latin = any(
+        ord(c) > 0x024F and unicodedata.category(c).startswith('L')
+        for c in name
+    )
+
+    if has_non_latin and HAS_UNIDECODE:
+        # unidecode is imported at module level under the alias _unidecode.
+        ascii_name = _unidecode(name)
+    else:
+        # Latin input, or unidecode is unavailable: strip combining marks only.
+        normalized = unicodedata.normalize('NFD', name)
+        ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
+
     return ascii_name