From abe30cb30226a780cc546fc6366675c0fb459c37 Mon Sep 17 00:00:00 2001
From: kempersc
Date: Fri, 9 Jan 2026 18:28:41 +0100
Subject: [PATCH] feat(ppid): add unidecode support for non-Latin script transliteration

Add optional unidecode dependency to handle Hebrew, Arabic, Chinese,
and other non-Latin scripts when generating Person Persistent IDs.
---
 scripts/generate_ppids.py | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/scripts/generate_ppids.py b/scripts/generate_ppids.py
index 283a1a3658..628f3be648 100644
--- a/scripts/generate_ppids.py
+++ b/scripts/generate_ppids.py
@@ -26,6 +26,14 @@ from urllib.parse import unquote
 from typing import Optional
 import shutil
 
+try:
+    from unidecode import unidecode as _unidecode
+    HAS_UNIDECODE = True
+except ImportError:
+    HAS_UNIDECODE = False
+    _unidecode = None
+    print("WARNING: unidecode not installed. Non-Latin names may not be transliterated correctly.")
+
 
 # Dutch tussenvoegsels (particles) to skip in last name token
 DUTCH_PARTICLES = {
@@ -54,9 +62,29 @@ INTERNATIONAL_PARTICLES = {
 
 
 def normalize_name(name: str) -> str:
-    """Normalize diacritics to ASCII equivalents."""
-    normalized = unicodedata.normalize('NFD', name)
-    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
+    """Normalize name to ASCII equivalents.
+
+    Uses NFD decomposition for Latin scripts with diacritics,
+    and unidecode for non-Latin scripts (Hebrew, Arabic, Chinese, etc.)
+    """
+    if not name:
+        return ""
+
+    # Check if name contains non-Latin characters
+    # If any character is not in Latin extended range, use unidecode
+    has_non_latin = any(
+        ord(c) > 0x024F and unicodedata.category(c).startswith('L')
+        for c in name
+    )
+
+    if has_non_latin and HAS_UNIDECODE:
+        # Use unidecode for Hebrew, Arabic, Chinese, etc.
+        ascii_name = _unidecode(name)
+    else:
+        # Use NFD decomposition for Latin scripts with diacritics
+        normalized = unicodedata.normalize('NFD', name)
+        ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
+
     return ascii_name
 
 