feat(ppid): add unidecode support for non-Latin script transliteration

Add optional unidecode dependency to handle Hebrew, Arabic, Chinese,
and other non-Latin scripts when generating Person Persistent IDs.
This commit is contained in:
kempersc 2026-01-09 18:28:41 +01:00
parent 932ec5438c
commit abe30cb302

View file

@ -26,6 +26,14 @@ from urllib.parse import unquote
from typing import Optional
import shutil
# unidecode is an optional third-party dependency. The import is bound to the
# private alias `_unidecode`; callers must use that alias and should check
# HAS_UNIDECODE first, because the alias is None when the package is missing.
try:
    from unidecode import unidecode as _unidecode
    HAS_UNIDECODE = True
except ImportError:
    # Package not installed: record that, leave a None placeholder so the
    # name still exists, and warn once at import time.
    HAS_UNIDECODE = False
    _unidecode = None
    print("WARNING: unidecode not installed. Non-Latin names may not be transliterated correctly.")
# Dutch tussenvoegsels (particles) to skip in last name token
DUTCH_PARTICLES = {
@ -54,9 +62,29 @@ INTERNATIONAL_PARTICLES = {
def normalize_name(name: str) -> str:
    """Normalize a name to its ASCII equivalent where possible.

    Latin-script names are NFD-decomposed and their combining marks
    (Unicode category ``Mn``) are dropped, so accented letters collapse to
    their base letter. Names containing any letter above U+024F (the end of
    Latin Extended-B) are transliterated with ``unidecode`` when that
    optional package is available.

    Args:
        name: The name to normalize. May be empty.

    Returns:
        The normalized name, or ``""`` when ``name`` is empty. When
        ``unidecode`` is unavailable, non-Latin letters are passed through
        unchanged, so the result is not guaranteed to be pure ASCII.
    """
    if not name:
        return ""

    # A letter beyond Latin Extended-B means NFD stripping alone cannot
    # produce ASCII (Hebrew, Arabic, Chinese, ...).
    has_non_latin = any(
        ord(c) > 0x024F and unicodedata.category(c).startswith('L')
        for c in name
    )

    # Test has_non_latin first so the module-level flag is only consulted
    # when transliteration is actually needed.
    if has_non_latin and HAS_UNIDECODE:
        # The module imports the function under the alias `_unidecode`;
        # the bare name `unidecode` is not defined in this file.
        return _unidecode(name)

    # Latin scripts (or no unidecode): decompose and drop combining marks.
    normalized = unicodedata.normalize('NFD', name)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')