feat(ppid): add unidecode support for non-Latin script transliteration
Add optional unidecode dependency to handle Hebrew, Arabic, Chinese, and other non-Latin scripts when generating Person Persistent IDs.
This commit is contained in:
parent
932ec5438c
commit
abe30cb302
1 changed file with 31 additions and 3 deletions
|
|
@ -26,6 +26,14 @@ from urllib.parse import unquote
|
|||
from typing import Optional
|
||||
import shutil
|
||||
|
||||
# Optional dependency: unidecode transliterates non-Latin scripts to ASCII.
# When it is missing, HAS_UNIDECODE is False and _unidecode is None, and a
# warning is printed once at import time.
try:
    from unidecode import unidecode as _unidecode
except ImportError:
    _unidecode = None
    HAS_UNIDECODE = False
    print("WARNING: unidecode not installed. Non-Latin names may not be transliterated correctly.")
else:
    HAS_UNIDECODE = True
|
||||
|
||||
|
||||
# Dutch tussenvoegsels (particles) to skip in last name token
|
||||
DUTCH_PARTICLES = {
|
||||
|
|
@ -54,9 +62,29 @@ INTERNATIONAL_PARTICLES = {
|
|||
|
||||
|
||||
def normalize_name(name: str) -> str:
    """Normalize a name to its ASCII equivalent.

    Names are NFD-decomposed and stripped of combining marks (Unicode
    category ``Mn``). If the name contains any letter above U+024F (the end
    of Latin Extended-B) and unidecode is available, the whole name is
    transliterated with unidecode instead.

    Args:
        name: The name to normalize; may be empty.

    Returns:
        The normalized name, or ``""`` for empty input. When unidecode is
        not installed, non-Latin letters are left untouched by the NFD
        fallback.
    """
    if not name:
        return ""

    # A letter (category L*) beyond Latin Extended-B marks the name as
    # containing non-Latin script (Hebrew, Arabic, Chinese, etc.).
    has_non_latin = any(
        ord(c) > 0x024F and unicodedata.category(c).startswith('L')
        for c in name
    )

    if has_non_latin and HAS_UNIDECODE:
        # unidecode is imported at module level under the alias
        # ``_unidecode``; the bare name ``unidecode`` is never bound, so
        # calling it would raise NameError.
        return _unidecode(name)

    # Decompose accented characters, then drop the combining marks.
    normalized = unicodedata.normalize('NFD', name)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue