feat(ppid): add unidecode support for non-Latin script transliteration
Add optional unidecode dependency to handle Hebrew, Arabic, Chinese, and other non-Latin scripts when generating Person Persistent IDs.
This commit is contained in:
parent
932ec5438c
commit
abe30cb302
1 changed files with 31 additions and 3 deletions
|
|
@ -26,6 +26,14 @@ from urllib.parse import unquote
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
|
try:
|
||||||
|
from unidecode import unidecode as _unidecode
|
||||||
|
HAS_UNIDECODE = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_UNIDECODE = False
|
||||||
|
_unidecode = None
|
||||||
|
print("WARNING: unidecode not installed. Non-Latin names may not be transliterated correctly.")
|
||||||
|
|
||||||
|
|
||||||
# Dutch tussenvoegsels (particles) to skip in last name token
|
# Dutch tussenvoegsels (particles) to skip in last name token
|
||||||
DUTCH_PARTICLES = {
|
DUTCH_PARTICLES = {
|
||||||
|
|
@ -54,9 +62,29 @@ INTERNATIONAL_PARTICLES = {
|
||||||
|
|
||||||
|
|
||||||
def normalize_name(name: str) -> str:
    """Normalize a name to its ASCII equivalent.

    Names containing letters above U+024F (e.g. Hebrew, Arabic, Chinese)
    are transliterated with ``unidecode`` when that optional dependency is
    installed. All other names -- and non-Latin names when ``unidecode`` is
    unavailable -- are NFD-decomposed and stripped of combining marks,
    which removes diacritics from Latin letters but leaves letters of
    other scripts untouched.

    Args:
        name: The name to normalize; may be empty.

    Returns:
        The normalized name, or ``""`` when ``name`` is empty.
    """
    if not name:
        return ""

    # U+024F is the last code point of Latin Extended-B. Only letters
    # (category L*) are considered, so punctuation, digits and combining
    # marks above that range do not trigger transliteration on their own.
    has_non_latin = any(
        ord(c) > 0x024F and unicodedata.category(c).startswith('L')
        for c in name
    )

    if has_non_latin and HAS_UNIDECODE:
        # The module's import guard binds the transliterator as
        # ``_unidecode``; the bare name ``unidecode`` is never defined and
        # calling it would raise NameError.
        return _unidecode(name)

    # Decompose accented letters into base letter + combining mark, then
    # drop the marks (category Mn).
    normalized = unicodedata.normalize('NFD', name)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue