fix(ppid): fix unidecode import reference typo

This commit is contained in:
kempersc 2026-01-09 18:29:36 +01:00
parent c45367c60f
commit 04791a7a91

View file

@ -77,9 +77,9 @@ def normalize_name(name: str) -> str:
for c in name for c in name
) )
if has_non_latin and HAS_UNIDECODE: if has_non_latin and HAS_UNIDECODE and _unidecode is not None:
# Use unidecode for Hebrew, Arabic, Chinese, etc. # Use unidecode for Hebrew, Arabic, Chinese, etc.
ascii_name = unidecode(name) ascii_name = _unidecode(name)
else: else:
# Use NFD decomposition for Latin scripts with diacritics # Use NFD decomposition for Latin scripts with diacritics
normalized = unicodedata.normalize('NFD', name) normalized = unicodedata.normalize('NFD', name)
@ -217,9 +217,46 @@ def load_person_entity(filepath: Path) -> Optional[dict]:
return None return None
def get_person_name_both(data: dict) -> tuple[str, str]:
    """Extract a person's name in both romanized and original form.

    The original name is the first non-blank ``name`` found in
    ``profile_data``, ``source_staff_info`` and ``fallback_data``, checked in
    that order. The display name is ``profile_data['name_romanized']`` when
    that value is a non-blank string; otherwise the original name is used
    for both.

    Args:
        data: Person entity dict. Any of the sections may be missing or None.

    Returns:
        tuple: (display_name, original_name)
            - display_name: romanized/ASCII name for PPID
            - original_name: original name (may be non-Latin script);
              empty string when no name is found
    """
    def _section(key: str) -> dict:
        # A section can be absent or explicitly None; treat both as empty so
        # the chained lookups below never call .get() on a non-dict.
        section = data.get(key)
        return section if isinstance(section, dict) else {}

    def _clean(value) -> str:
        # Only strings carry a usable name; anything else counts as missing.
        return value.strip() if isinstance(value, str) else ''

    profile = _section('profile_data')

    # Strip before testing so a whitespace-only value falls through to the
    # next candidate instead of being selected and then reduced to ''.
    original_name = (
        _clean(profile.get('name')) or
        _clean(_section('source_staff_info').get('name')) or
        _clean(_section('fallback_data').get('name'))
    )

    # A blank romanization must not override a real original name.
    name_romanized = _clean(profile.get('name_romanized'))
    if name_romanized:
        return name_romanized, original_name

    # Return original name for both if no romanization
    return original_name, original_name
def get_person_name(data: dict) -> str: def get_person_name(data: dict) -> str:
"""Extract person name from entity data.""" """Extract person name from entity data.
# Try multiple locations
Priority:
1. name_romanized (already transliterated)
2. name from profile_data
3. name from source_staff_info
4. name from fallback_data
"""
# First try romanized name (for Hebrew, Arabic, etc.)
name_romanized = data.get('profile_data', {}).get('name_romanized')
if name_romanized:
return name_romanized.strip()
# Try regular name fields
name = ( name = (
data.get('profile_data', {}).get('name') or data.get('profile_data', {}).get('name') or
data.get('source_staff_info', {}).get('name') or data.get('source_staff_info', {}).get('name') or
@ -255,7 +292,7 @@ def get_current_location(data: dict) -> Optional[str]:
def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict: def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
"""Create a new PPID entity structure from source data.""" """Create a new PPID entity structure from source data."""
name = get_person_name(data) display_name, original_name = get_person_name_both(data)
entity = { entity = {
"ppid": ppid, "ppid": ppid,
@ -266,11 +303,13 @@ def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
"first_date": "XXXX", "first_date": "XXXX",
"last_location": "XX-XX-XXX", "last_location": "XX-XX-XXX",
"last_date": "XXXX", "last_date": "XXXX",
"name_tokens": extract_name_tokens(name) "name_tokens": extract_name_tokens(display_name)
}, },
"name": { "name": {
"full_name": name, "full_name": original_name,
"name_tokens": extract_name_tokens(name), "display_name": display_name,
"name_romanized": display_name if display_name != original_name else None,
"name_tokens": extract_name_tokens(display_name),
"source": "linkedin_profile" "source": "linkedin_profile"
}, },
"birth_date": { "birth_date": {