fix(ppid): fix unidecode import reference typo

2026-01-09 18:29:36 +01:00 · 2026-01-09 18:29:36 +01:00 · 04791a7a91
commit 04791a7a91
parent c45367c60f
1 changed file with 47 additions and 8 deletions
--- a/scripts/generate_ppids.py
+++ b/scripts/generate_ppids.py
@ -77,9 +77,9 @@ def normalize_name(name: str) -> str:
        for c in name
    )
-    if has_non_latin and HAS_UNIDECODE:
+    if has_non_latin and HAS_UNIDECODE and _unidecode is not None:
        # Use unidecode for Hebrew, Arabic, Chinese, etc.
-        ascii_name = unidecode(name)
+        ascii_name = _unidecode(name)
    else:
        # Use NFD decomposition for Latin scripts with diacritics
        normalized = unicodedata.normalize('NFD', name)
@ -217,9 +217,46 @@ def load_person_entity(filepath: Path) -> Optional[dict]:
        return None
 def get_person_name_both(data: dict) -> tuple[str, str]:
    """Extract a person's name from entity data in romanized and original form.

    The original name is the first non-blank ``name`` found in
    ``profile_data``, ``source_staff_info`` and ``fallback_data`` (in that
    order). The display name is ``profile_data.name_romanized`` when it holds
    non-blank text, otherwise the original name.

    Sections that are missing or are not dicts (e.g. ``null`` in the source
    data) and name values that are not strings are treated as absent
    instead of raising ``AttributeError``.

    Args:
        data: Person entity data.

    Returns:
        tuple: (display_name, original_name)
        - display_name: romanized/ASCII name for PPID
        - original_name: original name (may be non-Latin script)
        Both are empty strings when no name is found.
    """
    def _text(section: str, key: str) -> str:
        # Stripped string stored at data[section][key], or '' when unusable.
        block = data.get(section)
        value = block.get(key) if isinstance(block, dict) else None
        return value.strip() if isinstance(value, str) else ''

    # Get original name; blank candidates fall through to the next source.
    original_name = (
        _text('profile_data', 'name') or
        _text('source_staff_info', 'name') or
        _text('fallback_data', 'name')
    )

    # Strip before testing so a whitespace-only romanization falls back to
    # the original name instead of producing an empty display name.
    name_romanized = _text('profile_data', 'name_romanized')
    return (name_romanized or original_name), original_name
 def get_person_name(data: dict) -> str:
-    """Extract person name from entity data."""
+    """Extract person name from entity data.
-    # Try multiple locations
+    
    Priority:
    1. name_romanized (already transliterated)
    2. name from profile_data
    3. name from source_staff_info
    4. name from fallback_data
    """
    # First try romanized name (for Hebrew, Arabic, etc.)
    name_romanized = data.get('profile_data', {}).get('name_romanized')
    if name_romanized:
        return name_romanized.strip()
    # Try regular name fields
    name = (
        data.get('profile_data', {}).get('name') or
        data.get('source_staff_info', {}).get('name') or
@ -255,7 +292,7 @@ def get_current_location(data: dict) -> Optional[str]:
 def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
    """Create a new PPID entity structure from source data."""
-    name = get_person_name(data)
+    display_name, original_name = get_person_name_both(data)
    entity = {
        "ppid": ppid,
@ -266,11 +303,13 @@ def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX", 
            "last_date": "XXXX",
-            "name_tokens": extract_name_tokens(name)
+            "name_tokens": extract_name_tokens(display_name)
        },
        "name": {
-            "full_name": name,
+            "full_name": original_name,
-            "name_tokens": extract_name_tokens(name),
+            "display_name": display_name,
            "name_romanized": display_name if display_name != original_name else None,
            "name_tokens": extract_name_tokens(display_name),
            "source": "linkedin_profile"
        },
        "birth_date": {