fix(ppid): fix unidecode import reference typo

2026-01-09 18:29:36 +01:00 · 2026-01-09 18:29:36 +01:00 · 04791a7a91
commit 04791a7a91
parent c45367c60f
1 changed file with 47 additions and 8 deletions
--- a/scripts/generate_ppids.py
+++ b/scripts/generate_ppids.py
@@ -77,9 +77,9 @@ def normalize_name(name: str) -> str:
        for c in name
    )
    
-    if has_non_latin and HAS_UNIDECODE:
+    if has_non_latin and HAS_UNIDECODE and _unidecode is not None:
        # Use unidecode for Hebrew, Arabic, Chinese, etc.
-        ascii_name = unidecode(name)
+        ascii_name = _unidecode(name)
    else:
        # Use NFD decomposition for Latin scripts with diacritics
        normalized = unicodedata.normalize('NFD', name)
@@ -217,9 +217,46 @@ def load_person_entity(filepath: Path) -> Optional[dict]:
        return None


+def get_person_name_both(data: dict) -> tuple[str, str]:
+    """Extract person name from entity data, returning both original and romanized.
+    
+    Returns:
+        tuple: (display_name, original_name)
+        - display_name: romanized/ASCII name for PPID
+        - original_name: original name (may be non-Latin script)
+    """
+    # Get original name
+    original_name = (
+        data.get('profile_data', {}).get('name') or
+        data.get('source_staff_info', {}).get('name') or
+        data.get('fallback_data', {}).get('name') or
+        ''
+    ).strip()
+    
+    # Get romanized name if available
+    name_romanized = data.get('profile_data', {}).get('name_romanized')
+    if name_romanized:
+        return name_romanized.strip(), original_name
+    
+    # Return original name for both if no romanization
+    return original_name, original_name
+
+
 def get_person_name(data: dict) -> str:
-    """Extract person name from entity data."""
-    # Try multiple locations
+    """Extract person name from entity data.
+    
+    Priority:
+    1. name_romanized (already transliterated)
+    2. name from profile_data
+    3. name from source_staff_info
+    4. name from fallback_data
+    """
+    # First try romanized name (for Hebrew, Arabic, etc.)
+    name_romanized = data.get('profile_data', {}).get('name_romanized')
+    if name_romanized:
+        return name_romanized.strip()
+    
+    # Try regular name fields
    name = (
        data.get('profile_data', {}).get('name') or
        data.get('source_staff_info', {}).get('name') or
@@ -255,7 +292,7 @@ def get_current_location(data: dict) -> Optional[str]:

 def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
    """Create a new PPID entity structure from source data."""
-    name = get_person_name(data)
+    display_name, original_name = get_person_name_both(data)
    
    entity = {
        "ppid": ppid,
@@ -266,11 +303,13 @@ def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
            "first_date": "XXXX",
            "last_location": "XX-XX-XXX", 
            "last_date": "XXXX",
-            "name_tokens": extract_name_tokens(name)
+            "name_tokens": extract_name_tokens(display_name)
        },
        "name": {
-            "full_name": name,
-            "name_tokens": extract_name_tokens(name),
+            "full_name": original_name,
+            "display_name": display_name,
+            "name_romanized": display_name if display_name != original_name else None,
+            "name_tokens": extract_name_tokens(display_name),
            "source": "linkedin_profile"
        },
        "birth_date": {