Patch summary (free text before the first "diff --git" line is skipped by
git apply / patch):

  * normalize_name: transliterate through `_unidecode`, and only when
    `_unidecode is not None` in addition to HAS_UNIDECODE.
  * get_person_name_both (new): returns (display_name, original_name);
    display_name is profile_data.name_romanized when present, otherwise
    the original name is returned for both.
  * get_person_name: checks profile_data.name_romanized before the
    regular name fields.
  * create_ppid_entity: keeps the original name in "full_name", adds
    "display_name" and "name_romanized", and builds name tokens from
    the display name.

NOTE(review): both name helpers test `name_romanized` for truthiness
before calling .strip(), so a whitespace-only value is returned as an
empty display name instead of falling back to the original name.
NOTE(review): the one-record-per-line layout and the 4-space indentation
inside the hunks were reconstructed from the hunk line counts; confirm
against the target file before applying.

diff --git a/scripts/generate_ppids.py b/scripts/generate_ppids.py
index 628f3be648..4173f039fa 100644
--- a/scripts/generate_ppids.py
+++ b/scripts/generate_ppids.py
@@ -77,9 +77,9 @@ def normalize_name(name: str) -> str:
         for c in name
     )
 
-    if has_non_latin and HAS_UNIDECODE:
+    if has_non_latin and HAS_UNIDECODE and _unidecode is not None:
         # Use unidecode for Hebrew, Arabic, Chinese, etc.
-        ascii_name = unidecode(name)
+        ascii_name = _unidecode(name)
     else:
         # Use NFD decomposition for Latin scripts with diacritics
         normalized = unicodedata.normalize('NFD', name)
@@ -217,9 +217,46 @@ def load_person_entity(filepath: Path) -> Optional[dict]:
         return None
 
 
+def get_person_name_both(data: dict) -> tuple[str, str]:
+    """Extract person name from entity data, returning both original and romanized.
+
+    Returns:
+        tuple: (display_name, original_name)
+        - display_name: romanized/ASCII name for PPID
+        - original_name: original name (may be non-Latin script)
+    """
+    # Get original name
+    original_name = (
+        data.get('profile_data', {}).get('name') or
+        data.get('source_staff_info', {}).get('name') or
+        data.get('fallback_data', {}).get('name') or
+        ''
+    ).strip()
+
+    # Get romanized name if available
+    name_romanized = data.get('profile_data', {}).get('name_romanized')
+    if name_romanized:
+        return name_romanized.strip(), original_name
+
+    # Return original name for both if no romanization
+    return original_name, original_name
+
+
 def get_person_name(data: dict) -> str:
-    """Extract person name from entity data."""
-    # Try multiple locations
+    """Extract person name from entity data.
+
+    Priority:
+    1. name_romanized (already transliterated)
+    2. name from profile_data
+    3. name from source_staff_info
+    4. name from fallback_data
+    """
+    # First try romanized name (for Hebrew, Arabic, etc.)
+    name_romanized = data.get('profile_data', {}).get('name_romanized')
+    if name_romanized:
+        return name_romanized.strip()
+
+    # Try regular name fields
     name = (
         data.get('profile_data', {}).get('name') or
         data.get('source_staff_info', {}).get('name') or
@@ -255,7 +292,7 @@ def get_current_location(data: dict) -> Optional[str]:
 
 def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
     """Create a new PPID entity structure from source data."""
-    name = get_person_name(data)
+    display_name, original_name = get_person_name_both(data)
 
     entity = {
         "ppid": ppid,
@@ -266,11 +303,13 @@ def create_ppid_entity(data: dict, ppid: str, source_file: str) -> dict:
             "first_date": "XXXX",
             "last_location": "XX-XX-XXX",
             "last_date": "XXXX",
-            "name_tokens": extract_name_tokens(name)
+            "name_tokens": extract_name_tokens(display_name)
         },
         "name": {
-            "full_name": name,
-            "name_tokens": extract_name_tokens(name),
+            "full_name": original_name,
+            "display_name": display_name,
+            "name_romanized": display_name if display_name != original_name else None,
+            "name_tokens": extract_name_tokens(display_name),
             "source": "linkedin_profile"
         },
         "birth_date": {