#!/usr/bin/env python3
"""
Identify Person Profiles by Country/Origin for Targeted Verification.

This script identifies profiles by:
1. PPID country code (direct detection)
2. Name origin patterns (heuristic detection)

Useful for planning surname database expansion and targeted entity resolution.

Usage:
    python scripts/identify_profiles_by_origin.py --summary
    python scripts/identify_profiles_by_origin.py --country ID
    python scripts/identify_profiles_by_origin.py --detect-arabic
    python scripts/identify_profiles_by_origin.py --detect-indonesian
"""

import argparse
import json
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

# =============================================================================
# NAME ORIGIN PATTERNS
# =============================================================================

# Indonesian name patterns (common surnames and name elements)
INDONESIAN_PATTERNS = {
    # Common Indonesian surnames
    'surnames': {
        'sitompul', 'simatupang', 'siagian', 'siregar', 'simanjuntak',
        'sinaga', 'simbolon', 'saragih', 'hutapea', 'hutabarat',
        'panjaitan', 'pardede', 'purba', 'pasaribu', 'manullang',
        'lumbantobing', 'nainggolan', 'napitupulu', 'hasibuan', 'harahap',
        'lubis', 'nasution', 'daulay', 'ritonga',
        'wijaya', 'susanto', 'setiawan', 'pratama', 'putra', 'putri',
        'sari', 'wati', 'dewi', 'lestari', 'rahayu', 'handoko',
        'santoso', 'hidayat', 'nugroho', 'kurniawan', 'saputra',
        'hermawan', 'gunawan', 'budiman', 'hartono', 'suryono',
        'supriyadi', 'sugiarto', 'wibowo', 'widodo', 'prasetyo',
        'prabowo', 'yulianto', 'cahyono', 'firmansyah', 'syahputra',
        'ramadhan', 'hidayanto', 'kusuma', 'wahyudi', 'setiabudi',
        'sulistyo',
    },
    # Common Indonesian first name patterns (refined to reduce false positives)
    # NOTE: These patterns alone are weak signals - best combined with surname matches
    'first_name_patterns': [
        r'^(dwi|eka|catur)\b',  # Number prefixes (removed tri- to avoid Tristan)
        r'(wati|dewi|sari|putri|lestari)$',  # Female suffixes (distinctive)
        r'(wanto|yanto|ianto)$',  # Male -anto suffix (requires at least 5 chars before)
        r'^(sri|siti)\s',  # Common female prefixes (with space, removed nur which is Arabic)
        r'^(agus|budi|eko|joko|bambang|dedi|heri|iwan|wawan)\b',  # Common male names
        r'^(rina|rini|ratna|yuni|yuli|wulan|sinta)\b',  # Common female names (removed rita, ani, eni)
    ],
}

# Arabic name patterns
ARABIC_PATTERNS = {
    # Common Arabic surnames and family names
    'surnames': {
        # Egyptian surnames
        'mohamed', 'mohammed', 'muhammad', 'ahmed', 'ahmad', 'mahmoud',
        'mahmud', 'hassan', 'hussein', 'hosni', 'hosny', 'mustafa',
        'mostafa', 'omar', 'osman', 'ibrahim', 'ismail', 'youssef',
        'yusuf', 'ali', 'aly', 'saleh', 'salih', 'khalil', 'nasser',
        'naser', 'abdel', 'abdul', 'abou', 'abu',
        # Levantine surnames (Lebanon, Syria, Jordan, Palestine)
        # NOTE: Removed 'hanna' as it's also common German/Scandinavian
        # NOTE: 'nasser' already listed above (duplicate removed)
        'khoury', 'khouri', 'haddad', 'sayegh', 'saad', 'salim', 'salem',
        'mansour', 'nassar', 'issa', 'boutros', 'daher', 'habib',
        'hariri', 'gemayel', 'jumblatt', 'berri', 'aoun', 'frangieh',
        # Gulf surnames
        'al-saud', 'alsaud', 'al-thani', 'althani', 'al-maktoum',
        'almaktoum', 'al-nahyan', 'alnahyan', 'al-sabah', 'alsabah',
        'al-khalifa', 'alkhalifa',
        # Maghreb surnames (Morocco, Algeria, Tunisia)
        'benali', 'bensalem', 'benyahia', 'bouazizi', 'bouteflika',
        'zidane', 'benzema', 'belhaj', 'belkaid', 'benamor',
        'benabdallah', 'benjelloun',
        # Common prefixes that indicate Arabic names
        'el-', 'al-', 'bin', 'ibn', 'bint', 'abu', 'abou', 'um', 'umm',
    },
    # Arabic first name patterns (with word boundaries to avoid false matches)
    'first_name_patterns': [
        r'^(abdul|abdel|abd[\s\-])',  # Abdul-X compound names
        r'^(abu|abou|um|umm)[\s\-]',  # Abu X, Umm X
        r'^(muhammad|mohammed|mohamed)\b',  # Most common names (not Muhammadi)
        r'^(ahmad|ahmed)\b',  # Ahmad (not Ahmad as part of longer name)
        r'\b(ali|omar|umar|othman|osman|hussein|hussain|hassan|hasan)\b',  # Word boundary both sides
        r'^(ibrahim|ismail|yusuf|youssef|khalid|khaled|abdullah|abdallah)\b',
        r'^(fatima|fatimah|khadija|khadijah|aisha|aysha|maryam|mariam)\b',
        r'^(nour|noor|layla|leila)\b',  # Shortened list to avoid common Western names
        r'^(karim|kareem|rahim|raheem|rashid|rasheed|hamid|hameed)\b',
        r'^(jamal|gamal|walid|waleed|tariq|tarek|samir|samer)\b',
    ],
    # Al- prefix surnames (refined - must be followed by Arabic-style name, not Germanic names)
    # Excludes: Albers, Albert, Allen, Allison, Almquist, Althaus, etc.
    'al_prefix_pattern': r'\bal[\-]([a-z]{3,})',  # Requires hyphen: al-mutairi, al-saud
}


def extract_country_from_ppid(filename: str) -> str:
    """Extract the 2-letter country code from a PPID filename.

    Expected shape: ``ID_<cc>-<location>_<rest>.json`` where ``<cc>`` is a
    two-letter country code. Returns 'XX' for anything that doesn't match.
    """
    if not filename.startswith('ID_'):
        return 'XX'
    ppid = filename[3:].replace('.json', '')
    # str.split always yields at least one element, so the original
    # "len >= 1" guards were dead code; index [0] directly.
    country = ppid.split('_')[0].split('-')[0].upper()
    if len(country) == 2 and country.isalpha():
        return country
    return 'XX'


def extract_person_name(profile: dict) -> str:
    """Extract the person's display name from a profile dict.

    Checks nested ``profile_data`` first, then the top level; prefers
    ``full_name`` over ``name``. Returns "Unknown" when nothing is set.
    """
    if 'profile_data' in profile:
        pd = profile['profile_data']
        if pd.get('full_name'):
            return pd['full_name']
        if pd.get('name'):
            return pd['name']
    if profile.get('full_name'):
        return profile['full_name']
    if profile.get('name'):
        return profile['name']
    return "Unknown"


def detect_indonesian_origin(name: str) -> Tuple[bool, List[str]]:
    """
    Detect if a name appears to be of Indonesian origin.

    Returns:
        (is_likely_indonesian, list of matching indicators)
    """
    name_lower = name.lower().strip()
    parts = name_lower.split()
    indicators = []

    # Check surnames
    for part in parts:
        if part in INDONESIAN_PATTERNS['surnames']:
            indicators.append(f"surname:{part}")

    # Check first name patterns
    for pattern in INDONESIAN_PATTERNS['first_name_patterns']:
        if re.search(pattern, name_lower):
            indicators.append(f"pattern:{pattern}")
            break  # Only add one pattern match

    # Check for Batak names (double surnames like Lumbantobing)
    if re.search(r'lumban|huta|pan[dj]aitan|sima[nt]upang', name_lower):
        indicators.append("batak_name")

    is_likely = len(indicators) >= 1
    return is_likely, indicators


def detect_arabic_origin(name: str) -> Tuple[bool, List[str]]:
    """
    Detect if a name appears to be of Arabic origin.

    Returns:
        (is_likely_arabic, list of matching indicators)
    """
    name_lower = name.lower().strip()
    parts = name_lower.split()
    indicators = []

    # Check surnames
    for part in parts:
        # Remove common prefixes for matching
        clean_part = re.sub(r'^(el|al)[\-\s]?', '', part)
        if part in ARABIC_PATTERNS['surnames'] or clean_part in ARABIC_PATTERNS['surnames']:
            indicators.append(f"surname:{part}")

    # Check for Al-X pattern
    if re.search(ARABIC_PATTERNS['al_prefix_pattern'], name_lower):
        indicators.append("al_prefix")

    # Check first name patterns
    for pattern in ARABIC_PATTERNS['first_name_patterns']:
        if re.search(pattern, name_lower):
            indicators.append(f"pattern:{pattern}")
            break

    # Check for compound names (Abdul Rahman, Abu Bakr, Ibn Rushd, Bin Laden)
    # Requires word boundary before to avoid matching "Robin" → "bin"
    if re.search(r'\b(abdul?|abu|ibn)\s+[a-z]+', name_lower):
        indicators.append("compound_name")

    # "bin" and "bint" need special handling - must be followed by Arabic-style name
    if re.search(r'\bbin\s+(abdul|ahmad|ali|mohammed|muhammad|hassan|hussein|khalid|omar|saleh|yusuf|ibrahim)', name_lower):
        indicators.append("bin_compound")

    is_likely = len(indicators) >= 1
    return is_likely, indicators


def analyze_profiles(data_dir: Path, limit: Optional[int] = None) -> Dict:
    """Analyze all profiles and return statistics.

    Scans ``ID_*.json`` files under *data_dir*, grouping by PPID country
    code and flagging names that look Indonesian or Arabic. Files that
    fail to load/parse are counted in ``errors`` and skipped (best-effort
    batch scan — one bad file must not abort the run).
    """
    profile_files = list(data_dir.glob('ID_*.json'))
    if limit:
        profile_files = profile_files[:limit]

    results = {
        'total': len(profile_files),
        'by_country': defaultdict(list),
        'indonesian_origin': [],
        'arabic_origin': [],
        'errors': 0,
    }

    for file_path in profile_files:
        try:
            country = extract_country_from_ppid(file_path.name)
            with open(file_path, 'r', encoding='utf-8') as f:
                profile = json.load(f)
            name = extract_person_name(profile)

            profile_info = {
                'file': file_path.name,
                'name': name,
                'country': country,
            }
            results['by_country'][country].append(profile_info)

            # Detect Indonesian origin
            is_indonesian, indonesian_indicators = detect_indonesian_origin(name)
            if is_indonesian:
                profile_info['indonesian_indicators'] = indonesian_indicators
                results['indonesian_origin'].append(profile_info)

            # Detect Arabic origin
            is_arabic, arabic_indicators = detect_arabic_origin(name)
            if is_arabic:
                profile_info['arabic_indicators'] = arabic_indicators
                results['arabic_origin'].append(profile_info)
        except Exception:
            # Deliberate broad catch: malformed JSON / unreadable file just
            # bumps the error counter so the scan can continue.
            results['errors'] += 1

    return results


def print_summary(results: Dict):
    """Print summary of profile analysis."""
    print("=" * 70)
    print("PROFILE ORIGIN ANALYSIS SUMMARY")
    print("=" * 70)
    print(f"\nTotal profiles analyzed: {results['total']:,}")
    print(f"Errors: {results['errors']}")

    print("\n" + "-" * 70)
    print("PROFILES BY PPID COUNTRY CODE")
    print("-" * 70)
    countries = sorted(results['by_country'].items(), key=lambda x: -len(x[1]))
    for country, profiles in countries[:20]:
        print(f" {country}: {len(profiles):,} profiles")
    if len(countries) > 20:
        print(f" ... and {len(countries) - 20} more countries")

    print("\n" + "-" * 70)
    print(f"INDONESIAN ORIGIN DETECTED: {len(results['indonesian_origin'])} profiles")
    print("-" * 70)
    for p in results['indonesian_origin'][:15]:
        indicators = ', '.join(p.get('indonesian_indicators', []))
        print(f" {p['name'][:40]:40} | {p['country']} | {indicators}")
    if len(results['indonesian_origin']) > 15:
        print(f" ... and {len(results['indonesian_origin']) - 15} more")

    print("\n" + "-" * 70)
    print(f"ARABIC ORIGIN DETECTED: {len(results['arabic_origin'])} profiles")
    print("-" * 70)
    for p in results['arabic_origin'][:15]:
        indicators = ', '.join(p.get('arabic_indicators', []))
        print(f" {p['name'][:40]:40} | {p['country']} | {indicators}")
    if len(results['arabic_origin']) > 15:
        print(f" ... and {len(results['arabic_origin']) - 15} more")


def print_country_profiles(results: Dict, country: str):
    """Print detailed list of profiles for a specific country."""
    profiles = results['by_country'].get(country, [])

    print(f"\n{'=' * 70}")
    print(f"PROFILES FOR COUNTRY: {country}")
    print(f"{'=' * 70}")
    print(f"Total: {len(profiles)} profiles\n")

    for p in profiles:
        # Check origins
        is_indo, indo_ind = detect_indonesian_origin(p['name'])
        is_arab, arab_ind = detect_arabic_origin(p['name'])

        origin_flags = []
        if is_indo:
            origin_flags.append("🇮🇩")
        if is_arab:
            origin_flags.append("🇸🇦")

        flags = ' '.join(origin_flags) if origin_flags else ''
        print(f" {p['name'][:45]:45} | {p['file'][:40]} {flags}")


def print_origin_profiles(results: Dict, origin_type: str):
    """Print detailed list of profiles for a specific origin type."""
    if origin_type == 'indonesian':
        profiles = results['indonesian_origin']
        title = "INDONESIAN ORIGIN"
    elif origin_type == 'arabic':
        profiles = results['arabic_origin']
        title = "ARABIC ORIGIN"
    else:
        print(f"Unknown origin type: {origin_type}")
        return

    print(f"\n{'=' * 70}")
    print(f"{title} PROFILES")
    print(f"{'=' * 70}")
    print(f"Total: {len(profiles)} profiles\n")

    # Group by PPID country
    by_country = defaultdict(list)
    for p in profiles:
        by_country[p['country']].append(p)

    for country, country_profiles in sorted(by_country.items(), key=lambda x: -len(x[1])):
        print(f"\n{country} ({len(country_profiles)} profiles):")
        for p in country_profiles:
            indicators_key = 'indonesian_indicators' if origin_type == 'indonesian' else 'arabic_indicators'
            indicators = ', '.join(p.get(indicators_key, []))
            print(f" {p['name'][:40]:40} | {indicators}")


def main():
    parser = argparse.ArgumentParser(
        description='Identify person profiles by country/origin'
    )
    parser.add_argument('--summary', action='store_true',
                        help='Show summary of all profiles')
    parser.add_argument('--country', type=str,
                        help='Show profiles for specific country code (e.g., ID, NL)')
    parser.add_argument('--detect-indonesian', action='store_true',
                        help='Show profiles with detected Indonesian origin')
    parser.add_argument('--detect-arabic', action='store_true',
                        help='Show profiles with detected Arabic origin')
    parser.add_argument('--data-dir', type=str, default='data/person',
                        help='Person data directory')
    parser.add_argument('--limit', type=int,
                        help='Limit number of profiles to analyze')
    parser.add_argument('--json', action='store_true',
                        help='Output as JSON')

    args = parser.parse_args()

    if not any([args.summary, args.country, args.detect_indonesian, args.detect_arabic]):
        parser.print_help()
        return

    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        print(f"Error: Data directory not found: {data_dir}")
        sys.exit(1)

    print("Analyzing profiles...")
    results = analyze_profiles(data_dir, args.limit)

    if args.json:
        # Convert defaultdict to dict for JSON serialization
        output = {
            'total': results['total'],
            'errors': results['errors'],
            'by_country': {k: len(v) for k, v in results['by_country'].items()},
            'indonesian_origin_count': len(results['indonesian_origin']),
            'arabic_origin_count': len(results['arabic_origin']),
            'indonesian_profiles': results['indonesian_origin'],
            'arabic_profiles': results['arabic_origin'],
        }
        print(json.dumps(output, indent=2))
        return

    if args.summary:
        print_summary(results)

    if args.country:
        print_country_profiles(results, args.country.upper())

    if args.detect_indonesian:
        print_origin_profiles(results, 'indonesian')

    if args.detect_arabic:
        print_origin_profiles(results, 'arabic')


if __name__ == '__main__':
    main()