408 lines
16 KiB
Python
408 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Identify Person Profiles by Country/Origin for Targeted Verification.
|
|
|
|
This script identifies profiles by:
|
|
1. PPID country code (direct detection)
|
|
2. Name origin patterns (heuristic detection)
|
|
|
|
Useful for planning surname database expansion and targeted entity resolution.
|
|
|
|
Usage:
|
|
python scripts/identify_profiles_by_origin.py --summary
|
|
python scripts/identify_profiles_by_origin.py --country ID
|
|
python scripts/identify_profiles_by_origin.py --detect-arabic
|
|
python scripts/identify_profiles_by_origin.py --detect-indonesian
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set, Tuple
|
|
|
|
|
|
# =============================================================================
# NAME ORIGIN PATTERNS
# =============================================================================

# Indonesian name patterns (common surnames and name elements).
# Surname hits are the strong signal; first-name patterns are weak on their own.
INDONESIAN_PATTERNS = {
    # Common Indonesian surnames: Batak clan names first, then widespread
    # Javanese/Sundanese surnames.
    'surnames': {
        'sitompul', 'simatupang', 'siagian', 'siregar', 'simanjuntak', 'sinaga',
        'simbolon', 'saragih', 'hutapea', 'hutabarat', 'panjaitan', 'pardede',
        'purba', 'pasaribu', 'manullang', 'lumbantobing', 'nainggolan', 'napitupulu',
        'hasibuan', 'harahap', 'lubis', 'nasution', 'daulay', 'ritonga',
        'wijaya', 'susanto', 'setiawan', 'pratama', 'putra', 'putri', 'sari',
        'wati', 'dewi', 'lestari', 'rahayu', 'handoko', 'santoso', 'hidayat',
        'nugroho', 'kurniawan', 'saputra', 'hermawan', 'gunawan', 'budiman',
        'hartono', 'suryono', 'supriyadi', 'sugiarto', 'wibowo', 'widodo',
        'prasetyo', 'prabowo', 'yulianto', 'cahyono', 'firmansyah', 'syahputra',
        'ramadhan', 'hidayanto', 'kusuma', 'wahyudi', 'setiabudi', 'sulistyo',
    },
    # Common Indonesian first name patterns (refined to reduce false positives).
    # NOTE: These patterns alone are weak signals - best combined with surname matches.
    'first_name_patterns': [
        r'^(dwi|eka|catur)\b',  # Birth-order number prefixes (tri- omitted to avoid e.g. Tristan)
        r'(wati|dewi|sari|putri|lestari)$',  # Distinctive female name suffixes
        r'(wanto|yanto|ianto)$',  # Male -anto/-yanto name suffixes
        r'^(sri|siti)\s',  # Common female prefixes (trailing space required; nur omitted: also Arabic)
        r'^(agus|budi|eko|joko|bambang|dedi|heri|iwan|wawan)\b',  # Common male names
        r'^(rina|rini|ratna|yuni|yuli|wulan|sinta)\b',  # Common female names (rita, ani, eni omitted)
    ],
}
|
|
|
|
# Arabic name patterns, grouped roughly by region.
# NOTE: 'surnames' is a set literal, so the few repeated entries below
# ('nasser', 'abu', 'abou') are deduplicated automatically and harmless.
ARABIC_PATTERNS = {
    # Common Arabic surnames and family names
    'surnames': {
        # Egyptian surnames (with common transliteration variants)
        'mohamed', 'mohammed', 'muhammad', 'ahmed', 'ahmad', 'mahmoud', 'mahmud',
        'hassan', 'hussein', 'hosni', 'hosny', 'mustafa', 'mostafa', 'omar', 'osman',
        'ibrahim', 'ismail', 'youssef', 'yusuf', 'ali', 'aly', 'saleh', 'salih',
        'khalil', 'nasser', 'naser', 'abdel', 'abdul', 'abou', 'abu',
        # Levantine surnames (Lebanon, Syria, Jordan, Palestine)
        # NOTE: Removed 'hanna' as it's also common German/Scandinavian
        'khoury', 'khouri', 'haddad', 'sayegh', 'saad', 'salim', 'salem',
        'mansour', 'nassar', 'nasser', 'issa', 'boutros', 'daher',
        'habib', 'hariri', 'gemayel', 'jumblatt', 'berri', 'aoun', 'frangieh',
        # Gulf surnames (ruling-family names, hyphenated and collapsed forms)
        'al-saud', 'alsaud', 'al-thani', 'althani', 'al-maktoum', 'almaktoum',
        'al-nahyan', 'alnahyan', 'al-sabah', 'alsabah', 'al-khalifa', 'alkhalifa',
        # Maghreb surnames (Morocco, Algeria, Tunisia)
        'benali', 'bensalem', 'benyahia', 'bouazizi', 'bouteflika', 'zidane',
        'benzema', 'belhaj', 'belkaid', 'benamor', 'benabdallah', 'benjelloun',
        # Common prefixes/particles that indicate Arabic names
        'el-', 'al-', 'bin', 'ibn', 'bint', 'abu', 'abou', 'um', 'umm',
    },
    # Arabic first name patterns (with word boundaries to avoid false matches)
    'first_name_patterns': [
        r'^(abdul|abdel|abd[\s\-])',  # Abdul-X compound names
        r'^(abu|abou|um|umm)[\s\-]',  # Abu X / Umm X (kunya forms)
        r'^(muhammad|mohammed|mohamed)\b',  # Most common names (\b excludes e.g. Muhammadi)
        r'^(ahmad|ahmed)\b',  # Ahmad/Ahmed at the start of the name only
        r'\b(ali|omar|umar|othman|osman|hussein|hussain|hassan|hasan)\b',  # Word boundary both sides
        r'^(ibrahim|ismail|yusuf|youssef|khalid|khaled|abdullah|abdallah)\b',
        r'^(fatima|fatimah|khadija|khadijah|aisha|aysha|maryam|mariam)\b',
        r'^(nour|noor|layla|leila)\b',  # Shortened list to avoid common Western names
        r'^(karim|kareem|rahim|raheem|rashid|rasheed|hamid|hameed)\b',
        r'^(jamal|gamal|walid|waleed|tariq|tarek|samir|samer)\b',
    ],
    # Al- prefix surnames: hyphen is required (al-mutairi, al-saud), which
    # excludes unhyphenated Germanic names such as Albers, Albert, Allen,
    # Allison, Almquist, Althaus.
    'al_prefix_pattern': r'\bal[\-]([a-z]{3,})',
}
|
|
|
|
|
|
def extract_country_from_ppid(filename: str) -> str:
    """Extract the 2-letter country code from a PPID filename.

    Filenames are expected to look like ``ID_<location>_<rest>.json`` where
    ``<location>`` starts with a 2-letter country code, e.g.
    ``ID_nl-ams_12345.json`` -> ``NL``.

    Args:
        filename: Basename of a profile JSON file.

    Returns:
        Uppercase 2-letter country code, or 'XX' when it cannot be parsed.
    """
    if not filename.startswith('ID_'):
        return 'XX'

    # Strip the 'ID_' prefix and a trailing '.json' extension. The suffix is
    # only removed at the end of the name (the old str.replace would have
    # stripped a '.json' occurring anywhere in the PPID).
    ppid = filename[3:]
    if ppid.endswith('.json'):
        ppid = ppid[:-len('.json')]

    # PPID layout: '<location>_<rest>'; the location itself is
    # '<country>-<city>...'. str.split always yields at least one element,
    # so no length checks are needed.
    location_part = ppid.split('_')[0]
    country = location_part.split('-')[0].upper()
    if len(country) == 2 and country.isalpha():
        return country

    return 'XX'
|
|
|
|
|
|
def extract_person_name(profile: dict) -> str:
    """Return the person's name from a profile dict.

    Candidate fields are tried in priority order: nested
    ``profile_data.full_name``, ``profile_data.name``, then top-level
    ``full_name`` and ``name``. Falls back to "Unknown" when none is set.
    """
    candidates = []
    if 'profile_data' in profile:
        nested = profile['profile_data']
        candidates.extend((nested.get('full_name'), nested.get('name')))
    candidates.extend((profile.get('full_name'), profile.get('name')))

    # First truthy candidate wins (empty strings / None are skipped).
    for value in candidates:
        if value:
            return value
    return "Unknown"
|
|
|
|
|
|
def detect_indonesian_origin(name: str) -> Tuple[bool, List[str]]:
    """
    Detect if a name appears to be of Indonesian origin.

    Returns: (is_likely_indonesian, list of matching indicators)
    """
    normalized = name.lower().strip()

    # Surname hits, one indicator per matching token, in token order.
    indicators = [
        f"surname:{token}"
        for token in normalized.split()
        if token in INDONESIAN_PATTERNS['surnames']
    ]

    # At most one first-name pattern indicator (weak signal on its own).
    hit = next(
        (p for p in INDONESIAN_PATTERNS['first_name_patterns']
         if re.search(p, normalized)),
        None,
    )
    if hit is not None:
        indicators.append(f"pattern:{hit}")

    # Batak clan-name fragments (e.g. Lumbantobing, Hutabarat, Panjaitan).
    if re.search(r'lumban|huta|pan[dj]aitan|sima[nt]upang', normalized):
        indicators.append("batak_name")

    # Any single indicator is enough to flag the name.
    return bool(indicators), indicators
|
|
|
|
|
|
def detect_arabic_origin(name: str) -> Tuple[bool, List[str]]:
    """
    Detect if a name appears to be of Arabic origin.

    Returns: (is_likely_arabic, list of matching indicators)
    """
    normalized = name.lower().strip()
    indicators = []
    surnames = ARABIC_PATTERNS['surnames']

    # Surname check: try each token both as-is and with a leading el-/al-
    # article stripped; the indicator always records the original token.
    for token in normalized.split():
        stripped = re.sub(r'^(el|al)[\-\s]?', '', token)
        if token in surnames or stripped in surnames:
            indicators.append(f"surname:{token}")

    # Hyphenated Al-X family names (al-saud, al-mutairi, ...).
    if re.search(ARABIC_PATTERNS['al_prefix_pattern'], normalized):
        indicators.append("al_prefix")

    # At most one first-name pattern indicator.
    for candidate in ARABIC_PATTERNS['first_name_patterns']:
        if re.search(candidate, normalized):
            indicators.append(f"pattern:{candidate}")
            break

    # Compound names: Abdul Rahman, Abu Bakr, Ibn Rushd. The leading \b
    # avoids matching the tail of Western names such as "Robin".
    if re.search(r'\b(abdul?|abu|ibn)\s+[a-z]+', normalized):
        indicators.append("compound_name")
    # "bin"/"bint" only count when followed by a recognized Arabic given name.
    if re.search(r'\bbin\s+(abdul|ahmad|ali|mohammed|muhammad|hassan|hussein|khalid|omar|saleh|yusuf|ibrahim)', normalized):
        indicators.append("bin_compound")

    # Any single indicator is enough to flag the name.
    return bool(indicators), indicators
|
|
|
|
|
|
def analyze_profiles(data_dir: Path, limit: Optional[int] = None) -> Dict:
    """Analyze all profile files in *data_dir* and return statistics.

    Args:
        data_dir: Directory containing ``ID_*.json`` profile files.
        limit: Optional cap on the number of files to analyze.

    Returns:
        Dict with keys:
            'total': number of files considered (after applying *limit*),
            'by_country': defaultdict mapping country code -> profile infos,
            'indonesian_origin' / 'arabic_origin': flagged profile infos,
            'errors': count of files that failed to load or parse.
    """
    # Sort the glob result: glob order is filesystem-dependent, so without
    # sorting, --limit would select a nondeterministic subset of profiles.
    profile_files = sorted(data_dir.glob('ID_*.json'))

    if limit:
        profile_files = profile_files[:limit]

    results = {
        'total': len(profile_files),
        'by_country': defaultdict(list),
        'indonesian_origin': [],
        'arabic_origin': [],
        'errors': 0,
    }

    for file_path in profile_files:
        try:
            country = extract_country_from_ppid(file_path.name)

            with open(file_path, 'r', encoding='utf-8') as f:
                profile = json.load(f)

            name = extract_person_name(profile)

            profile_info = {
                'file': file_path.name,
                'name': name,
                'country': country,
            }

            results['by_country'][country].append(profile_info)

            # Heuristic origin detection; a profile may match both origins,
            # in which case the same dict is shared by both result lists.
            is_indonesian, indonesian_indicators = detect_indonesian_origin(name)
            if is_indonesian:
                profile_info['indonesian_indicators'] = indonesian_indicators
                results['indonesian_origin'].append(profile_info)

            is_arabic, arabic_indicators = detect_arabic_origin(name)
            if is_arabic:
                profile_info['arabic_indicators'] = arabic_indicators
                results['arabic_origin'].append(profile_info)

        except Exception:
            # Best-effort scan: unreadable or corrupt files are counted,
            # not fatal, so one bad file cannot abort the whole analysis.
            results['errors'] += 1

    return results
|
|
|
|
|
|
def print_summary(results: Dict):
    """Print a human-readable summary of the profile-origin analysis."""
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print(heavy_rule)
    print("PROFILE ORIGIN ANALYSIS SUMMARY")
    print(heavy_rule)
    print(f"\nTotal profiles analyzed: {results['total']:,}")
    print(f"Errors: {results['errors']}")

    print("\n" + light_rule)
    print("PROFILES BY PPID COUNTRY CODE")
    print(light_rule)

    # Largest country buckets first; show the top 20 only.
    ranked = sorted(results['by_country'].items(), key=lambda kv: -len(kv[1]))
    for code, members in ranked[:20]:
        print(f" {code}: {len(members):,} profiles")
    if len(ranked) > 20:
        print(f" ... and {len(ranked) - 20} more countries")

    indonesian = results['indonesian_origin']
    print("\n" + light_rule)
    print(f"INDONESIAN ORIGIN DETECTED: {len(indonesian)} profiles")
    print(light_rule)

    for entry in indonesian[:15]:
        joined = ', '.join(entry.get('indonesian_indicators', []))
        print(f" {entry['name'][:40]:40} | {entry['country']} | {joined}")
    if len(indonesian) > 15:
        print(f" ... and {len(indonesian) - 15} more")

    arabic = results['arabic_origin']
    print("\n" + light_rule)
    print(f"ARABIC ORIGIN DETECTED: {len(arabic)} profiles")
    print(light_rule)

    for entry in arabic[:15]:
        joined = ', '.join(entry.get('arabic_indicators', []))
        print(f" {entry['name'][:40]:40} | {entry['country']} | {joined}")
    if len(arabic) > 15:
        print(f" ... and {len(arabic) - 15} more")
|
|
|
|
|
|
def print_country_profiles(results: Dict, country: str):
    """Print every profile bucketed under *country*, with origin flags."""
    entries = results['by_country'].get(country, [])

    rule = '=' * 70
    print(f"\n{rule}")
    print(f"PROFILES FOR COUNTRY: {country}")
    print(f"{rule}")
    print(f"Total: {len(entries)} profiles\n")

    for entry in entries:
        # Re-run the heuristics so each row can be flagged by detected origin.
        flagged = []
        if detect_indonesian_origin(entry['name'])[0]:
            flagged.append("🇮🇩")
        if detect_arabic_origin(entry['name'])[0]:
            flagged.append("🇸🇦")

        flags = ' '.join(flagged) if flagged else ''
        print(f" {entry['name'][:45]:45} | {entry['file'][:40]} {flags}")
|
|
|
|
|
|
def print_origin_profiles(results: Dict, origin_type: str):
    """Print profiles flagged for one origin type, grouped by PPID country."""
    # Dispatch table: origin_type -> (results key, report title, indicator key).
    known = {
        'indonesian': ('indonesian_origin', "INDONESIAN ORIGIN", 'indonesian_indicators'),
        'arabic': ('arabic_origin', "ARABIC ORIGIN", 'arabic_indicators'),
    }
    if origin_type not in known:
        print(f"Unknown origin type: {origin_type}")
        return
    result_key, title, indicators_key = known[origin_type]
    profiles = results[result_key]

    rule = '=' * 70
    print(f"\n{rule}")
    print(f"{title} PROFILES")
    print(f"{rule}")
    print(f"Total: {len(profiles)} profiles\n")

    # Group by PPID country, largest group first.
    grouped = defaultdict(list)
    for entry in profiles:
        grouped[entry['country']].append(entry)

    for country, members in sorted(grouped.items(), key=lambda kv: -len(kv[1])):
        print(f"\n{country} ({len(members)} profiles):")
        for entry in members:
            joined = ', '.join(entry.get(indicators_key, []))
            print(f" {entry['name'][:40]:40} | {joined}")
|
|
|
|
|
|
def main():
    """CLI entry point: parse flags, run the analysis, emit the reports."""
    cli = argparse.ArgumentParser(
        description='Identify person profiles by country/origin'
    )
    cli.add_argument('--summary', action='store_true',
                     help='Show summary of all profiles')
    cli.add_argument('--country', type=str,
                     help='Show profiles for specific country code (e.g., ID, NL)')
    cli.add_argument('--detect-indonesian', action='store_true',
                     help='Show profiles with detected Indonesian origin')
    cli.add_argument('--detect-arabic', action='store_true',
                     help='Show profiles with detected Arabic origin')
    cli.add_argument('--data-dir', type=str, default='data/person',
                     help='Person data directory')
    cli.add_argument('--limit', type=int,
                     help='Limit number of profiles to analyze')
    cli.add_argument('--json', action='store_true',
                     help='Output as JSON')

    opts = cli.parse_args()

    # No report mode selected: show usage instead of silently doing nothing.
    if not any([opts.summary, opts.country, opts.detect_indonesian, opts.detect_arabic]):
        cli.print_help()
        return

    profile_dir = Path(opts.data_dir)
    if not profile_dir.exists():
        print(f"Error: Data directory not found: {profile_dir}")
        sys.exit(1)

    print("Analyzing profiles...")
    analysis = analyze_profiles(profile_dir, opts.limit)

    if opts.json:
        # Build a plain-dict payload: the defaultdict and the per-country
        # profile lists are reduced to counts so json.dumps can serialize it.
        payload = {
            'total': analysis['total'],
            'errors': analysis['errors'],
            'by_country': {code: len(entries)
                           for code, entries in analysis['by_country'].items()},
            'indonesian_origin_count': len(analysis['indonesian_origin']),
            'arabic_origin_count': len(analysis['arabic_origin']),
            'indonesian_profiles': analysis['indonesian_origin'],
            'arabic_profiles': analysis['arabic_origin'],
        }
        print(json.dumps(payload, indent=2))
        return

    # The report flags are independent; any combination may be active,
    # and the reports are emitted in this fixed order.
    if opts.summary:
        print_summary(analysis)
    if opts.country:
        print_country_profiles(analysis, opts.country.upper())
    if opts.detect_indonesian:
        print_origin_profiles(analysis, 'indonesian')
    if opts.detect_arabic:
        print_origin_profiles(analysis, 'arabic')


if __name__ == '__main__':
    main()
|