# glam/scripts/identify_profiles_by_origin.py
#!/usr/bin/env python3
"""
Identify Person Profiles by Country/Origin for Targeted Verification.
This script identifies profiles by:
1. PPID country code (direct detection)
2. Name origin patterns (heuristic detection)
Useful for planning surname database expansion and targeted entity resolution.
Usage:
python scripts/identify_profiles_by_origin.py --summary
python scripts/identify_profiles_by_origin.py --country ID
python scripts/identify_profiles_by_origin.py --detect-arabic
python scripts/identify_profiles_by_origin.py --detect-indonesian
"""
import argparse
import json
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
# =============================================================================
# NAME ORIGIN PATTERNS
# =============================================================================
# Indonesian name patterns (common surnames and name elements).
# Structure: 'surnames' is a set matched against individual whitespace-split
# name tokens; 'first_name_patterns' are regexes applied to the full
# lowercased name (see detect_indonesian_origin).
INDONESIAN_PATTERNS = {
    # Common Indonesian surnames
    'surnames': {
        # Batak clan names (North Sumatra)
        'sitompul', 'simatupang', 'siagian', 'siregar', 'simanjuntak', 'sinaga',
        'simbolon', 'saragih', 'hutapea', 'hutabarat', 'panjaitan', 'pardede',
        'purba', 'pasaribu', 'manullang', 'lumbantobing', 'nainggolan', 'napitupulu',
        'hasibuan', 'harahap', 'lubis', 'nasution', 'daulay', 'ritonga',
        # Widespread Javanese/general Indonesian names
        'wijaya', 'susanto', 'setiawan', 'pratama', 'putra', 'putri', 'sari',
        'wati', 'dewi', 'lestari', 'rahayu', 'handoko', 'santoso', 'hidayat',
        'nugroho', 'kurniawan', 'saputra', 'hermawan', 'gunawan', 'budiman',
        'hartono', 'suryono', 'supriyadi', 'sugiarto', 'wibowo', 'widodo',
        'prasetyo', 'prabowo', 'yulianto', 'cahyono', 'firmansyah', 'syahputra',
        'ramadhan', 'hidayanto', 'kusuma', 'wahyudi', 'setiabudi', 'sulistyo',
    },
    # Common Indonesian first name patterns (refined to reduce false positives)
    # NOTE: These patterns alone are weak signals - best combined with surname matches
    'first_name_patterns': [
        r'^(dwi|eka|catur)\b',  # Number prefixes (removed tri- to avoid Tristan)
        r'(wati|dewi|sari|putri|lestari)$',  # Female suffixes (distinctive)
        r'(wanto|yanto|ianto)$',  # Male -anto suffix (requires at least 5 chars before)
        r'^(sri|siti)\s',  # Common female prefixes (with space, removed nur which is Arabic)
        r'^(agus|budi|eko|joko|bambang|dedi|heri|iwan|wawan)\b',  # Common male names
        r'^(rina|rini|ratna|yuni|yuli|wulan|sinta)\b',  # Common female names (removed rita, ani, eni)
    ],
}
# Arabic name patterns.
# Structure mirrors INDONESIAN_PATTERNS; additionally 'al_prefix_pattern'
# matches hyphenated Al- surnames (see detect_arabic_origin).
ARABIC_PATTERNS = {
    # Common Arabic surnames and family names
    # NOTE(review): 'nasser', 'abu', and 'abou' each appear twice below
    # (harmless — set literals dedupe). Entries with a trailing hyphen
    # ('el-', 'al-') only match a token that is exactly that string, which
    # is rare after whitespace splitting; the prefix case is instead
    # handled by the regex strip in detect_arabic_origin.
    'surnames': {
        # Egyptian surnames
        'mohamed', 'mohammed', 'muhammad', 'ahmed', 'ahmad', 'mahmoud', 'mahmud',
        'hassan', 'hussein', 'hosni', 'hosny', 'mustafa', 'mostafa', 'omar', 'osman',
        'ibrahim', 'ismail', 'youssef', 'yusuf', 'ali', 'aly', 'saleh', 'salih',
        'khalil', 'nasser', 'naser', 'abdel', 'abdul', 'abou', 'abu',
        # Levantine surnames (Lebanon, Syria, Jordan, Palestine)
        # NOTE: Removed 'hanna' as it's also common German/Scandinavian
        'khoury', 'khouri', 'haddad', 'sayegh', 'saad', 'salim', 'salem',
        'mansour', 'nassar', 'nasser', 'issa', 'boutros', 'daher',
        'habib', 'hariri', 'gemayel', 'jumblatt', 'berri', 'aoun', 'frangieh',
        # Gulf surnames
        'al-saud', 'alsaud', 'al-thani', 'althani', 'al-maktoum', 'almaktoum',
        'al-nahyan', 'alnahyan', 'al-sabah', 'alsabah', 'al-khalifa', 'alkhalifa',
        # Maghreb surnames (Morocco, Algeria, Tunisia)
        'benali', 'bensalem', 'benyahia', 'bouazizi', 'bouteflika', 'zidane',
        'benzema', 'belhaj', 'belkaid', 'benamor', 'benabdallah', 'benjelloun',
        # Common prefixes that indicate Arabic names
        'el-', 'al-', 'bin', 'ibn', 'bint', 'abu', 'abou', 'um', 'umm',
    },
    # Arabic first name patterns (with word boundaries to avoid false matches)
    'first_name_patterns': [
        r'^(abdul|abdel|abd[\s\-])',  # Abdul-X compound names
        r'^(abu|abou|um|umm)[\s\-]',  # Abu X, Umm X
        r'^(muhammad|mohammed|mohamed)\b',  # Most common names (not Muhammadi)
        r'^(ahmad|ahmed)\b',  # Ahmad (not Ahmad as part of longer name)
        r'\b(ali|omar|umar|othman|osman|hussein|hussain|hassan|hasan)\b',  # Word boundary both sides
        r'^(ibrahim|ismail|yusuf|youssef|khalid|khaled|abdullah|abdallah)\b',
        r'^(fatima|fatimah|khadija|khadijah|aisha|aysha|maryam|mariam)\b',
        r'^(nour|noor|layla|leila)\b',  # Shortened list to avoid common Western names
        r'^(karim|kareem|rahim|raheem|rashid|rasheed|hamid|hameed)\b',
        r'^(jamal|gamal|walid|waleed|tariq|tarek|samir|samer)\b',
    ],
    # Al- prefix surnames (refined - must be followed by Arabic-style name, not Germanic names)
    # Excludes: Albers, Albert, Allen, Allison, Almquist, Althaus, etc.
    'al_prefix_pattern': r'\bal[\-]([a-z]{3,})',  # Requires hyphen: al-mutairi, al-saud
}
def extract_country_from_ppid(filename: str) -> str:
    """Extract the 2-letter country code embedded in a PPID filename.

    Expected layout: ``ID_<cc>-<location...>_<rest>.json`` where ``<cc>``
    is a two-letter alphabetic country code, e.g. ``ID_nl-ams_123.json``
    yields ``'NL'``.

    Args:
        filename: Base name of a profile file.

    Returns:
        Uppercase 2-letter country code, or ``'XX'`` when the filename
        does not follow the expected layout.
    """
    if not filename.startswith('ID_'):
        return 'XX'
    ppid = filename[3:]
    # Strip the extension only at the end of the name. The previous
    # str.replace('.json', '') also deleted '.json' occurring anywhere
    # inside the PPID itself.
    if ppid.endswith('.json'):
        ppid = ppid[:-len('.json')]
    # PPID fields are '_'-separated; the first is '<country>-<location...>'.
    # split() always returns at least one element, so no length guard needed.
    location_part = ppid.split('_')[0]
    country = location_part.split('-')[0].upper()
    if len(country) == 2 and country.isalpha():
        return country
    return 'XX'
def extract_person_name(profile: dict) -> str:
    """Return the person's display name from a profile dict.

    Checks the nested ``profile_data`` section first, then the top-level
    keys, preferring ``full_name`` over ``name`` in each.

    Returns:
        The first truthy name found, or ``"Unknown"``.
    """
    sources = []
    if 'profile_data' in profile:
        sources.append(profile['profile_data'])
    sources.append(profile)
    for source in sources:
        for key in ('full_name', 'name'):
            if source.get(key):
                return source[key]
    return "Unknown"
def detect_indonesian_origin(name: str) -> Tuple[bool, List[str]]:
    """
    Heuristically decide whether *name* looks Indonesian.

    Signals, in indicator order: exact surname-set hits per name token,
    at most one first-name regex pattern, and Batak family-name fragments.

    Returns: (is_likely_indonesian, list of matching indicators)
    """
    normalized = name.lower().strip()
    # One indicator per token found in the surname set.
    indicators = [
        f"surname:{token}"
        for token in normalized.split()
        if token in INDONESIAN_PATTERNS['surnames']
    ]
    # Record at most one first-name pattern hit.
    hit = next(
        (p for p in INDONESIAN_PATTERNS['first_name_patterns']
         if re.search(p, normalized)),
        None,
    )
    if hit is not None:
        indicators.append(f"pattern:{hit}")
    # Batak double-surname fragments (e.g. Lumbantobing, Hutabarat).
    if re.search(r'lumban|huta|pan[dj]aitan|sima[nt]upang', normalized):
        indicators.append("batak_name")
    return bool(indicators), indicators
def detect_arabic_origin(name: str) -> Tuple[bool, List[str]]:
    """
    Heuristically decide whether *name* looks Arabic.

    Signals, in indicator order: surname-set hits per token (with a
    leading el-/al- article stripped for matching), the hyphenated Al-
    surname form, at most one first-name regex pattern, Abdul/Abu/Ibn
    compounds, and 'bin' followed by a clearly Arabic given name.

    Returns: (is_likely_arabic, list of matching indicators)
    """
    normalized = name.lower().strip()
    indicators: List[str] = []
    for token in normalized.split():
        # Strip a leading el-/al- article so e.g. 'el-sisi' matches 'sisi'.
        bare = re.sub(r'^(el|al)[\-\s]?', '', token)
        if token in ARABIC_PATTERNS['surnames'] or bare in ARABIC_PATTERNS['surnames']:
            indicators.append(f"surname:{token}")
    # Hyphenated Al-X surname form (al-mutairi, al-saud, ...).
    if re.search(ARABIC_PATTERNS['al_prefix_pattern'], normalized):
        indicators.append("al_prefix")
    # Record at most one first-name pattern hit.
    for pattern in ARABIC_PATTERNS['first_name_patterns']:
        if re.search(pattern, normalized):
            indicators.append(f"pattern:{pattern}")
            break
    # Abdul/Abu/Ibn compounds; the leading \b avoids matching "Robin".
    if re.search(r'\b(abdul?|abu|ibn)\s+[a-z]+', normalized):
        indicators.append("compound_name")
    # 'bin' only counts when followed by a clearly Arabic given name.
    if re.search(r'\bbin\s+(abdul|ahmad|ali|mohammed|muhammad|hassan|hussein|khalid|omar|saleh|yusuf|ibrahim)', normalized):
        indicators.append("bin_compound")
    return bool(indicators), indicators
def analyze_profiles(data_dir: Path, limit: Optional[int] = None) -> Dict:
    """Scan all ``ID_*.json`` profiles under *data_dir* and classify them.

    Args:
        data_dir: Directory containing ``ID_*.json`` profile files.
        limit: If given, only the first *limit* files (in sorted name
            order) are analyzed.

    Returns:
        Dict with keys:
          'total'             - number of files considered;
          'by_country'        - country code -> list of profile-info dicts;
          'indonesian_origin' - profile-info dicts flagged Indonesian;
          'arabic_origin'     - profile-info dicts flagged Arabic;
          'errors'            - count of files that could not be processed.
    """
    # Sort for deterministic output: glob() order is filesystem-dependent,
    # which would make --limit and printed results vary across runs/hosts.
    profile_files = sorted(data_dir.glob('ID_*.json'))
    if limit:
        profile_files = profile_files[:limit]
    results = {
        'total': len(profile_files),
        'by_country': defaultdict(list),
        'indonesian_origin': [],
        'arabic_origin': [],
        'errors': 0,
    }
    for file_path in profile_files:
        try:
            country = extract_country_from_ppid(file_path.name)
            with open(file_path, 'r', encoding='utf-8') as f:
                profile = json.load(f)
            name = extract_person_name(profile)
            profile_info = {
                'file': file_path.name,
                'name': name,
                'country': country,
            }
            # profile_info is shared by reference, so indicator keys added
            # below are also visible through the 'by_country' entry.
            results['by_country'][country].append(profile_info)
            # Detect Indonesian origin
            is_indonesian, indonesian_indicators = detect_indonesian_origin(name)
            if is_indonesian:
                profile_info['indonesian_indicators'] = indonesian_indicators
                results['indonesian_origin'].append(profile_info)
            # Detect Arabic origin
            is_arabic, arabic_indicators = detect_arabic_origin(name)
            if is_arabic:
                profile_info['arabic_indicators'] = arabic_indicators
                results['arabic_origin'].append(profile_info)
        except Exception:
            # Deliberate best-effort: a malformed/unreadable file is counted
            # and skipped rather than aborting the whole scan.
            results['errors'] += 1
    return results
def print_summary(results: Dict):
    """Print a console overview of the analysis: totals, per-country
    counts (top 20), and up to 15 profiles per detected-origin group."""
    bar = "=" * 70
    rule = "-" * 70
    print(bar)
    print("PROFILE ORIGIN ANALYSIS SUMMARY")
    print(bar)
    print(f"\nTotal profiles analyzed: {results['total']:,}")
    print(f"Errors: {results['errors']}")
    print("\n" + rule)
    print("PROFILES BY PPID COUNTRY CODE")
    print(rule)
    # Largest country groups first.
    by_size = sorted(results['by_country'].items(), key=lambda kv: -len(kv[1]))
    for code, members in by_size[:20]:
        print(f" {code}: {len(members):,} profiles")
    if len(by_size) > 20:
        print(f" ... and {len(by_size) - 20} more countries")
    # The two origin sections are structurally identical; drive both
    # from one table of (header label, results key, indicator key).
    sections = [
        ("INDONESIAN ORIGIN DETECTED", 'indonesian_origin', 'indonesian_indicators'),
        ("ARABIC ORIGIN DETECTED", 'arabic_origin', 'arabic_indicators'),
    ]
    for label, result_key, indicator_key in sections:
        flagged = results[result_key]
        print("\n" + rule)
        print(f"{label}: {len(flagged)} profiles")
        print(rule)
        for entry in flagged[:15]:
            indicators = ', '.join(entry.get(indicator_key, []))
            print(f" {entry['name'][:40]:40} | {entry['country']} | {indicators}")
        if len(flagged) > 15:
            print(f" ... and {len(flagged) - 15} more")
def print_country_profiles(results: Dict, country: str):
    """List every profile recorded under *country*, flagging each name
    whose origin the Indonesian/Arabic detectors recognize."""
    profiles = results['by_country'].get(country, [])
    print(f"\n{'=' * 70}")
    print(f"PROFILES FOR COUNTRY: {country}")
    print(f"{'=' * 70}")
    print(f"Total: {len(profiles)} profiles\n")
    for entry in profiles:
        # Re-run both detectors on the stored display name.
        is_indo, _ = detect_indonesian_origin(entry['name'])
        is_arab, _ = detect_arabic_origin(entry['name'])
        flag_parts = []
        if is_indo:
            flag_parts.append("🇮🇩")
        if is_arab:
            flag_parts.append("🇸🇦")
        flags = ' '.join(flag_parts)
        print(f" {entry['name'][:45]:45} | {entry['file'][:40]} {flags}")
def print_origin_profiles(results: Dict, origin_type: str):
    """Print the profiles detected as one origin ('indonesian' or
    'arabic'), grouped by PPID country code, largest group first."""
    # Dispatch table: origin type -> (results key, header title, indicator key).
    config = {
        'indonesian': ('indonesian_origin', "INDONESIAN ORIGIN", 'indonesian_indicators'),
        'arabic': ('arabic_origin', "ARABIC ORIGIN", 'arabic_indicators'),
    }
    if origin_type not in config:
        print(f"Unknown origin type: {origin_type}")
        return
    result_key, title, indicators_key = config[origin_type]
    profiles = results[result_key]
    print(f"\n{'=' * 70}")
    print(f"{title} PROFILES")
    print(f"{'=' * 70}")
    print(f"Total: {len(profiles)} profiles\n")
    # Group by PPID country before printing.
    grouped = defaultdict(list)
    for entry in profiles:
        grouped[entry['country']].append(entry)
    for country, members in sorted(grouped.items(), key=lambda kv: -len(kv[1])):
        print(f"\n{country} ({len(members)} profiles):")
        for entry in members:
            indicators = ', '.join(entry.get(indicators_key, []))
            print(f" {entry['name'][:40]:40} | {indicators}")
def main():
    """CLI entry point: parse flags, scan the data directory, and print
    (or JSON-dump) the requested views of the analysis results."""
    parser = argparse.ArgumentParser(
        description='Identify person profiles by country/origin'
    )
    parser.add_argument('--summary', action='store_true',
                        help='Show summary of all profiles')
    parser.add_argument('--country', type=str,
                        help='Show profiles for specific country code (e.g., ID, NL)')
    parser.add_argument('--detect-indonesian', action='store_true',
                        help='Show profiles with detected Indonesian origin')
    parser.add_argument('--detect-arabic', action='store_true',
                        help='Show profiles with detected Arabic origin')
    parser.add_argument('--data-dir', type=str, default='data/person',
                        help='Person data directory')
    parser.add_argument('--limit', type=int,
                        help='Limit number of profiles to analyze')
    parser.add_argument('--json', action='store_true',
                        help='Output as JSON')
    args = parser.parse_args()
    # With no action flag selected, show usage instead of scanning.
    if not any([args.summary, args.country, args.detect_indonesian, args.detect_arabic]):
        parser.print_help()
        return
    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        print(f"Error: Data directory not found: {data_dir}")
        sys.exit(1)
    print("Analyzing profiles...")
    # One scan serves every requested output mode below.
    results = analyze_profiles(data_dir, args.limit)
    if args.json:
        # Convert defaultdict to dict for JSON serialization; --json
        # replaces (not supplements) the human-readable views.
        output = {
            'total': results['total'],
            'errors': results['errors'],
            'by_country': {k: len(v) for k, v in results['by_country'].items()},
            'indonesian_origin_count': len(results['indonesian_origin']),
            'arabic_origin_count': len(results['arabic_origin']),
            'indonesian_profiles': results['indonesian_origin'],
            'arabic_profiles': results['arabic_origin'],
        }
        print(json.dumps(output, indent=2))
        return
    # Action flags are not mutually exclusive; each selected view prints.
    if args.summary:
        print_summary(results)
    if args.country:
        print_country_profiles(results, args.country.upper())
    if args.detect_indonesian:
        print_origin_profiles(results, 'indonesian')
    if args.detect_arabic:
        print_origin_profiles(results, 'arabic')


if __name__ == '__main__':
    main()