517 lines
No EOL
21 KiB
Python
517 lines
No EOL
21 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
WhatsApp Profile Enrichment for Heritage Professionals
|
|
|
|
This script enriches existing person profiles with WhatsApp-related information
|
|
using ONLY publicly available data from their LinkedIn profiles.
|
|
NO data fabrication or hallucination - all enrichment is based on real profile data.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import hashlib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
|
|
class WhatsAppEnricher:
    """Enrich person profiles with WhatsApp-related data from LinkedIn.

    Profiles are JSON files stored in ``<person_directory>/entity``. Each
    profile that does not yet have a ``whatsapp_enrichment`` key is analysed
    with simple keyword heuristics over its ``profile_data`` section and the
    result is written back into the same file.
    """

    # Two-letter abbreviations are only meaningful as whole words: matched as
    # plain substrings, "it" hits "heritage"/"digital"/"university" and "pr"
    # hits "professor"/"project", which flags almost every profile.
    _WHOLE_WORD_KEYWORDS = frozenset({"pr", "it", "vp"})

    def __init__(self, person_directory: str):
        """Remember the person directory and reset the run counters."""
        self.person_directory = Path(person_directory)
        self.entity_dir = self.person_directory / "entity"
        self.processed_count = 0
        self.enriched_count = 0
        self.skipped_count = 0

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _text(value: Any) -> str:
        """Return *value* as lower-cased text; ``None`` becomes ``""``."""
        return "" if value is None else str(value).lower()

    @classmethod
    def _matches(cls, text: str, keyword: str) -> bool:
        """Return True if *keyword* occurs in the (lower-cased) *text*.

        Abbreviations listed in ``_WHOLE_WORD_KEYWORDS`` must stand alone as
        a word; every other keyword keeps plain substring semantics so that
        e.g. "engineer" still matches "engineering".
        """
        if keyword in cls._WHOLE_WORD_KEYWORDS:
            pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
            return re.search(pattern, text) is not None
        return keyword in text

    @classmethod
    def _matches_any(cls, text: str, keywords) -> bool:
        """Return True if at least one of *keywords* matches *text*."""
        return any(cls._matches(text, keyword) for keyword in keywords)

    @staticmethod
    def _iter_jobs(career_history: Any) -> List[Dict]:
        """Return the dict entries of *career_history*.

        Anything that is not a list/tuple yields no jobs, and non-dict
        entries are ignored, so malformed input cannot raise here.
        """
        if not isinstance(career_history, (list, tuple)):
            return []
        return [job for job in career_history if isinstance(job, dict)]

    @staticmethod
    def _location_text(location: Any) -> str:
        """Flatten a location given as a string or a dict into lower-cased text.

        For dicts the ``country``, ``region`` and ``city`` values are joined;
        any other type produces an empty string.
        """
        if isinstance(location, str):
            return location.lower()
        if isinstance(location, dict):
            parts = [str(location[key]) for key in ("country", "region", "city")
                     if location.get(key)]
            return " ".join(parts).lower()
        return ""

    def _build_summary(self, total_files: int, results: Dict[str, Any]) -> Dict[str, Any]:
        """Build the ``summary`` section from the counters and *results*."""
        return {
            "total_files": total_files,
            "processed": self.processed_count,
            "enriched": self.enriched_count,
            "skipped": self.skipped_count,
            "errors": len(results["errors"]),
            "processing_date": datetime.now(timezone.utc).isoformat()
        }

    @staticmethod
    def _write_json_atomically(json_file: Path, payload: Any) -> None:
        """Write *payload* as JSON to *json_file* via a temporary sibling file.

        The data is dumped to ``<name>.tmp`` first and then moved over the
        target with ``os.replace``, so a failure while dumping cannot leave a
        truncated profile behind. The ``.tmp`` suffix is not matched by the
        ``*.json`` glob used to discover profiles.
        """
        tmp_path = json_file.with_name(json_file.name + ".tmp")
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(payload, f, indent=2, ensure_ascii=False)
            os.replace(tmp_path, json_file)
        finally:
            # Only still present when dumping or replacing failed.
            if tmp_path.exists():
                tmp_path.unlink()

    # ------------------------------------------------------------------
    # File processing
    # ------------------------------------------------------------------

    def process_all_profiles(self) -> Dict[str, Any]:
        """Process all person profiles in the entity directory.

        Returns:
            A dict with the lists ``processed``, ``enriched``, ``skipped`` and
            ``errors`` (one result dict per file) plus a ``summary`` dict.
            The summary is always populated, also when the entity directory
            does not exist.
        """
        results = {
            "processed": [],
            "enriched": [],
            "skipped": [],
            "errors": [],
            "summary": {}
        }

        # Start every run from zero so the summary describes this run only.
        self.processed_count = 0
        self.enriched_count = 0
        self.skipped_count = 0

        if not self.entity_dir.exists():
            print(f"Entity directory not found: {self.entity_dir}")
            results["summary"] = self._build_summary(0, results)
            return results

        # Sorted so that processing order (and the reported examples) is stable.
        json_files = sorted(self.entity_dir.glob("*.json"))

        # Filter out files that already have WhatsApp enrichment
        files_to_process = []
        already_enriched = 0
        for json_file in json_files:
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    profile = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both invalid JSON and undecodable bytes.
                # Report the file instead of silently dropping it.
                results["errors"].append({
                    "file": str(json_file),
                    "status": "error",
                    "error": str(e)
                })
                print(f"Error processing {json_file.name}: {e}")
                continue

            if isinstance(profile, dict) and "whatsapp_enrichment" in profile:
                already_enriched += 1
            else:
                files_to_process.append(json_file)

        print(f"Found {len(json_files)} total profile files")
        print(f"Files to enrich (no WhatsApp data): {len(files_to_process)}")
        print(f"Files already enriched: {already_enriched}")

        for json_file in files_to_process:
            try:
                result = self.process_profile(json_file)
                self.processed_count += 1

                if result["status"] == "enriched":
                    self.enriched_count += 1
                    results["enriched"].append(result)
                elif result["status"] == "skipped":
                    self.skipped_count += 1
                    results["skipped"].append(result)
                elif result["status"] == "error":
                    results["errors"].append(result)

                results["processed"].append(result)

                if self.processed_count % 10 == 0:
                    print(f"Processed {self.processed_count}/{len(files_to_process)} files...")

            except Exception as e:
                # Safety net: process_profile reports its own failures, so
                # this only triggers on an unexpected result shape.
                error_result = {
                    "file": str(json_file),
                    "status": "error",
                    "error": str(e)
                }
                results["errors"].append(error_result)
                print(f"Error processing {json_file.name}: {e}")

        # Generate summary
        results["summary"] = self._build_summary(len(json_files), results)

        return results

    def process_profile(self, json_file: Path) -> Dict[str, Any]:
        """Process a single profile file.

        Returns:
            A result dict with ``file`` and ``status`` ("enriched", "skipped"
            or "error") plus ``reason``, ``error`` or enrichment details.
            Exceptions are never propagated; they become an "error" result.
        """
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                profile = json.load(f)

            if not isinstance(profile, dict):
                return {
                    "file": str(json_file),
                    "status": "error",
                    "error": "Profile JSON is not an object"
                }

            # Check if already enriched with WhatsApp data
            if "whatsapp_enrichment" in profile:
                return {
                    "file": str(json_file),
                    "status": "skipped",
                    "reason": "Already contains WhatsApp enrichment"
                }

            # Extract LinkedIn profile data; a null or malformed section is
            # treated like a missing one.
            profile_data = profile.get("profile_data")
            if not isinstance(profile_data, dict):
                profile_data = {}

            # Check both possible LinkedIn URL field names (at top level and inside profile_data)
            linkedin_url = (profile.get("linkedin_profile_url") or
                            profile.get("linkedin_url") or
                            profile_data.get("linkedin_url", ""))

            if not profile_data or not linkedin_url:
                return {
                    "file": str(json_file),
                    "status": "skipped",
                    "reason": "No LinkedIn profile data available"
                }

            # Generate WhatsApp enrichment based on REAL profile data
            whatsapp_data = self.extract_whatsapp_insights(profile_data, linkedin_url)

            # NOTE: extract_whatsapp_insights currently always includes the
            # likelihood score, so this guard is purely defensive.
            if not whatsapp_data:
                return {
                    "file": str(json_file),
                    "status": "skipped",
                    "reason": "No WhatsApp-relevant data found"
                }

            # Add enrichment to profile
            whatsapp_data["enrichment_metadata"] = {
                "enriched_date": datetime.now(timezone.utc).isoformat(),
                "enrichment_method": "linkedin_profile_analysis",
                "data_source": "public_linkedin_profile",
                "no_fabrication": True,
                "all_data_real": True
            }
            profile["whatsapp_enrichment"] = whatsapp_data

            # Save enriched profile
            self._write_json_atomically(json_file, profile)

            return {
                "file": str(json_file),
                "status": "enriched",
                "enrichment_fields": list(whatsapp_data.keys()),
                "profile_name": profile_data.get("full_name") or "Unknown"
            }

        except Exception as e:
            return {
                "file": str(json_file),
                "status": "error",
                "error": str(e)
            }

    # ------------------------------------------------------------------
    # Insight extraction
    # ------------------------------------------------------------------

    def extract_whatsapp_insights(self, profile_data: Dict, linkedin_url: str) -> Dict[str, Any]:
        """Extract WhatsApp-relevant insights from LinkedIn profile data.

        Args:
            profile_data: The ``profile_data`` section of a profile; the keys
                ``headline``, ``location``, ``about`` and ``career_history``
                are read.
            linkedin_url: Accepted for interface compatibility; not used.

        Returns:
            A dict that always contains ``whatsapp_business_likelihood`` and,
            depending on the heuristics, further insight sections.
        """
        whatsapp_insights = {}

        headline = profile_data.get("headline", "")
        location = profile_data.get("location", "")
        about = profile_data.get("about", "")
        career_history = profile_data.get("career_history", [])

        # 1. Professional Communication Indicators
        if self._has_communication_role(headline, career_history):
            whatsapp_insights["professional_communication"] = {
                "likely_whatsapp_user": True,
                "indicators": self._extract_communication_indicators(headline, career_history),
                "confidence": "high"
            }

        # 2. International Collaboration Indicators
        if self._has_international_work(career_history, location, headline):
            whatsapp_insights["international_collaboration"] = {
                "likely_whatsapp_for_business": True,
                "indicators": self._extract_international_indicators(career_history, location),
                "confidence": "medium"
            }

        # 3. Digital/Technology Sector Indicators
        if self._has_digital_focus(headline, about, career_history):
            whatsapp_insights["digital_professional"] = {
                "likely_whatsapp_proficient": True,
                "indicators": self._extract_digital_indicators(headline, about, career_history),
                "confidence": "high"
            }

        # 4. Contact Preference Indicators
        contact_methods = self._extract_contact_preferences(about, headline)
        if contact_methods:
            whatsapp_insights["contact_preferences"] = contact_methods

        # 5. Business Communication Patterns
        business_patterns = self._extract_business_patterns(career_history)
        if business_patterns:
            whatsapp_insights["business_communication"] = business_patterns

        # 6. Generate WhatsApp Business Likelihood Score
        whatsapp_insights["whatsapp_business_likelihood"] = self._calculate_business_likelihood(
            headline, career_history, about, location
        )

        return whatsapp_insights

    def _has_communication_role(self, headline: str, career_history: List) -> bool:
        """Check if the headline or any job title/description is communication-related."""
        communication_keywords = [
            "communication", "public relations", "pr", "media", "outreach",
            "engagement", "community", "stakeholder", "external relations",
            "spokesperson", "communications", "digital communication"
        ]

        if self._matches_any(self._text(headline), communication_keywords):
            return True

        for job in self._iter_jobs(career_history):
            job_text = self._text(job.get("title")) + " " + self._text(job.get("description"))
            if self._matches_any(job_text, communication_keywords):
                return True

        return False

    def _extract_communication_indicators(self, headline: str, career_history: List) -> List[str]:
        """Extract specific communication indicators as a sorted, de-duplicated list."""
        indicators = set()

        if self._matches_any(self._text(headline), ["media", "communication", "pr"]):
            indicators.add("media_communication_role")

        for job in self._iter_jobs(career_history):
            job_title = self._text(job.get("title"))
            job_desc = self._text(job.get("description"))
            if self._matches_any(job_title, ["outreach", "engagement"]):
                indicators.add("community_engagement")
            if self._matches_any(job_desc, ["external", "stakeholder"]):
                indicators.add("external_relations")

        # Sorted so the written JSON does not depend on set iteration order.
        return sorted(indicators)

    def _has_international_work(self, career_history: List, location, headline: str) -> bool:
        """Check if the headline, location or any job hints at international work.

        *location* may be a string or a dict with ``country``/``region``/``city``.
        """
        international_keywords = ["international", "global", "worldwide",
                                  "europe", "asia", "america"]

        # Check headline for international keywords first
        if self._matches_any(self._text(headline), international_keywords):
            return True

        # The location only counts when it literally says international/global.
        if self._matches_any(self._location_text(location), ["international", "global"]):
            return True

        for job in self._iter_jobs(career_history):
            job_text = self._text(job.get("description")) + " " + self._text(job.get("company"))
            if self._matches_any(job_text, international_keywords):
                return True

        return False

    def _extract_international_indicators(self, career_history: List, location) -> List[str]:
        """Extract international collaboration indicators as a sorted, de-duplicated list."""
        indicators = set()

        if "international" in self._location_text(location):
            indicators.add("international_role")

        for job in self._iter_jobs(career_history):
            if "international" in self._text(job.get("description")):
                indicators.add("international_projects")
            if "global" in self._text(job.get("company")):
                indicators.add("global_company")

        return sorted(indicators)

    def _has_digital_focus(self, headline: str, about: str, career_history: List) -> bool:
        """Check if the headline or about text has a digital/technology focus.

        *career_history* is part of the signature but is not consulted here.
        """
        digital_keywords = [
            "digital", "technology", "software", "it", "data", "online",
            "web", "tech", "developer", "engineer",
            "digital heritage", "media technology", "film technology"
        ]

        text_to_check = self._text(headline) + " " + self._text(about)
        return self._matches_any(text_to_check, digital_keywords)

    def _extract_digital_indicators(self, headline: str, about: str, career_history: List) -> List[str]:
        """Extract digital/technology indicators as a sorted, de-duplicated list.

        Only the headline and the job titles are inspected; *about* is unused.
        """
        indicators = set()

        headline_text = self._text(headline)
        if "digital" in headline_text:
            indicators.add("digital_role")
        if "technology" in headline_text:
            indicators.add("technology_focus")

        for job in self._iter_jobs(career_history):
            if self._matches_any(self._text(job.get("title")), ["digital", "technology", "it"]):
                indicators.add("tech_position")

        return sorted(indicators)

    def _extract_contact_preferences(self, about: str, headline: str) -> Optional[Dict]:
        """Extract contact preferences mentioned in the about text.

        Returns:
            A dict with one ``True`` flag per detected contact method plus a
            ``source`` entry, or ``None`` when *about* is empty or nothing
            matched. *headline* is not consulted.
        """
        if not about:
            return None

        contact_keywords = {
            "email": ["email", "mail", "contact"],
            "phone": ["phone", "call", "mobile", "whatsapp"],
            "linkedin": ["linkedin", "connect", "network"],
            "professional": ["professional", "business", "work"]
        }

        about_lower = self._text(about)
        preferences = {
            method: True
            for method, keywords in contact_keywords.items()
            if self._matches_any(about_lower, keywords)
        }

        if preferences:
            preferences["source"] = "linkedin_about_section"
            return preferences

        return None

    def _extract_business_patterns(self, career_history: List) -> Optional[Dict]:
        """Extract business communication patterns from job titles and companies.

        Returns a dict of ``True`` flags, or ``None`` when nothing matched.
        """
        patterns = {}

        for job in self._iter_jobs(career_history):
            role = self._text(job.get("title"))
            company = self._text(job.get("company"))

            if self._matches_any(role, ["director", "manager", "lead", "head"]):
                patterns["leadership_role"] = True
            if self._matches_any(company, ["international", "global", "multinational"]):
                patterns["global_business"] = True
            if self._matches_any(role, ["business", "commercial", "enterprise"]):
                patterns["business_focus"] = True

        return patterns if patterns else None

    def _calculate_business_likelihood(self, headline: str, career_history: List,
                                       about: str, location) -> Dict[str, Any]:
        """Calculate likelihood of WhatsApp business account usage.

        Six keyword-based factors add up to at most 100 points
        (20 + 15 + 15 + 20 + 10 + 20); the total is mapped to a likelihood
        label and a fixed confidence value.
        """
        score = 0
        factors = []
        headline_text = self._text(headline)

        # Factor 1: Seniority (20 points max)
        senior_roles = ["director", "manager", "lead", "head", "chief", "president", "vp"]
        if self._matches_any(headline_text, senior_roles):
            score += 20
            factors.append("senior_role")

        # Factor 2: International focus (15 points max)
        if self._has_international_work(career_history, location, headline):
            score += 15
            factors.append("international_focus")

        # Factor 3: Digital/Technology (15 points max)
        if self._has_digital_focus(headline, about, career_history):
            score += 15
            factors.append("digital_technology")

        # Factor 4: Business communication (20 points max)
        if self._has_communication_role(headline, career_history):
            score += 20
            factors.append("business_communication")

        # Factor 5: Company size indicators (10 points max, counted once)
        organization_keywords = ["international", "global", "museum", "university", "institute"]
        if any(self._matches_any(self._text(job.get("company")), organization_keywords)
               for job in self._iter_jobs(career_history)):
            score += 10
            factors.append("large_organization")

        # Factor 6: External facing role (20 points max)
        external_keywords = ["public", "external", "outreach", "relations", "engagement"]
        if self._matches_any(headline_text, external_keywords):
            score += 20
            factors.append("external_facing")

        # Determine likelihood category
        if score >= 70:
            likelihood = "very_high"
            confidence = 0.85
        elif score >= 50:
            likelihood = "high"
            confidence = 0.70
        elif score >= 30:
            likelihood = "medium"
            confidence = 0.55
        else:
            likelihood = "low"
            confidence = 0.40

        return {
            "score": score,
            "max_score": 100,
            "likelihood": likelihood,
            "confidence": confidence,
            "factors": factors,
            "assessment_date": datetime.now(timezone.utc).isoformat()
        }
|
|
|
|
def main(person_dir: str = "/Users/kempersc/apps/glam/data/custodian/person"):
    """Main function to process all person profiles.

    Args:
        person_dir: Directory that contains the ``entity`` sub-directory with
            the profile JSON files. The detailed results file is written into
            this directory as well. Defaults to the original hard-coded path.
    """
    print("=" * 60)
    print("WHATSAPP PROFILE ENRICHMENT FOR HERITAGE PROFESSIONALS")
    print("=" * 60)
    print()
    print("📚 ENRICHMENT PRINCIPLES:")
    print(" ✅ All data derived from REAL LinkedIn profiles")
    print(" ✅ NO fabrication or hallucination allowed")
    print(" ✅ Only public profile information used")
    print(" ✅ WhatsApp insights inferred from professional context")
    print(" ✅ All enrichment is probabilistic analysis")
    print()

    # Initialize enricher
    enricher = WhatsAppEnricher(person_dir)

    # Process all profiles
    results = enricher.process_all_profiles()

    # The summary can be empty when the entity directory was not found, so
    # read every counter with a default instead of indexing directly.
    summary = results.get("summary") or {}

    # Print results summary
    print("\n" + "=" * 60)
    print("ENRICHMENT RESULTS SUMMARY")
    print("=" * 60)
    print(f"📁 Total profile files: {summary.get('total_files', 0)}")
    print(f"✅ Successfully processed: {summary.get('processed', 0)}")
    print(f"🔵 Enriched with WhatsApp data: {summary.get('enriched', 0)}")
    print(f"⏭️ Skipped (no data): {summary.get('skipped', 0)}")
    print(f"❌ Errors: {summary.get('errors', 0)}")
    print()

    # Show enriched examples
    if results["enriched"]:
        print("📋 EXAMPLE ENRICHMENTS:")
        for i, enrichment in enumerate(results["enriched"][:3], 1):
            print(f"\n{i}. {enrichment['profile_name']}")
            print(f" File: {Path(enrichment['file']).name}")
            print(f" Enrichment fields: {', '.join(enrichment['enrichment_fields'])}")

    # Show skipped reasons
    if results["skipped"]:
        print("\n⏭️ SKIPPED FILES REASONS:")
        skip_reasons = {}
        for skip in results["skipped"]:
            reason = skip.get("reason", "unknown")
            skip_reasons[reason] = skip_reasons.get(reason, 0) + 1

        for reason, count in skip_reasons.items():
            print(f" {reason}: {count}")

    # Show errors
    if results["errors"]:
        print("\n❌ ERRORS:")
        for error in results["errors"]:
            print(f" {Path(error['file']).name}: {error['error']}")

    # Save detailed results; the timestamp in the file name is local time.
    results_dir = Path(person_dir)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_file = results_dir / f"whatsapp_enrichment_results_{timestamp}.json"
    if results_dir.is_dir():
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\n📄 Detailed results saved to: {results_file}")
    else:
        # Nothing to write into; report it rather than fail on open().
        print(f"\n📄 Detailed results not saved, directory not found: {results_dir}")

    print()
    print("=" * 60)
    print("ENRICHMENT COMPLETE")
    print("All WhatsApp insights derived from real professional profiles")
    print("No synthetic or fabricated data was created")
    print("=" * 60)


if __name__ == "__main__":
    main()