792 lines
28 KiB
Python
792 lines
28 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Profile Risk Scoring Script for Person Data Quality.
|
|
|
|
This script analyzes all person profiles and generates a comprehensive risk report
|
|
combining:
|
|
1. Name Commonality Risk (from name_commonality.py)
|
|
2. Claim Source Quality (domain-based risk)
|
|
3. Entity Resolution Confidence
|
|
|
|
OUTPUT: A prioritized list of profiles that need manual review.
|
|
|
|
DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.
|
|
|
|
Usage:
|
|
python scripts/generate_profile_risk_report.py --analyze
|
|
python scripts/generate_profile_risk_report.py --analyze --limit 1000
|
|
python scripts/generate_profile_risk_report.py --top-risks 100
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, field, asdict
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set, Tuple
|
|
from urllib.parse import unquote
|
|
import re
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.entity_resolution.name_commonality import (
|
|
NameCommonalityScorer,
|
|
RiskLevel,
|
|
score_name,
|
|
load_surname_data,
|
|
)
|
|
|
|
|
|
# =============================================================================
|
|
# DOMAIN RISK CLASSIFICATION
|
|
# =============================================================================
|
|
|
|
class DomainRisk(Enum):
    """Risk level for claim source domains.

    Used by classify_domain_risk() to grade how trustworthy a claim's
    source website is for entity-resolution purposes.
    """
    HIGH = "high"      # Auto-remove candidates
    MEDIUM = "medium"  # Manual verification required
    LOW = "low"        # Acceptable with caution
    SAFE = "safe"      # Trusted institutional sources
|
|
|
|
|
|
# HIGH RISK: Entity aggregators, social media, entertainment databases
|
|
# HIGH RISK: Entity aggregators, social media, entertainment databases.
# Entries are full hostnames as they appear in claim source URLs
# (compared against the host extracted by get_domain_from_url()).
HIGH_RISK_DOMAINS: Set[str] = {
    # People aggregators (frequent entity resolution failures)
    "rocketreach.co",
    "www.zoominfo.com",
    "www.idcrawl.com",
    "www.peekyou.com",
    "www.spokeo.com",
    "www.whitepages.com",
    "www.beenverified.com",
    "www.truepeoplesearch.com",
    "www.fastpeoplesearch.com",
    "www.ussearch.com",

    # Entertainment databases (actor/director collisions)
    "www.imdb.com",
    "imdb.com",
    "www.themoviedb.org",

    # E-commerce (wrong author/reviewer attribution)
    "www.amazon.com",
    "www.amazon.co.uk",
    "www.amazon.de",
    "www.goodreads.com",

    # Social media (high collision risk)
    "www.instagram.com",
    "instagram.com",
    "www.tiktok.com",
    "tiktok.com",
    "linktr.ee",

    # Sports databases
    "worldathletics.org",
    "www.eliteprospects.com",

    # Genealogy sites (frequent namesake confusion)
    "www.geni.com",
    "geni.com",
    "www.ancestry.com",
    "ancestry.com",
    "www.myheritage.com",
    "myheritage.com",
    "www.familysearch.org",
    "familysearch.org",
    "www.findagrave.com",
    "findagrave.com",
}
|
|
|
|
# MEDIUM RISK: Sites that need verification but aren't auto-remove
|
|
# MEDIUM RISK: Sites that need verification but aren't auto-remove.
# Entries are full hostnames as they appear in claim source URLs.
MEDIUM_RISK_DOMAINS: Set[str] = {
    "twitter.com",
    "x.com",
    "www.facebook.com",
    "facebook.com",
    "www.youtube.com",
    "youtube.com",
    "medium.com",
    "www.researchgate.net",  # Different than ORCID - some collisions
    "www.academia.edu",
}
|
|
|
|
# SAFE DOMAINS: Trusted institutional sources
|
|
# SAFE DOMAINS: Trusted institutional sources.
# Entries are full hostnames as they appear in claim source URLs.
SAFE_DOMAINS: Set[str] = {
    # Dutch heritage institutions
    "www.rijksmuseum.nl",
    "www.nationaalarchief.nl",
    "www.kb.nl",
    "www.niod.nl",
    "www.eyefilm.nl",
    "www.amsterdammuseum.nl",
    "www.geldersarchief.nl",
    "www.codart.nl",
    "www.kunsthistorici.nl",

    # Dutch universities
    "www.universiteitleiden.nl",
    "www.uva.nl",
    "www.uu.nl",
    "www.rug.nl",
    "www.tue.nl",
    "www.tudelft.nl",
    "www.ru.nl",
    "www.maastrichtuniversity.nl",
    "www.vu.nl",
    "www.wur.nl",
    "pure.knaw.nl",

    # International academic identifiers
    "orcid.org",

    # Wikipedia (verified with article matching)
    "en.wikipedia.org",
    "nl.wikipedia.org",
    "de.wikipedia.org",
    "fr.wikipedia.org",
}
|
|
|
|
|
|
def get_domain_from_url(url: str) -> Optional[str]:
    """Return the lowercased host portion of an http(s) URL, or None.

    The host is everything between the 'http(s)://' scheme separator and
    the next '/'. URLs without an http(s) scheme yield None.
    """
    if not url:
        return None
    found = re.search(r'https?://([^/]+)', url.lower())
    return found.group(1) if found else None
|
|
|
|
|
|
def classify_domain_risk(domain: str) -> DomainRisk:
    """Classify a domain's risk level.

    Membership lookup is tolerant of a leading 'www.': both the bare and
    the 'www.'-prefixed spelling of the domain are checked against each
    list, so e.g. 'zoominfo.com' matches the listed 'www.zoominfo.com'.
    Tiers are checked strictly in order HIGH -> MEDIUM -> SAFE, then
    heuristics; unknown domains default to MEDIUM (verify before trusting).
    """
    if not domain:
        return DomainRisk.MEDIUM

    domain_lower = domain.lower()

    # Build the spelling variants once so each tier sees the same set.
    bare = domain_lower[4:] if domain_lower.startswith('www.') else domain_lower
    variants = {domain_lower, bare, 'www.' + bare}

    # Check high risk first: a single high-risk source taints the claim.
    if variants & HIGH_RISK_DOMAINS:
        return DomainRisk.HIGH

    # Check medium risk
    if variants & MEDIUM_RISK_DOMAINS:
        return DomainRisk.MEDIUM

    # Check safe
    if variants & SAFE_DOMAINS:
        return DomainRisk.SAFE

    # Heuristic: institutional domains (universities, museums, archives,
    # libraries, government) are low risk even when not explicitly listed.
    if any(pattern in domain_lower for pattern in [
        '.edu', '.ac.uk', '.uni-', 'university', 'museum', 'archief',
        'bibliotheek', 'library', 'archive', 'gov.', '.gouv.'
    ]):
        return DomainRisk.LOW

    # LinkedIn requires special handling: slug match is verified separately,
    # so the domain itself counts as low risk here.
    if 'linkedin.com' in domain_lower:
        return DomainRisk.LOW

    return DomainRisk.MEDIUM
|
|
|
|
|
|
# =============================================================================
|
|
# PROFILE RISK SCORING
|
|
# =============================================================================
|
|
|
|
@dataclass
class ClaimRiskAnalysis:
    """Risk analysis for claims in a profile.

    Aggregated by analyze_claims(). LOW-risk claims are counted in
    total_claims only; there is no dedicated counter for them.
    """
    # Claim counters, bucketed by the DomainRisk of each claim's source.
    total_claims: int = 0
    high_risk_claims: int = 0
    medium_risk_claims: int = 0
    safe_claims: int = 0
    # Deduplicated source domains that produced high/medium risk claims.
    high_risk_domains: List[str] = field(default_factory=list)
    medium_risk_domains: List[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class ProfileRiskScore:
    """Complete risk assessment for a person profile.

    Built by analyze_profile(); ranked and serialized by generate_report().
    """
    # Identity of the profile under assessment
    file_path: str
    file_name: str
    person_name: str
    linkedin_slug: Optional[str]

    # Name risk (from NameCommonalityScorer)
    name_risk_level: str  # RiskLevel.value
    name_risk_score: float
    # Count returned by scorer.get_required_verification_attributes();
    # presumably the number of attributes that must match before trusting
    # a claim — confirm against the scorer's documentation.
    name_required_attributes: int

    # Claim risk
    claim_analysis: ClaimRiskAnalysis

    # Combined risk
    combined_risk_score: float  # 0.0 (safe) to 1.0 (highest risk)
    priority_rank: int = 0  # Set after sorting (1 = highest risk)

    # Flags
    has_high_risk_claims: bool = False
    has_linkedin_mismatch: bool = False
    needs_manual_review: bool = False
    review_reasons: List[str] = field(default_factory=list)
|
|
|
|
|
|
def extract_country_from_ppid(filename: str) -> str:
    """
    Derive the two-letter ISO country code from a PPID filename.

    PPID format: ID_{country}-{region}-{city}_{birth_decade}_{...}.json
    Examples:
        ID_NL-NH-AMS_198X_NL-NH-AMS_XXXX_JAN-DE-VRIES.json -> NL
        ID_BE-VL-ANT_199X_BE-VL-ANT_XXXX_FREDERIK-VANMEERT.json -> BE
        ID_US-CA-LAX_197X_US-CA-LAX_XXXX_JOHN-SMITH.json -> US

    Returns: 2-letter ISO country code, or 'NL' (Netherlands) whenever
    the filename does not carry a valid two-letter code.
    """
    if not filename.startswith('ID_'):
        return 'NL'

    # Strip the 'ID_' prefix and '.json' suffix, then take the first
    # underscore-separated component (country-region-city).
    location_part = filename[3:].replace('.json', '').split('_')[0]

    # The country is the first hyphen-separated token: 'NL-NH-AMS' -> 'NL'.
    candidate = location_part.split('-')[0].upper()
    if len(candidate) == 2 and candidate.isalpha():
        return candidate

    return 'NL'
|
|
|
|
|
|
def extract_person_name(profile: dict) -> str:
    """Return the person's display name from a profile dict.

    Checks, in order: profile_data.full_name, profile_data.name,
    profile_data.first_name + last_name, then top-level full_name and
    name. Empty-string values are skipped. Returns "Unknown" when no
    name is present (callers may then fall back to the filename).
    """
    if 'profile_data' in profile:
        pd = profile['profile_data']
        for key in ('full_name', 'name'):
            if pd.get(key):
                return pd[key]
        first, last = pd.get('first_name'), pd.get('last_name')
        if first and last:
            return f"{first} {last}"

    for key in ('full_name', 'name'):
        if profile.get(key):
            return profile[key]

    return "Unknown"
|
|
|
|
|
|
def extract_linkedin_slug(profile: dict) -> Optional[str]:
    """Return the LinkedIn slug recorded for this profile, if any.

    Checked in order: extraction metadata ('linkedin_slug', then
    'source_linkedin_slug'), a top-level 'linkedin_slug' field, and
    finally the slug embedded in a top-level 'linkedin_url'. Returns
    None when nothing is found.
    """
    if 'extraction_metadata' in profile:
        metadata = profile['extraction_metadata']
        for key in ('linkedin_slug', 'source_linkedin_slug'):
            if metadata.get(key):
                return metadata[key]

    if profile.get('linkedin_slug'):
        return profile['linkedin_slug']

    # Last resort: pull the slug out of a stored profile URL.
    url = profile.get('linkedin_url') or ''
    if 'linkedin.com/in/' in url:
        found = re.search(r'linkedin\.com/in/([^/\?]+)', url)
        if found:
            return found.group(1)

    return None
|
|
|
|
|
|
def normalize_linkedin_slug(slug: str) -> str:
    """
    Canonicalize a LinkedIn slug for comparison.

    URL-decodes percent-escaped diacritics, then lowercases and strips
    surrounding whitespace. Handles encodings such as:
    - %c3%a9 → é (e-acute)
    - %c3%ab → ë (e-diaeresis)
    - %c3%b6 → ö (o-umlaut)

    This prevents false positive "mismatches" between:
    - 'amélie-de-jong-60b507278' and 'am%c3%a9lie-de-jong-60b507278'
    """
    if not slug:
        return ""
    return unquote(slug).lower().strip()
|
|
|
|
|
|
def analyze_claims(profile: dict, profile_linkedin_slug: Optional[str]) -> Tuple[ClaimRiskAnalysis, bool, List[str]]:
    """
    Analyze web claims in a profile for risk.

    Claims are gathered from 'web_claims' and 'web_enrichment.claims'.
    Each claim's source domain is graded via classify_domain_risk(), and
    both the source URL and the claim value are checked for LinkedIn
    slugs that differ from the profile's own slug (after URL-decoding).

    Returns: (ClaimRiskAnalysis, has_linkedin_mismatch, review_reasons)
    """
    analysis = ClaimRiskAnalysis()
    has_linkedin_mismatch = False
    review_reasons: List[str] = []
    seen_high_risk_domains: Set[str] = set()
    seen_medium_risk_domains: Set[str] = set()

    # Normalize the profile's LinkedIn slug once for all comparisons.
    normalized_profile_slug = normalize_linkedin_slug(profile_linkedin_slug) if profile_linkedin_slug else None

    def _linkedin_slug_in(text: str) -> Optional[str]:
        # Extract and normalize a LinkedIn /in/ slug embedded in `text`
        # (shared by the source-URL and claim-value mismatch checks).
        m = re.search(r'linkedin\.com/in/([^/\?]+)', text)
        return normalize_linkedin_slug(m.group(1)) if m else None

    # Get claims from various locations.
    claims: List[dict] = []
    if 'web_claims' in profile:
        claims.extend(profile['web_claims'])
    if 'web_enrichment' in profile and 'claims' in profile['web_enrichment']:
        claims.extend(profile['web_enrichment']['claims'])

    analysis.total_claims = len(claims)

    for claim in claims:
        source_url = claim.get('source_url', '')
        domain = get_domain_from_url(source_url)

        # Claims without a parseable source URL cannot be graded.
        if not domain:
            continue

        risk = classify_domain_risk(domain)

        if risk == DomainRisk.HIGH:
            analysis.high_risk_claims += 1
            seen_high_risk_domains.add(domain)
        elif risk == DomainRisk.MEDIUM:
            analysis.medium_risk_claims += 1
            seen_medium_risk_domains.add(domain)
        elif risk == DomainRisk.SAFE:
            analysis.safe_claims += 1
        # DomainRisk.LOW claims count toward total_claims only.

        # Check for LinkedIn slug mismatch in the source URL.
        if 'linkedin.com/in/' in source_url and normalized_profile_slug:
            url_slug = _linkedin_slug_in(source_url)
            if url_slug is not None and url_slug != normalized_profile_slug:
                has_linkedin_mismatch = True
                review_reasons.append(f"LinkedIn mismatch: claim from '{url_slug}' but profile is '{profile_linkedin_slug}'")

        # Check the claim value for LinkedIn URLs pointing to other profiles.
        claim_value = str(claim.get('claim_value', ''))
        if 'linkedin.com/in/' in claim_value and normalized_profile_slug:
            value_slug = _linkedin_slug_in(claim_value)
            if value_slug is not None and value_slug != normalized_profile_slug:
                has_linkedin_mismatch = True
                review_reasons.append(f"Claim value contains different LinkedIn profile: '{value_slug}'")

    analysis.high_risk_domains = list(seen_high_risk_domains)
    analysis.medium_risk_domains = list(seen_medium_risk_domains)

    if analysis.high_risk_claims > 0:
        review_reasons.append(f"{analysis.high_risk_claims} claims from HIGH RISK domains: {', '.join(analysis.high_risk_domains)}")

    return analysis, has_linkedin_mismatch, review_reasons
|
|
|
|
|
|
def calculate_combined_risk(
    name_risk_score: float,
    claim_analysis: ClaimRiskAnalysis,
    has_linkedin_mismatch: bool
) -> float:
    """
    Combine the individual risk signals into one score in [0.0, 1.0].

    Weights:
    - Name commonality: 40%
    - Claim source quality: 40%
    - LinkedIn mismatch: 20%
    """
    # Start with the name-commonality component.
    score = name_risk_score * 0.4

    # Add the claim-source component; profiles without claims contribute 0.
    total = claim_analysis.total_claims
    if total != 0:
        # High-risk claims carry full weight, medium-risk claims half.
        high_ratio = claim_analysis.high_risk_claims / total
        medium_ratio = claim_analysis.medium_risk_claims / total
        score += (high_ratio * 1.0 + medium_ratio * 0.5) * 0.4

    # A LinkedIn mismatch is a strong signal of an entity-resolution error.
    if has_linkedin_mismatch:
        score += 0.2

    return min(1.0, score)
|
|
|
|
|
|
def analyze_profile(file_path: Path, scorers: Dict[str, NameCommonalityScorer], default_scorer: NameCommonalityScorer) -> Optional[ProfileRiskScore]:
    """
    Analyze a single profile for risk.

    Args:
        file_path: Path to profile JSON file
        scorers: Dictionary mapping country codes to NameCommonalityScorer instances
        default_scorer: Fallback scorer for unknown countries

    Returns:
        A populated ProfileRiskScore, or None when the file cannot be
        read or parsed (the caller counts those as errors).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            profile = json.load(f)
    except (json.JSONDecodeError, IOError):
        return None

    # Country comes from the PPID filename (e.g. ID_NL-NH-AMS_... -> NL)
    # and selects the country-appropriate name-commonality scorer.
    country = extract_country_from_ppid(file_path.name)
    scorer = scorers.get(country, default_scorer)

    # Extract name; fall back to the name encoded in the PPID filename:
    # ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_NAME-PARTS.json
    person_name = extract_person_name(profile)
    if person_name == "Unknown":
        fname = file_path.stem
        if fname.startswith('ID_'):
            parts = fname.split('_')
            if len(parts) >= 5:
                name_part = parts[-1]  # Last part is the hyphenated name
                person_name = name_part.replace('-', ' ').title()

    # Extract LinkedIn slug
    linkedin_slug = extract_linkedin_slug(profile)

    # Score name using country-appropriate scorer
    name_result = scorer.score_name(full_name=person_name)
    required_attrs = scorer.get_required_verification_attributes(name_result.risk_level)

    # Analyze claims
    claim_analysis, has_linkedin_mismatch, review_reasons = analyze_claims(profile, linkedin_slug)

    # Calculate combined risk
    combined_risk = calculate_combined_risk(
        name_result.combined_score,
        claim_analysis,
        has_linkedin_mismatch
    )

    # A common name together with any web claims is review-worthy even
    # below the combined-score threshold: claim misattribution is most
    # likely exactly there. (Computed once; used for both the flag and
    # the review reason below.)
    common_name_with_claims = (
        name_result.risk_level in (RiskLevel.VERY_HIGH, RiskLevel.HIGH)
        and claim_analysis.total_claims > 0
    )

    needs_review = (
        combined_risk >= 0.5 or
        claim_analysis.high_risk_claims > 0 or
        has_linkedin_mismatch or
        common_name_with_claims
    )

    if common_name_with_claims:
        review_reasons.append(f"Common name '{person_name}' with {claim_analysis.total_claims} web claims - verify all claims")

    return ProfileRiskScore(
        file_path=str(file_path),
        file_name=file_path.name,
        person_name=person_name,
        linkedin_slug=linkedin_slug,
        name_risk_level=name_result.risk_level.value,
        name_risk_score=name_result.combined_score,
        name_required_attributes=required_attrs,
        claim_analysis=claim_analysis,
        combined_risk_score=combined_risk,
        has_high_risk_claims=claim_analysis.high_risk_claims > 0,
        has_linkedin_mismatch=has_linkedin_mismatch,
        needs_manual_review=needs_review,
        review_reasons=review_reasons,
    )
|
|
|
|
|
|
# =============================================================================
|
|
# REPORT GENERATION
|
|
# =============================================================================
|
|
|
|
def generate_report(
    results: List[ProfileRiskScore],
    output_dir: Path,
    top_n: int = 100
) -> Path:
    """Generate risk assessment report.

    Writes a human-readable markdown report (_PROFILE_RISK_REPORT.md) and
    a machine-readable JSON dump (_profile_risk_scores.json) into
    output_dir. Sorts `results` in place by combined risk (highest first)
    and assigns priority_rank.

    Args:
        results: Per-profile risk scores (mutated: sorted and ranked).
        output_dir: Directory that receives both output files.
        top_n: Number of highest-risk profiles to list in detail.

    Returns:
        Path to the markdown report.
    """
    report_path = output_dir / '_PROFILE_RISK_REPORT.md'

    # Guard: with no results every percentage below would divide by zero.
    if not results:
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# Person Profile Risk Assessment Report\n\n")
            f.write(f"**Generated**: {datetime.now().isoformat()}\n\n")
            f.write("No profiles were analyzed.\n")
        return report_path

    # Sort by combined risk score (highest first)
    results.sort(key=lambda x: x.combined_risk_score, reverse=True)

    # Assign priority ranks
    for i, result in enumerate(results):
        result.priority_rank = i + 1

    # Statistics
    total = len(results)
    needs_review = sum(1 for r in results if r.needs_manual_review)
    high_risk_names = sum(1 for r in results if r.name_risk_level in ('very_high', 'high'))
    has_high_risk_claims = sum(1 for r in results if r.has_high_risk_claims)
    has_linkedin_mismatch = sum(1 for r in results if r.has_linkedin_mismatch)

    # Generate markdown report
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Person Profile Risk Assessment Report\n\n")
        f.write(f"**Generated**: {datetime.now().isoformat()}\n\n")

        f.write("## Summary Statistics\n\n")
        f.write(f"| Metric | Count | Percentage |\n")
        f.write(f"|--------|-------|------------|\n")
        f.write(f"| Total Profiles Analyzed | {total:,} | 100% |\n")
        f.write(f"| **Needs Manual Review** | **{needs_review:,}** | **{needs_review/total*100:.1f}%** |\n")
        f.write(f"| High/Very High Risk Names | {high_risk_names:,} | {high_risk_names/total*100:.1f}% |\n")
        f.write(f"| Has HIGH RISK Domain Claims | {has_high_risk_claims:,} | {has_high_risk_claims/total*100:.1f}% |\n")
        f.write(f"| Has LinkedIn Mismatch | {has_linkedin_mismatch:,} | {has_linkedin_mismatch/total*100:.1f}% |\n")
        f.write("\n")

        f.write("## Risk Level Distribution\n\n")
        risk_counts = defaultdict(int)
        for r in results:
            if r.combined_risk_score >= 0.7:
                risk_counts['Critical (0.7+)'] += 1
            elif r.combined_risk_score >= 0.5:
                risk_counts['High (0.5-0.7)'] += 1
            elif r.combined_risk_score >= 0.3:
                risk_counts['Medium (0.3-0.5)'] += 1
            else:
                risk_counts['Low (<0.3)'] += 1

        f.write(f"| Risk Level | Count | Percentage |\n")
        f.write(f"|------------|-------|------------|\n")
        for level in ['Critical (0.7+)', 'High (0.5-0.7)', 'Medium (0.3-0.5)', 'Low (<0.3)']:
            count = risk_counts[level]
            f.write(f"| {level} | {count:,} | {count/total*100:.1f}% |\n")
        f.write("\n")

        f.write(f"## Top {top_n} Highest Risk Profiles\n\n")
        f.write("These profiles need immediate manual review.\n\n")

        f.write("| Rank | Name | Risk Score | Name Risk | Claims | Issues |\n")
        f.write("|------|------|------------|-----------|--------|--------|\n")

        for result in results[:top_n]:
            issues = []
            if result.has_high_risk_claims:
                issues.append(f"{result.claim_analysis.high_risk_claims} HIGH RISK")
            if result.has_linkedin_mismatch:
                issues.append("LinkedIn mismatch")
            if result.name_risk_level in ('very_high', 'high'):
                issues.append(f"Common name ({result.name_risk_level})")

            issues_str = "; ".join(issues) if issues else "-"

            f.write(f"| {result.priority_rank} | {result.person_name[:30]} | {result.combined_risk_score:.2f} | "
                    f"{result.name_risk_level} | {result.claim_analysis.total_claims} | {issues_str} |\n")

        f.write("\n")

        # LinkedIn mismatch section
        mismatch_profiles = [r for r in results if r.has_linkedin_mismatch]
        if mismatch_profiles:
            f.write(f"## Profiles with LinkedIn Mismatches ({len(mismatch_profiles)})\n\n")
            f.write("These profiles have claims from different LinkedIn profiles - likely entity resolution errors.\n\n")

            for result in mismatch_profiles[:50]:  # Top 50
                f.write(f"### {result.person_name}\n")
                f.write(f"- **File**: `{result.file_name}`\n")
                f.write(f"- **Profile LinkedIn**: `{result.linkedin_slug or 'None'}`\n")
                for reason in result.review_reasons:
                    if 'LinkedIn' in reason:
                        f.write(f"- **Issue**: {reason}\n")
                f.write("\n")

        # High risk domain claims section
        high_risk_profiles = [r for r in results if r.has_high_risk_claims]
        if high_risk_profiles:
            f.write(f"## Profiles with HIGH RISK Domain Claims ({len(high_risk_profiles)})\n\n")
            f.write("These profiles have claims from domains known for entity resolution failures.\n\n")

            domain_counts = defaultdict(int)
            for result in high_risk_profiles:
                for domain in result.claim_analysis.high_risk_domains:
                    domain_counts[domain] += 1

            f.write("### High Risk Domains Found\n\n")
            f.write("| Domain | Profiles Affected |\n")
            f.write("|--------|-------------------|\n")
            for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True):
                f.write(f"| {domain} | {count} |\n")
            f.write("\n")

        f.write("---\n\n")
        f.write("**DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.**\n")

    # Also save JSON for programmatic access
    json_path = output_dir / '_profile_risk_scores.json'

    # Convert to serializable format (hand-rolled to keep the JSON schema
    # stable; note claim_analysis.medium_risk_domains is intentionally not
    # exported here).
    serializable_results = []
    for r in results:
        item = {
            'file_path': r.file_path,
            'file_name': r.file_name,
            'person_name': r.person_name,
            'linkedin_slug': r.linkedin_slug,
            'name_risk_level': r.name_risk_level,
            'name_risk_score': r.name_risk_score,
            'name_required_attributes': r.name_required_attributes,
            'combined_risk_score': r.combined_risk_score,
            'priority_rank': r.priority_rank,
            'has_high_risk_claims': r.has_high_risk_claims,
            'has_linkedin_mismatch': r.has_linkedin_mismatch,
            'needs_manual_review': r.needs_manual_review,
            'review_reasons': r.review_reasons,
            'claim_analysis': {
                'total_claims': r.claim_analysis.total_claims,
                'high_risk_claims': r.claim_analysis.high_risk_claims,
                'medium_risk_claims': r.claim_analysis.medium_risk_claims,
                'safe_claims': r.claim_analysis.safe_claims,
                'high_risk_domains': r.claim_analysis.high_risk_domains,
            }
        }
        serializable_results.append(item)

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated': datetime.now().isoformat(),
            'total_profiles': total,
            'needs_review': needs_review,
            'profiles': serializable_results,
        }, f, indent=2)

    return report_path
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point: parse args, score all profiles, write the report."""
    parser = argparse.ArgumentParser(
        description='Generate profile risk assessment report'
    )
    parser.add_argument('--analyze', action='store_true',
                        help='Analyze all profiles')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of profiles to analyze')
    parser.add_argument('--top-risks', type=int, default=100,
                        help='Number of top risks to show in report')
    parser.add_argument('--data-dir', type=str,
                        default='data/person',
                        help='Person data directory')
    parser.add_argument('--country', type=str, default='NL',
                        help='Default country for name scoring')

    args = parser.parse_args()

    if not args.analyze:
        parser.print_help()
        return

    # Setup
    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        print(f"Error: Data directory not found: {data_dir}")
        sys.exit(1)

    # Create country-specific scorers for accurate name risk assessment.
    # Supported countries: NL (Netherlands), BE (Belgium), US (United States).
    print("Loading name commonality databases...")
    scorers: Dict[str, NameCommonalityScorer] = {}
    for country_code in ('NL', 'BE', 'US'):
        scorers[country_code] = NameCommonalityScorer(default_country=country_code)
        surname_data, _, _ = load_surname_data(country_code)
        print(f"  Loaded {country_code}: {len(surname_data)} surnames")

    # Default scorer for unknown countries (fallback to NL)
    default_scorer = scorers.get(args.country, scorers['NL'])

    # Find all profile files
    profile_files = list(data_dir.glob('ID_*.json'))
    print(f"Found {len(profile_files):,} profile files")

    if args.limit:
        profile_files = profile_files[:args.limit]
        print(f"Limiting to {args.limit} profiles")

    # Count profiles by country for reporting
    country_counts: Dict[str, int] = defaultdict(int)
    for pf in profile_files:
        country_counts[extract_country_from_ppid(pf.name)] += 1

    print("Profiles by country:")
    for country, count in sorted(country_counts.items(), key=lambda x: -x[1])[:10]:
        scorer_status = "✓" if country in scorers else "→ default"
        print(f"  {country}: {count:,} {scorer_status}")

    # Analyze profiles
    results = []
    errors = 0

    print("Analyzing profiles...")
    for i, file_path in enumerate(profile_files):
        if (i + 1) % 5000 == 0:
            print(f"  Processed {i + 1:,} / {len(profile_files):,}")

        result = analyze_profile(file_path, scorers, default_scorer)
        if result:
            results.append(result)
        else:
            errors += 1

    print(f"Analyzed {len(results):,} profiles ({errors} errors)")

    # Guard: nothing analyzed -> nothing to report. Also avoids a
    # ZeroDivisionError in the percentage math below.
    if not results:
        print("No profiles could be analyzed - nothing to report.")
        return

    # Generate report
    print("Generating report...")
    report_path = generate_report(results, data_dir, args.top_risks)

    print(f"\nReport generated: {report_path}")
    print(f"JSON data: {data_dir / '_profile_risk_scores.json'}")

    # Summary
    needs_review = sum(1 for r in results if r.needs_manual_review)
    print(f"\n{'='*60}")
    print(f"SUMMARY")
    print(f"{'='*60}")
    print(f"Total profiles: {len(results):,}")
    print(f"Needs manual review: {needs_review:,} ({needs_review/len(results)*100:.1f}%)")

    # Top 10 highest risk
    results.sort(key=lambda x: x.combined_risk_score, reverse=True)
    print(f"\nTop 10 Highest Risk Profiles:")
    for r in results[:10]:
        print(f"  {r.combined_risk_score:.2f} | {r.person_name[:30]} | {r.name_risk_level} | "
              f"{r.claim_analysis.high_risk_claims} HIGH RISK claims")
|
|
|
|
|
|
# Script entry point: only run the analysis when executed directly,
# not when imported.
if __name__ == '__main__':
    main()
|