#!/usr/bin/env python3
"""
Profile Risk Scoring Script for Person Data Quality.

This script analyzes all person profiles and generates a comprehensive risk
report combining:

1. Name Commonality Risk (from name_commonality.py)
2. Claim Source Quality (domain-based risk)
3. Entity Resolution Confidence

OUTPUT: A prioritized list of profiles that need manual review.

DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.

Usage:
    python scripts/generate_profile_risk_report.py --analyze
    python scripts/generate_profile_risk_report.py --analyze --limit 1000
    python scripts/generate_profile_risk_report.py --analyze --top-risks 100
"""
import argparse
import json
import re
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import List, Optional, Set, Tuple
from urllib.parse import unquote

# Add src to path so glam_extractor can be imported when run as a script
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.entity_resolution.name_commonality import (
    NameCommonalityScorer,
    RiskLevel,
)

# =============================================================================
# DOMAIN RISK CLASSIFICATION
# =============================================================================
class DomainRisk(Enum):
    """Risk level for claim source domains."""

    HIGH = "high"      # Auto-remove candidates
    MEDIUM = "medium"  # Manual verification required
    LOW = "low"        # Acceptable with caution
    SAFE = "safe"      # Trusted institutional sources

# HIGH RISK: Entity aggregators, social media, entertainment databases
HIGH_RISK_DOMAINS: Set[str] = {
    # People aggregators (frequent entity resolution failures)
    "rocketreach.co",
    "www.zoominfo.com",
    "www.idcrawl.com",
    "www.peekyou.com",
    "www.spokeo.com",
    "www.whitepages.com",
    "www.beenverified.com",
    "www.truepeoplesearch.com",
    "www.fastpeoplesearch.com",
    "www.ussearch.com",
    # Entertainment databases (actor/director collisions)
    "www.imdb.com",
    "imdb.com",
    "www.themoviedb.org",
    # E-commerce (wrong author/reviewer attribution)
    "www.amazon.com",
    "www.amazon.co.uk",
    "www.amazon.de",
    "www.goodreads.com",
    # Social media (high collision risk)
    "www.instagram.com",
    "instagram.com",
    "www.tiktok.com",
    "tiktok.com",
    "linktr.ee",
    # Sports databases
    "worldathletics.org",
    "www.eliteprospects.com",
    # Genealogy sites (frequent namesake confusion)
    "www.geni.com",
    "geni.com",
    "www.ancestry.com",
    "ancestry.com",
    "www.myheritage.com",
    "myheritage.com",
    "www.familysearch.org",
    "familysearch.org",
    "www.findagrave.com",
    "findagrave.com",
}

# MEDIUM RISK: Sites that need verification but aren't auto-remove
MEDIUM_RISK_DOMAINS: Set[str] = {
    "twitter.com",
    "x.com",
    "www.facebook.com",
    "facebook.com",
    "www.youtube.com",
    "youtube.com",
    "medium.com",
    "www.researchgate.net",  # Different from ORCID - some collisions
    "www.academia.edu",
}

# SAFE DOMAINS: Trusted institutional sources
SAFE_DOMAINS: Set[str] = {
    # Dutch heritage institutions
    "www.rijksmuseum.nl",
    "www.nationaalarchief.nl",
    "www.kb.nl",
    "www.niod.nl",
    "www.eyefilm.nl",
    "www.amsterdammuseum.nl",
    "www.geldersarchief.nl",
    "www.codart.nl",
    "www.kunsthistorici.nl",
    # Dutch universities
    "www.universiteitleiden.nl",
    "www.uva.nl",
    "www.uu.nl",
    "www.rug.nl",
    "www.tue.nl",
    "www.tudelft.nl",
    "www.ru.nl",
    "www.maastrichtuniversity.nl",
    "www.vu.nl",
    "www.wur.nl",
    "pure.knaw.nl",
    # International academic identifiers
    "orcid.org",
    # Wikipedia (verified with article matching)
    "en.wikipedia.org",
    "nl.wikipedia.org",
    "de.wikipedia.org",
    "fr.wikipedia.org",
}

def get_domain_from_url(url: str) -> Optional[str]:
    """Extract the domain (host) from a URL, or None if not found."""
    if not url:
        return None
    # Simple regex to extract the host part of an http(s) URL
    match = re.search(r'https?://([^/]+)', url.lower())
    if match:
        return match.group(1)
    return None
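
# Illustrative behaviour of the regex above (the example URLs are made up,
# not taken from real profile data):
#     get_domain_from_url("https://www.imdb.com/name/nm0000001/") -> "www.imdb.com"
#     get_domain_from_url("mailto:someone@example.org")           -> None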

def classify_domain_risk(domain: str) -> DomainRisk:
    """Classify a domain's risk level."""
    if not domain:
        return DomainRisk.MEDIUM
    domain_lower = domain.lower()
    # Check high risk
    if domain_lower in HIGH_RISK_DOMAINS:
        return DomainRisk.HIGH
    # Check medium risk
    if domain_lower in MEDIUM_RISK_DOMAINS:
        return DomainRisk.MEDIUM
    # Check safe
    if domain_lower in SAFE_DOMAINS:
        return DomainRisk.SAFE
    # Check for institutional patterns
    if any(pattern in domain_lower for pattern in [
        '.edu', '.ac.uk', '.uni-', 'university', 'museum', 'archief',
        'bibliotheek', 'library', 'archive', 'gov.', '.gouv.'
    ]):
        return DomainRisk.LOW
    # LinkedIn requires special handling
    if 'linkedin.com' in domain_lower:
        return DomainRisk.LOW  # Low if slug matches, but we check separately
    return DomainRisk.MEDIUM
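
# Illustrative classifications against the sets and patterns above
# (the .ac.uk host is a made-up example):
#     classify_domain_risk("www.imdb.com")      -> DomainRisk.HIGH
#     classify_domain_risk("orcid.org")         -> DomainRisk.SAFE
#     classify_domain_risk("history.ox.ac.uk")  -> DomainRisk.LOW    (institutional pattern)
#     classify_domain_risk("example.com")       -> DomainRisk.MEDIUM (unknown default)
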
# =============================================================================
# PROFILE RISK SCORING
# =============================================================================
@dataclass
class ClaimRiskAnalysis:
    """Risk analysis for claims in a profile."""

    total_claims: int = 0
    high_risk_claims: int = 0
    medium_risk_claims: int = 0
    safe_claims: int = 0
    high_risk_domains: List[str] = field(default_factory=list)
    medium_risk_domains: List[str] = field(default_factory=list)


@dataclass
class ProfileRiskScore:
    """Complete risk assessment for a person profile."""

    file_path: str
    file_name: str
    person_name: str
    linkedin_slug: Optional[str]
    # Name risk
    name_risk_level: str  # RiskLevel.value
    name_risk_score: float
    name_required_attributes: int
    # Claim risk
    claim_analysis: ClaimRiskAnalysis
    # Combined risk
    combined_risk_score: float  # 0.0 (safe) to 1.0 (highest risk)
    priority_rank: int = 0  # Set after sorting
    # Flags
    has_high_risk_claims: bool = False
    has_linkedin_mismatch: bool = False
    needs_manual_review: bool = False
    review_reasons: List[str] = field(default_factory=list)

def extract_person_name(profile: dict) -> str:
    """Extract the person's name from a profile."""
    # Try the different locations where the name might be stored
    if 'profile_data' in profile:
        pd = profile['profile_data']
        if pd.get('full_name'):
            return pd['full_name']
        if pd.get('name'):
            return pd['name']
        if pd.get('first_name') and pd.get('last_name'):
            return f"{pd['first_name']} {pd['last_name']}"
    if profile.get('full_name'):
        return profile['full_name']
    if profile.get('name'):
        return profile['name']
    # Fallback; filename-based extraction happens in analyze_profile()
    return "Unknown"

def extract_linkedin_slug(profile: dict) -> Optional[str]:
    """Extract LinkedIn slug from profile."""
    if 'extraction_metadata' in profile:
        em = profile['extraction_metadata']
        if em.get('linkedin_slug'):
            return em['linkedin_slug']
        if em.get('source_linkedin_slug'):
            return em['source_linkedin_slug']
    if profile.get('linkedin_slug'):
        return profile['linkedin_slug']
    # Try to extract from linkedin_url
    linkedin_url = profile.get('linkedin_url') or ''
    if 'linkedin.com/in/' in linkedin_url:
        match = re.search(r'linkedin\.com/in/([^/\?]+)', linkedin_url)
        if match:
            return match.group(1)
    return None
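
# Illustrative fallback behaviour (the profile dict and slug are made up):
# a profile carrying only a linkedin_url still yields a slug via the regex:
#     extract_linkedin_slug({"linkedin_url": "https://www.linkedin.com/in/jan-jansen-1a2b3c"})
#     -> "jan-jansen-1a2b3c"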

def normalize_linkedin_slug(slug: str) -> str:
    """
    Normalize a LinkedIn slug by URL-decoding and lowercasing.

    Handles URL-encoded diacritics like:
    - %c3%a9 → é (e-acute)
    - %c3%ab → ë (e-diaeresis)
    - %c3%b6 → ö (o-umlaut)

    This prevents false positive "mismatches" between
    'amélie-de-jong-60b507278' and 'am%c3%a9lie-de-jong-60b507278'.
    """
    if not slug:
        return ""
    # URL decode (handles %c3%a9 → é, etc.), then lowercase for comparison
    decoded = unquote(slug)
    return decoded.lower().strip()
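
# Example from the docstring above - both spellings normalize identically,
# so they no longer count as a mismatch:
#     normalize_linkedin_slug("am%c3%a9lie-de-jong-60b507278") -> "amélie-de-jong-60b507278"
#     normalize_linkedin_slug("Amélie-de-Jong-60b507278")      -> "amélie-de-jong-60b507278"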

def analyze_claims(
    profile: dict,
    profile_linkedin_slug: Optional[str],
) -> Tuple[ClaimRiskAnalysis, bool, List[str]]:
    """
    Analyze web claims in a profile for risk.

    Returns: (ClaimRiskAnalysis, has_linkedin_mismatch, review_reasons)
    """
    analysis = ClaimRiskAnalysis()
    has_linkedin_mismatch = False
    review_reasons = []
    seen_high_risk_domains = set()
    seen_medium_risk_domains = set()
    # Normalize the profile's LinkedIn slug for comparison
    normalized_profile_slug = (
        normalize_linkedin_slug(profile_linkedin_slug) if profile_linkedin_slug else None
    )
    # Gather claims from the locations where they may be stored
    claims = []
    if 'web_claims' in profile:
        claims.extend(profile['web_claims'])
    if 'web_enrichment' in profile and 'claims' in profile['web_enrichment']:
        claims.extend(profile['web_enrichment']['claims'])
    analysis.total_claims = len(claims)
    for claim in claims:
        source_url = claim.get('source_url', '')
        domain = get_domain_from_url(source_url)
        if not domain:
            continue
        risk = classify_domain_risk(domain)
        if risk == DomainRisk.HIGH:
            analysis.high_risk_claims += 1
            seen_high_risk_domains.add(domain)
        elif risk == DomainRisk.MEDIUM:
            analysis.medium_risk_claims += 1
            seen_medium_risk_domains.add(domain)
        elif risk == DomainRisk.SAFE:
            analysis.safe_claims += 1
        # Check for LinkedIn slug mismatch (with proper URL decoding)
        if 'linkedin.com/in/' in source_url and normalized_profile_slug:
            url_slug_match = re.search(r'linkedin\.com/in/([^/\?]+)', source_url)
            if url_slug_match:
                url_slug = normalize_linkedin_slug(url_slug_match.group(1))
                if url_slug != normalized_profile_slug:
                    has_linkedin_mismatch = True
                    review_reasons.append(
                        f"LinkedIn mismatch: claim from '{url_slug}' "
                        f"but profile is '{profile_linkedin_slug}'"
                    )
        # Check claim value for LinkedIn URLs pointing to other profiles
        claim_value = str(claim.get('claim_value', ''))
        if 'linkedin.com/in/' in claim_value and normalized_profile_slug:
            value_slug_match = re.search(r'linkedin\.com/in/([^/\?]+)', claim_value)
            if value_slug_match:
                value_slug = normalize_linkedin_slug(value_slug_match.group(1))
                if value_slug != normalized_profile_slug:
                    has_linkedin_mismatch = True
                    review_reasons.append(
                        f"Claim value contains different LinkedIn profile: '{value_slug}'"
                    )
    analysis.high_risk_domains = list(seen_high_risk_domains)
    analysis.medium_risk_domains = list(seen_medium_risk_domains)
    if analysis.high_risk_claims > 0:
        review_reasons.append(
            f"{analysis.high_risk_claims} claims from HIGH RISK domains: "
            f"{', '.join(analysis.high_risk_domains)}"
        )
    return analysis, has_linkedin_mismatch, review_reasons
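
# A claim is expected to look roughly like this (only the two fields read
# above matter; the values are illustrative):
#     {"source_url": "https://www.imdb.com/name/nm0000001/",
#      "claim_value": "https://www.linkedin.com/in/some-other-slug"}
# Such a claim counts as high risk (imdb.com) AND triggers a LinkedIn
# mismatch whenever the profile's own slug differs from 'some-other-slug'.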

def calculate_combined_risk(
    name_risk_score: float,
    claim_analysis: ClaimRiskAnalysis,
    has_linkedin_mismatch: bool,
) -> float:
    """
    Calculate combined risk score (0.0 to 1.0).

    Weights:
    - Name commonality: 40%
    - Claim source quality: 40%
    - LinkedIn mismatch: 20%
    """
    # Name risk component (0-0.4)
    name_component = name_risk_score * 0.4
    # Claim risk component (0-0.4)
    if claim_analysis.total_claims == 0:
        claim_component = 0.0
    else:
        # High risk claims count at full weight, medium risk at half
        high_risk_ratio = claim_analysis.high_risk_claims / claim_analysis.total_claims
        medium_risk_ratio = claim_analysis.medium_risk_claims / claim_analysis.total_claims
        claim_component = (high_risk_ratio * 1.0 + medium_risk_ratio * 0.5) * 0.4
    # LinkedIn mismatch is a strong signal (0 or 0.2)
    mismatch_component = 0.2 if has_linkedin_mismatch else 0.0
    return min(1.0, name_component + claim_component + mismatch_component)
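
# Worked example (the numbers are illustrative): a name scoring 0.8, with 4
# claims of which 2 are high risk and 1 medium risk, plus a LinkedIn mismatch:
#     name:     0.8 * 0.4                      = 0.32
#     claims:   (2/4 * 1.0 + 1/4 * 0.5) * 0.4  = 0.25
#     mismatch:                                  0.20
#     combined: min(1.0, 0.32 + 0.25 + 0.20)   = 0.77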

def analyze_profile(file_path: Path, scorer: NameCommonalityScorer) -> Optional[ProfileRiskScore]:
    """Analyze a single profile for risk."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            profile = json.load(f)
    except (json.JSONDecodeError, OSError):
        return None
    # Extract name
    person_name = extract_person_name(profile)
    if person_name == "Unknown":
        # Fall back to the filename.
        # Format: ID_XX-XX-XXX_XXXX_XX-XX-XXX_XXXX_NAME-PARTS.json
        fname = file_path.stem
        if fname.startswith('ID_'):
            parts = fname.split('_')
            if len(parts) >= 5:
                name_part = parts[-1]  # Last part is the name
                person_name = name_part.replace('-', ' ').title()
    # Extract LinkedIn slug
    linkedin_slug = extract_linkedin_slug(profile)
    # Score name commonality
    name_result = scorer.score_name(full_name=person_name)
    required_attrs = scorer.get_required_verification_attributes(name_result.risk_level)
    # Analyze claims
    claim_analysis, has_linkedin_mismatch, review_reasons = analyze_claims(profile, linkedin_slug)
    # Calculate combined risk
    combined_risk = calculate_combined_risk(
        name_result.combined_score,
        claim_analysis,
        has_linkedin_mismatch,
    )
    # Determine if manual review is needed
    needs_review = (
        combined_risk >= 0.5 or
        claim_analysis.high_risk_claims > 0 or
        has_linkedin_mismatch or
        (name_result.risk_level in (RiskLevel.VERY_HIGH, RiskLevel.HIGH) and claim_analysis.total_claims > 0)
    )
    if name_result.risk_level in (RiskLevel.VERY_HIGH, RiskLevel.HIGH) and claim_analysis.total_claims > 0:
        review_reasons.append(
            f"Common name '{person_name}' with {claim_analysis.total_claims} web claims - verify all claims"
        )
    return ProfileRiskScore(
        file_path=str(file_path),
        file_name=file_path.name,
        person_name=person_name,
        linkedin_slug=linkedin_slug,
        name_risk_level=name_result.risk_level.value,
        name_risk_score=name_result.combined_score,
        name_required_attributes=required_attrs,
        claim_analysis=claim_analysis,
        combined_risk_score=combined_risk,
        has_high_risk_claims=claim_analysis.high_risk_claims > 0,
        has_linkedin_mismatch=has_linkedin_mismatch,
        needs_manual_review=needs_review,
        review_reasons=review_reasons,
    )

# =============================================================================
# REPORT GENERATION
# =============================================================================
def generate_report(
    results: List[ProfileRiskScore],
    output_dir: Path,
    top_n: int = 100,
) -> Path:
    """Generate the risk assessment report (markdown plus a JSON sidecar)."""
    # Sort by combined risk score (highest first) and assign priority ranks
    results.sort(key=lambda x: x.combined_risk_score, reverse=True)
    for i, result in enumerate(results):
        result.priority_rank = i + 1
    # Statistics
    total = len(results)
    needs_review = sum(1 for r in results if r.needs_manual_review)
    high_risk_names = sum(1 for r in results if r.name_risk_level in ('very_high', 'high'))
    has_high_risk_claims = sum(1 for r in results if r.has_high_risk_claims)
    has_linkedin_mismatch = sum(1 for r in results if r.has_linkedin_mismatch)
    # Generate markdown report
    report_path = output_dir / '_PROFILE_RISK_REPORT.md'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Person Profile Risk Assessment Report\n\n")
        f.write(f"**Generated**: {datetime.now().isoformat()}\n\n")
        f.write("## Summary Statistics\n\n")
        f.write("| Metric | Count | Percentage |\n")
        f.write("|--------|-------|------------|\n")
        f.write(f"| Total Profiles Analyzed | {total:,} | 100% |\n")
        f.write(f"| **Needs Manual Review** | **{needs_review:,}** | **{needs_review/total*100:.1f}%** |\n")
        f.write(f"| High/Very High Risk Names | {high_risk_names:,} | {high_risk_names/total*100:.1f}% |\n")
        f.write(f"| Has HIGH RISK Domain Claims | {has_high_risk_claims:,} | {has_high_risk_claims/total*100:.1f}% |\n")
        f.write(f"| Has LinkedIn Mismatch | {has_linkedin_mismatch:,} | {has_linkedin_mismatch/total*100:.1f}% |\n")
        f.write("\n")
f.write("## Risk Level Distribution\n\n")
risk_counts = defaultdict(int)
for r in results:
if r.combined_risk_score >= 0.7:
risk_counts['Critical (0.7+)'] += 1
elif r.combined_risk_score >= 0.5:
risk_counts['High (0.5-0.7)'] += 1
elif r.combined_risk_score >= 0.3:
risk_counts['Medium (0.3-0.5)'] += 1
else:
risk_counts['Low (<0.3)'] += 1
f.write(f"| Risk Level | Count | Percentage |\n")
f.write(f"|------------|-------|------------|\n")
for level in ['Critical (0.7+)', 'High (0.5-0.7)', 'Medium (0.3-0.5)', 'Low (<0.3)']:
count = risk_counts[level]
f.write(f"| {level} | {count:,} | {count/total*100:.1f}% |\n")
f.write("\n")
f.write(f"## Top {top_n} Highest Risk Profiles\n\n")
f.write("These profiles need immediate manual review.\n\n")
f.write("| Rank | Name | Risk Score | Name Risk | Claims | Issues |\n")
f.write("|------|------|------------|-----------|--------|--------|\n")
for result in results[:top_n]:
issues = []
if result.has_high_risk_claims:
issues.append(f"{result.claim_analysis.high_risk_claims} HIGH RISK")
if result.has_linkedin_mismatch:
issues.append("LinkedIn mismatch")
if result.name_risk_level in ('very_high', 'high'):
issues.append(f"Common name ({result.name_risk_level})")
issues_str = "; ".join(issues) if issues else "-"
f.write(f"| {result.priority_rank} | {result.person_name[:30]} | {result.combined_risk_score:.2f} | "
f"{result.name_risk_level} | {result.claim_analysis.total_claims} | {issues_str} |\n")
f.write("\n")
        # LinkedIn mismatch section
        mismatch_profiles = [r for r in results if r.has_linkedin_mismatch]
        if mismatch_profiles:
            f.write(f"## Profiles with LinkedIn Mismatches ({len(mismatch_profiles)})\n\n")
            f.write("These profiles have claims from different LinkedIn profiles - likely entity resolution errors.\n\n")
            for result in mismatch_profiles[:50]:  # Top 50
                f.write(f"### {result.person_name}\n")
                f.write(f"- **File**: `{result.file_name}`\n")
                f.write(f"- **Profile LinkedIn**: `{result.linkedin_slug or 'None'}`\n")
                for reason in result.review_reasons:
                    if 'LinkedIn' in reason:
                        f.write(f"- **Issue**: {reason}\n")
                f.write("\n")
        # High risk domain claims section
        high_risk_profiles = [r for r in results if r.has_high_risk_claims]
        if high_risk_profiles:
            f.write(f"## Profiles with HIGH RISK Domain Claims ({len(high_risk_profiles)})\n\n")
            f.write("These profiles have claims from domains known for entity resolution failures.\n\n")
            domain_counts = defaultdict(int)
            for result in high_risk_profiles:
                for domain in result.claim_analysis.high_risk_domains:
                    domain_counts[domain] += 1
            f.write("### High Risk Domains Found\n\n")
            f.write("| Domain | Profiles Affected |\n")
            f.write("|--------|-------------------|\n")
            for domain, count in sorted(domain_counts.items(), key=lambda x: x[1], reverse=True):
                f.write(f"| {domain} | {count} |\n")
            f.write("\n")
        f.write("---\n\n")
        f.write("**DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.**\n")
    # Also save JSON for programmatic access
    json_path = output_dir / '_profile_risk_scores.json'
    # Convert to a serializable format
    serializable_results = []
    for r in results:
        item = {
            'file_path': r.file_path,
            'file_name': r.file_name,
            'person_name': r.person_name,
            'linkedin_slug': r.linkedin_slug,
            'name_risk_level': r.name_risk_level,
            'name_risk_score': r.name_risk_score,
            'name_required_attributes': r.name_required_attributes,
            'combined_risk_score': r.combined_risk_score,
            'priority_rank': r.priority_rank,
            'has_high_risk_claims': r.has_high_risk_claims,
            'has_linkedin_mismatch': r.has_linkedin_mismatch,
            'needs_manual_review': r.needs_manual_review,
            'review_reasons': r.review_reasons,
            'claim_analysis': {
                'total_claims': r.claim_analysis.total_claims,
                'high_risk_claims': r.claim_analysis.high_risk_claims,
                'medium_risk_claims': r.claim_analysis.medium_risk_claims,
                'safe_claims': r.claim_analysis.safe_claims,
                'high_risk_domains': r.claim_analysis.high_risk_domains,
            },
        }
        serializable_results.append(item)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated': datetime.now().isoformat(),
            'total_profiles': total,
            'needs_review': needs_review,
            'profiles': serializable_results,
        }, f, indent=2)
    return report_path
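
# The JSON sidecar written above is intended for programmatic follow-up.
# A minimal consumption sketch (the path assumes the default --data-dir):
#
#     with open('data/person/_profile_risk_scores.json', encoding='utf-8') as fh:
#         report = json.load(fh)
#     worst = report['profiles'][:10]  # already sorted by combined_risk_score
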
# =============================================================================
# MAIN
# =============================================================================
def main():
    parser = argparse.ArgumentParser(
        description='Generate profile risk assessment report'
    )
    parser.add_argument('--analyze', action='store_true',
                        help='Analyze all profiles')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of profiles to analyze')
    parser.add_argument('--top-risks', type=int, default=100,
                        help='Number of top risks to show in report')
    parser.add_argument('--data-dir', type=str, default='data/person',
                        help='Person data directory')
    parser.add_argument('--country', type=str, default='NL',
                        help='Default country for name scoring')
    args = parser.parse_args()
    if not args.analyze:
        parser.print_help()
        return
    # Setup
    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        print(f"Error: Data directory not found: {data_dir}")
        sys.exit(1)
    scorer = NameCommonalityScorer(default_country=args.country)
    # Find all profile files
    profile_files = list(data_dir.glob('ID_*.json'))
    print(f"Found {len(profile_files):,} profile files")
    if args.limit:
        profile_files = profile_files[:args.limit]
        print(f"Limiting to {args.limit} profiles")
    # Analyze profiles
    results = []
    errors = 0
    print("Analyzing profiles...")
    for i, file_path in enumerate(profile_files):
        if (i + 1) % 5000 == 0:
            print(f"  Processed {i + 1:,} / {len(profile_files):,}")
        result = analyze_profile(file_path, scorer)
        if result:
            results.append(result)
        else:
            errors += 1
    print(f"Analyzed {len(results):,} profiles ({errors} errors)")
    if not results:
        print("No profiles could be analyzed - nothing to report.")
        sys.exit(1)
    # Generate report
    print("Generating report...")
    report_path = generate_report(results, data_dir, args.top_risks)
    print(f"\nReport generated: {report_path}")
    print(f"JSON data: {data_dir / '_profile_risk_scores.json'}")
    # Summary
    needs_review = sum(1 for r in results if r.needs_manual_review)
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total profiles: {len(results):,}")
    print(f"Needs manual review: {needs_review:,} ({needs_review/len(results)*100:.1f}%)")
    # Top 10 highest risk (results were sorted in place by generate_report)
    print("\nTop 10 Highest Risk Profiles:")
    for r in results[:10]:
        print(f"  {r.combined_risk_score:.2f} | {r.person_name[:30]} | {r.name_risk_level} | "
              f"{r.claim_analysis.high_risk_claims} HIGH RISK claims")


if __name__ == '__main__':
    main()