#!/usr/bin/env python3
"""
Clean up web claims from PERSON profiles based on source-domain risk assessment.

This script removes claims from HIGH RISK sources (entity resolution failures)
and flags MEDIUM RISK sources for further review.

DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.

Different from cleanup_web_claims.py, which handles custodian YAML files.
"""
|
|
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Set, Tuple
|
|
|
|
# Domain classification for entity resolution risk
|
|
HIGH_RISK_DOMAINS = {
|
|
# Social media - random accounts, not verified person
|
|
'www.instagram.com',
|
|
'www.tiktok.com',
|
|
'linktr.ee',
|
|
|
|
# People aggregators - notorious for mixing up people with same names
|
|
'www.idcrawl.com',
|
|
'www.peekyou.com',
|
|
'rocketreach.co',
|
|
'www.zoominfo.com',
|
|
'profileability.com',
|
|
'holaconnect.com',
|
|
|
|
# Entertainment - actors/musicians with same names
|
|
'www.imdb.com',
|
|
'www.babelio.com', # French book site
|
|
'www.goodreads.com',
|
|
'www.poetryfoundation.org',
|
|
'bakerbookhouse.com', # Christian book awards
|
|
'www.thriftbooks.com',
|
|
'www.abebooks.com',
|
|
'www.amazon.com',
|
|
'arcmusic.org',
|
|
|
|
# Sports - athletes with same names
|
|
'worldathletics.org',
|
|
'www.eliteprospects.com',
|
|
|
|
# Art marketplaces - different artists
|
|
'www.mutualart.com',
|
|
|
|
# Genealogy - historical figures
|
|
'www.wikidata.org', # Often wrong person, needs manual verification
|
|
}
|
|
|
|
MEDIUM_RISK_DOMAINS = {
|
|
# Generic Twitter/X - may or may not be the person
|
|
'twitter.com',
|
|
'x.com',
|
|
|
|
# Research aggregators - sometimes wrong person
|
|
'www.researchgate.net',
|
|
'scholar.google.com',
|
|
'www.scilit.com',
|
|
'research.com',
|
|
'support.orcid.org',
|
|
}
|
|
|
|
SAFE_DOMAINS = {
|
|
# Institutional websites - high confidence
|
|
'www.rijksmuseum.nl',
|
|
'www.niod.nl',
|
|
'www.universiteitleiden.nl',
|
|
'www.uva.nl',
|
|
'www.uu.nl',
|
|
'www.rug.nl',
|
|
'www.kitlv.nl',
|
|
'www.codart.nl',
|
|
'pure.knaw.nl',
|
|
'www.vangoghmuseum.nl',
|
|
'krollermuller.nl',
|
|
'www.stedelijk.nl',
|
|
'www.allardpierson.nl',
|
|
'www.eyefilm.nl',
|
|
'www.groningermuseum.nl',
|
|
'framerframed.nl',
|
|
|
|
# ORCID with actual profile
|
|
'orcid.org',
|
|
|
|
# University research portals
|
|
'www.tilburguniversity.edu',
|
|
'hims.uva.nl',
|
|
'ias.uva.nl',
|
|
'research.vu.nl',
|
|
'research.wur.nl',
|
|
'research.ou.nl',
|
|
'researchportalplus.anu.edu.au',
|
|
'www.cuanschutz.edu',
|
|
'ischool.utoronto.ca',
|
|
'guides.lib.vt.edu',
|
|
'www.qmul.ac.uk',
|
|
'courtauld.ac.uk',
|
|
'www.uio.no',
|
|
'www.uia.no',
|
|
'www.khrono.no',
|
|
'ufg.phil-fak.uni-koeln.de',
|
|
'www.mh-freiburg.de',
|
|
'www.monmouth.edu',
|
|
'www.dainst.org',
|
|
'ccs.bard.edu',
|
|
'avesis.agu.edu.tr',
|
|
'westernsydney.academia.edu',
|
|
'mkg-hamburg.academia.edu',
|
|
'griffith.academia.edu',
|
|
'quaibranly.academia.edu',
|
|
|
|
# Museums and cultural institutions
|
|
'www.louvreabudhabi.ae',
|
|
'www.frick.org',
|
|
'www.thorvaldsensmuseum.dk',
|
|
'www.stiftung-berliner-mauer.de',
|
|
'kozlekedesimuzeum.hu',
|
|
'www.yadvashem.org',
|
|
'www.ehri-project.eu',
|
|
'www.ehri-uk.org',
|
|
'www.museumnext.com',
|
|
'blog.archive.org',
|
|
|
|
# Professional personal websites (verified curator)
|
|
'bsmets.net',
|
|
'charlesgielen.com',
|
|
'www.martin-munoz.net',
|
|
'seamusmccormack.com',
|
|
'johnmiedema.art',
|
|
'www.winkewiegersma.com',
|
|
'jentewesterhof.wixsite.com',
|
|
'susannalles.com',
|
|
'www.dianalopezbooks.com',
|
|
|
|
# Academic publishers
|
|
'academic.oup.com',
|
|
'journals.sagepub.com',
|
|
'dl.acm.org',
|
|
'www.intellectbooks.com',
|
|
'www.thamesandhudsonusa.com',
|
|
|
|
# News/cultural organizations
|
|
'www.bbc.co.uk',
|
|
'armenpress.am',
|
|
'startupitalia.eu',
|
|
'new.coinsweekly.com',
|
|
|
|
# Awards (if matching person)
|
|
'www.nationalbook.org',
|
|
'www.caineprize.com',
|
|
'www.pauljanssenaward.com',
|
|
|
|
# Other verified
|
|
'libereurope.eu',
|
|
'www.kvvak.nl',
|
|
'www.westfriesgenootschap.nl',
|
|
'www.isric.org',
|
|
'theorg.com',
|
|
'osc-international.com',
|
|
'transmissioninmotion.sites.uu.nl',
|
|
'www.library.universiteitleiden.nl',
|
|
'clay.earth',
|
|
'croyan.quaibranly.fr',
|
|
'vimeo.com',
|
|
'www.shivanipublications.com',
|
|
'www.jagodangdut.com',
|
|
'www.nationaljewish.org',
|
|
'www.samtidsdans.no',
|
|
'www.khm.uio.no',
|
|
'gritsenko-andrij-petrovich.webnode.com.ua',
|
|
}
|
|
|
|
# LinkedIn is special - valid source but needs slug matching
|
|
LINKEDIN_DOMAINS = {
|
|
'www.linkedin.com',
|
|
'nl.linkedin.com',
|
|
'be.linkedin.com',
|
|
'it.linkedin.com',
|
|
'fr.linkedin.com',
|
|
'es.linkedin.com',
|
|
'au.linkedin.com',
|
|
'uk.linkedin.com',
|
|
'no.linkedin.com',
|
|
'linkedin.com',
|
|
}
|
|
|
|
|
|
def extract_domain(url: str) -> str:
|
|
"""Extract domain from URL."""
|
|
if not url:
|
|
return ''
|
|
# Remove protocol
|
|
url = url.replace('https://', '').replace('http://', '')
|
|
# Get domain (before first /)
|
|
return url.split('/')[0]
|
|
|
|
|
|
def classify_claim_risk(claim: Dict) -> Tuple[str, str]:
|
|
"""
|
|
Classify a claim's risk level based on source domain.
|
|
|
|
Returns: (risk_level, reason)
|
|
"""
|
|
source_url = claim.get('provenance', {}).get('source_url', '')
|
|
if not source_url:
|
|
# Try older format
|
|
source_url = claim.get('source_url', '')
|
|
|
|
domain = extract_domain(source_url)
|
|
|
|
if domain in HIGH_RISK_DOMAINS:
|
|
return 'HIGH', f"High-risk domain: {domain} (entity resolution failures common)"
|
|
|
|
if domain in MEDIUM_RISK_DOMAINS:
|
|
return 'MEDIUM', f"Medium-risk domain: {domain} (may need verification)"
|
|
|
|
if domain in LINKEDIN_DOMAINS:
|
|
# LinkedIn needs special handling - check if slug matches
|
|
return 'LINKEDIN', f"LinkedIn source - verify profile matches person"
|
|
|
|
if domain in SAFE_DOMAINS:
|
|
return 'SAFE', f"Trusted institutional source: {domain}"
|
|
|
|
return 'UNKNOWN', f"Unknown domain: {domain} - needs classification"
|
|
|
|
|
|
def process_profile(file_path: Path, dry_run: bool = True) -> Dict:
|
|
"""
|
|
Process a single profile and remove high-risk claims.
|
|
|
|
Returns statistics about what was removed.
|
|
"""
|
|
stats = {
|
|
'file': str(file_path.name),
|
|
'claims_before': 0,
|
|
'claims_after': 0,
|
|
'removed_high_risk': [],
|
|
'flagged_medium_risk': [],
|
|
'flagged_linkedin': [],
|
|
'kept_safe': [],
|
|
'unknown': [],
|
|
}
|
|
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
profile = json.load(f)
|
|
except (json.JSONDecodeError, FileNotFoundError) as e:
|
|
stats['error'] = str(e)
|
|
return stats
|
|
|
|
web_claims = profile.get('web_claims', [])
|
|
stats['claims_before'] = len(web_claims)
|
|
|
|
if not web_claims:
|
|
return stats
|
|
|
|
# Process each claim
|
|
kept_claims = []
|
|
|
|
for claim in web_claims:
|
|
risk_level, reason = classify_claim_risk(claim)
|
|
|
|
claim_summary = {
|
|
'type': claim.get('claim_type', 'unknown'),
|
|
'value': str(claim.get('claim_value', ''))[:100],
|
|
'source': claim.get('provenance', {}).get('source_url', claim.get('source_url', ''))[:100],
|
|
'reason': reason,
|
|
}
|
|
|
|
if risk_level == 'HIGH':
|
|
stats['removed_high_risk'].append(claim_summary)
|
|
# Don't add to kept_claims
|
|
elif risk_level == 'MEDIUM':
|
|
stats['flagged_medium_risk'].append(claim_summary)
|
|
kept_claims.append(claim) # Keep but flag for review
|
|
elif risk_level == 'LINKEDIN':
|
|
stats['flagged_linkedin'].append(claim_summary)
|
|
kept_claims.append(claim) # Keep but flag for review
|
|
elif risk_level == 'SAFE':
|
|
stats['kept_safe'].append(claim_summary)
|
|
kept_claims.append(claim)
|
|
else:
|
|
stats['unknown'].append(claim_summary)
|
|
kept_claims.append(claim) # Keep unknown for manual review
|
|
|
|
stats['claims_after'] = len(kept_claims)
|
|
|
|
# Update profile if not dry run
|
|
if not dry_run and stats['removed_high_risk']:
|
|
profile['web_claims'] = kept_claims
|
|
|
|
# Add cleanup metadata
|
|
if 'enrichment_metadata' not in profile:
|
|
profile['enrichment_metadata'] = {}
|
|
|
|
cleanup_entry = {
|
|
'cleanup_date': datetime.now(timezone.utc).isoformat(),
|
|
'cleanup_script': 'cleanup_person_web_claims.py',
|
|
'claims_removed': len(stats['removed_high_risk']),
|
|
'removal_reasons': [c['reason'] for c in stats['removed_high_risk']],
|
|
}
|
|
|
|
if 'cleanup_history' not in profile['enrichment_metadata']:
|
|
profile['enrichment_metadata']['cleanup_history'] = []
|
|
profile['enrichment_metadata']['cleanup_history'].append(cleanup_entry)
|
|
|
|
# Write back
|
|
with open(file_path, 'w', encoding='utf-8') as f:
|
|
json.dump(profile, f, indent=2, ensure_ascii=False)
|
|
|
|
return stats
|
|
|
|
|
|
def main():
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(description='Clean up web claims from PERSON profiles')
|
|
parser.add_argument('--dry-run', action='store_true', default=True,
|
|
help='Do not modify files, just report (default: True)')
|
|
parser.add_argument('--execute', action='store_true',
|
|
help='Actually modify files (overrides --dry-run)')
|
|
parser.add_argument('--limit', type=int, default=None,
|
|
help='Process only N files')
|
|
parser.add_argument('--file', type=str, default=None,
|
|
help='Process a specific file')
|
|
|
|
args = parser.parse_args()
|
|
dry_run = not args.execute
|
|
|
|
person_dir = Path('/Users/kempersc/apps/glam/data/person')
|
|
|
|
if args.file:
|
|
files = [person_dir / args.file]
|
|
else:
|
|
files = sorted(person_dir.glob('ID_*.json'))
|
|
if args.limit:
|
|
files = files[:args.limit]
|
|
|
|
print(f"{'DRY RUN - ' if dry_run else ''}Processing {len(files)} files...")
|
|
print("=" * 80)
|
|
|
|
total_stats = {
|
|
'files_processed': 0,
|
|
'files_with_claims': 0,
|
|
'files_modified': 0,
|
|
'claims_removed': 0,
|
|
'claims_flagged_medium': 0,
|
|
'claims_flagged_linkedin': 0,
|
|
'claims_kept_safe': 0,
|
|
'claims_unknown': 0,
|
|
}
|
|
|
|
removal_log = []
|
|
|
|
for file_path in files:
|
|
stats = process_profile(file_path, dry_run=dry_run)
|
|
|
|
total_stats['files_processed'] += 1
|
|
|
|
if stats.get('error'):
|
|
print(f"ERROR: {file_path.name}: {stats['error']}")
|
|
continue
|
|
|
|
if stats['claims_before'] > 0:
|
|
total_stats['files_with_claims'] += 1
|
|
|
|
if stats['removed_high_risk']:
|
|
total_stats['files_modified'] += 1
|
|
total_stats['claims_removed'] += len(stats['removed_high_risk'])
|
|
removal_log.append(stats)
|
|
|
|
print(f"\n{file_path.name}:")
|
|
print(f" Removed {len(stats['removed_high_risk'])} high-risk claims:")
|
|
for claim in stats['removed_high_risk']:
|
|
print(f" - {claim['type']}: {claim['value'][:50]}...")
|
|
print(f" Source: {claim['source'][:60]}...")
|
|
|
|
total_stats['claims_flagged_medium'] += len(stats['flagged_medium_risk'])
|
|
total_stats['claims_flagged_linkedin'] += len(stats['flagged_linkedin'])
|
|
total_stats['claims_kept_safe'] += len(stats['kept_safe'])
|
|
total_stats['claims_unknown'] += len(stats['unknown'])
|
|
|
|
# Print summary
|
|
print("\n" + "=" * 80)
|
|
print("SUMMARY")
|
|
print("=" * 80)
|
|
print(f"Files processed: {total_stats['files_processed']}")
|
|
print(f"Files with claims: {total_stats['files_with_claims']}")
|
|
print(f"Files modified: {total_stats['files_modified']}")
|
|
print(f"Claims removed (HIGH): {total_stats['claims_removed']}")
|
|
print(f"Claims flagged (MED): {total_stats['claims_flagged_medium']}")
|
|
print(f"Claims flagged (LI): {total_stats['claims_flagged_linkedin']}")
|
|
print(f"Claims kept (SAFE): {total_stats['claims_kept_safe']}")
|
|
print(f"Claims unknown: {total_stats['claims_unknown']}")
|
|
|
|
if dry_run:
|
|
print("\n*** DRY RUN - No files were modified ***")
|
|
print("Run with --execute to apply changes")
|
|
|
|
# Save removal log
|
|
log_path = person_dir / '_web_claims_cleanup_log.json'
|
|
with open(log_path, 'w', encoding='utf-8') as f:
|
|
json.dump({
|
|
'cleanup_date': datetime.now(timezone.utc).isoformat(),
|
|
'dry_run': dry_run,
|
|
'total_stats': total_stats,
|
|
'removal_details': removal_log,
|
|
}, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"\nCleanup log saved to: {log_path}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|