glam/scripts/cleanup_person_web_claims.py
2026-01-11 12:15:27 +01:00

420 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Cleanup web claims from PERSON profiles based on source domain risk assessment.
This script removes claims from HIGH RISK sources (entity resolution failures)
and flags MEDIUM RISK sources for further review.
DATA QUALITY IS OF UTMOST IMPORTANCE - Wrong data is worse than no data.
Different from cleanup_web_claims.py which handles custodian YAML files.
"""
import json
import os
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Set, Tuple
# Domain classification for entity resolution risk.
# HIGH RISK: sources where a claim frequently belongs to a different person
# who merely shares the name; claims from these domains are removed outright.
HIGH_RISK_DOMAINS: Set[str] = {
    # Social media - random accounts, not verified person
    'www.instagram.com',
    'www.tiktok.com',
    'linktr.ee',
    # People aggregators - notorious for mixing up people with same names
    'www.idcrawl.com',
    'www.peekyou.com',
    'rocketreach.co',
    'www.zoominfo.com',
    'profileability.com',
    'holaconnect.com',
    # Entertainment - actors/musicians with same names
    'www.imdb.com',
    'www.babelio.com',  # French book site
    'www.goodreads.com',
    'www.poetryfoundation.org',
    'bakerbookhouse.com',  # Christian book awards
    'www.thriftbooks.com',
    'www.abebooks.com',
    'www.amazon.com',
    'arcmusic.org',
    # Sports - athletes with same names
    'worldathletics.org',
    'www.eliteprospects.com',
    # Art marketplaces - different artists
    'www.mutualart.com',
    # Genealogy - historical figures
    'www.wikidata.org',  # Often wrong person, needs manual verification
}
# MEDIUM RISK: plausibly the right person but often ambiguous; claims from
# these domains are kept but flagged for manual review.
MEDIUM_RISK_DOMAINS: Set[str] = {
    # Generic Twitter/X - may or may not be the person
    'twitter.com',
    'x.com',
    # Research aggregators - sometimes wrong person
    'www.researchgate.net',
    'scholar.google.com',
    'www.scilit.com',
    'research.com',
    'support.orcid.org',
}
# SAFE: trusted institutional/professional sources; claims are kept as-is.
SAFE_DOMAINS: Set[str] = {
    # Institutional websites - high confidence
    'www.rijksmuseum.nl',
    'www.niod.nl',
    'www.universiteitleiden.nl',
    'www.uva.nl',
    'www.uu.nl',
    'www.rug.nl',
    'www.kitlv.nl',
    'www.codart.nl',
    'pure.knaw.nl',
    'www.vangoghmuseum.nl',
    'krollermuller.nl',
    'www.stedelijk.nl',
    'www.allardpierson.nl',
    'www.eyefilm.nl',
    'www.groningermuseum.nl',
    'framerframed.nl',
    # ORCID with actual profile
    'orcid.org',
    # University research portals
    'www.tilburguniversity.edu',
    'hims.uva.nl',
    'ias.uva.nl',
    'research.vu.nl',
    'research.wur.nl',
    'research.ou.nl',
    'researchportalplus.anu.edu.au',
    'www.cuanschutz.edu',
    'ischool.utoronto.ca',
    'guides.lib.vt.edu',
    'www.qmul.ac.uk',
    'courtauld.ac.uk',
    'www.uio.no',
    'www.uia.no',
    'www.khrono.no',
    'ufg.phil-fak.uni-koeln.de',
    'www.mh-freiburg.de',
    'www.monmouth.edu',
    'www.dainst.org',
    'ccs.bard.edu',
    'avesis.agu.edu.tr',
    'westernsydney.academia.edu',
    'mkg-hamburg.academia.edu',
    'griffith.academia.edu',
    'quaibranly.academia.edu',
    # Museums and cultural institutions
    'www.louvreabudhabi.ae',
    'www.frick.org',
    'www.thorvaldsensmuseum.dk',
    'www.stiftung-berliner-mauer.de',
    'kozlekedesimuzeum.hu',
    'www.yadvashem.org',
    'www.ehri-project.eu',
    'www.ehri-uk.org',
    'www.museumnext.com',
    'blog.archive.org',
    # Professional personal websites (verified curator)
    'bsmets.net',
    'charlesgielen.com',
    'www.martin-munoz.net',
    'seamusmccormack.com',
    'johnmiedema.art',
    'www.winkewiegersma.com',
    'jentewesterhof.wixsite.com',
    'susannalles.com',
    'www.dianalopezbooks.com',
    # Academic publishers
    'academic.oup.com',
    'journals.sagepub.com',
    'dl.acm.org',
    'www.intellectbooks.com',
    'www.thamesandhudsonusa.com',
    # News/cultural organizations
    'www.bbc.co.uk',
    'armenpress.am',
    'startupitalia.eu',
    'new.coinsweekly.com',
    # Awards (if matching person)
    'www.nationalbook.org',
    'www.caineprize.com',
    'www.pauljanssenaward.com',
    # Other verified
    'libereurope.eu',
    'www.kvvak.nl',
    'www.westfriesgenootschap.nl',
    'www.isric.org',
    'theorg.com',
    'osc-international.com',
    'transmissioninmotion.sites.uu.nl',
    'www.library.universiteitleiden.nl',
    'clay.earth',
    'croyan.quaibranly.fr',
    'vimeo.com',
    'www.shivanipublications.com',
    'www.jagodangdut.com',
    'www.nationaljewish.org',
    'www.samtidsdans.no',
    'www.khm.uio.no',
    'gritsenko-andrij-petrovich.webnode.com.ua',
}
# LinkedIn is special - valid source but needs slug matching: a LinkedIn
# claim is plausible yet must be verified against the person before trusting.
# Country subdomains (nl., be., ...) all serve the same profiles.
LINKEDIN_DOMAINS: Set[str] = {
    'www.linkedin.com',
    'nl.linkedin.com',
    'be.linkedin.com',
    'it.linkedin.com',
    'fr.linkedin.com',
    'es.linkedin.com',
    'au.linkedin.com',
    'uk.linkedin.com',
    'no.linkedin.com',
    'linkedin.com',
}
def extract_domain(url: str) -> str:
    """Extract the bare hostname from a URL for domain-set lookups.

    Normalizes to lowercase and strips the scheme, path, and any explicit
    port so results can be matched against the all-lowercase entries in
    the ``*_DOMAINS`` sets. Returns ``''`` for an empty/missing URL.
    """
    if not url:
        return ''
    # Lowercase first so both scheme stripping and set membership are
    # case-insensitive ('HTTPS://WWW.IMDB.COM' -> 'www.imdb.com').
    url = url.strip().lower()
    # Strip the scheme; also accept protocol-relative '//host/...' URLs.
    for prefix in ('https://', 'http://', '//'):
        if url.startswith(prefix):
            url = url[len(prefix):]
            break
    # The host is everything before the first path separator; drop an
    # explicit port (e.g. 'example.com:8080') since the domain sets never
    # include one.
    return url.split('/')[0].split(':')[0]
def classify_claim_risk(claim: Dict) -> Tuple[str, str]:
    """Classify a claim's entity-resolution risk from its source domain.

    Looks up the claim's source URL (new ``provenance.source_url`` layout
    first, then the legacy top-level ``source_url``) and buckets its domain.

    Returns: (risk_level, reason) where risk_level is one of
    'HIGH', 'MEDIUM', 'LINKEDIN', 'SAFE', or 'UNKNOWN'.
    """
    # Prefer the provenance block; fall back to the older flat field.
    source_url = claim.get('provenance', {}).get('source_url', '')
    if not source_url:
        source_url = claim.get('source_url', '')
    domain = extract_domain(source_url)

    # Ordered buckets: first matching set wins (same precedence as before).
    buckets = (
        (HIGH_RISK_DOMAINS, 'HIGH',
         f"High-risk domain: {domain} (entity resolution failures common)"),
        (MEDIUM_RISK_DOMAINS, 'MEDIUM',
         f"Medium-risk domain: {domain} (may need verification)"),
        (LINKEDIN_DOMAINS, 'LINKEDIN',
         "LinkedIn source - verify profile matches person"),
        (SAFE_DOMAINS, 'SAFE',
         f"Trusted institutional source: {domain}"),
    )
    for domain_set, level, reason in buckets:
        if domain in domain_set:
            return level, reason
    return 'UNKNOWN', f"Unknown domain: {domain} - needs classification"
def process_profile(file_path: Path, dry_run: bool = True) -> Dict:
    """Triage the web claims of a single profile JSON file.

    HIGH-risk claims are dropped; MEDIUM/LINKEDIN claims are kept but
    flagged; SAFE and UNKNOWN claims are kept. The file is only rewritten
    when ``dry_run`` is False AND at least one claim was removed.

    Returns a statistics dict; on a read/parse failure it carries an
    'error' key instead.
    """
    stats = {
        'file': str(file_path.name),
        'claims_before': 0,
        'claims_after': 0,
        'removed_high_risk': [],
        'flagged_medium_risk': [],
        'flagged_linkedin': [],
        'kept_safe': [],
        'unknown': [],
    }
    try:
        profile = json.loads(file_path.read_text(encoding='utf-8'))
    except (json.JSONDecodeError, FileNotFoundError) as exc:
        stats['error'] = str(exc)
        return stats

    claims = profile.get('web_claims', [])
    stats['claims_before'] = len(claims)
    if not claims:
        return stats

    # Risk level -> (stats bucket, keep the claim?). Unlisted levels fall
    # back to 'unknown'/keep so nothing is dropped without classification.
    dispositions = {
        'HIGH': ('removed_high_risk', False),
        'MEDIUM': ('flagged_medium_risk', True),
        'LINKEDIN': ('flagged_linkedin', True),
        'SAFE': ('kept_safe', True),
    }
    surviving = []
    for claim in claims:
        level, reason = classify_claim_risk(claim)
        # Truncated summary for logs; source falls back to the legacy field.
        summary = {
            'type': claim.get('claim_type', 'unknown'),
            'value': str(claim.get('claim_value', ''))[:100],
            'source': claim.get('provenance', {}).get('source_url', claim.get('source_url', ''))[:100],
            'reason': reason,
        }
        bucket, keep = dispositions.get(level, ('unknown', True))
        stats[bucket].append(summary)
        if keep:
            surviving.append(claim)
    stats['claims_after'] = len(surviving)

    # Only touch the file when executing for real and something was removed.
    if dry_run or not stats['removed_high_risk']:
        return stats

    profile['web_claims'] = surviving
    metadata = profile.setdefault('enrichment_metadata', {})
    history = metadata.setdefault('cleanup_history', [])
    history.append({
        'cleanup_date': datetime.now(timezone.utc).isoformat(),
        'cleanup_script': 'cleanup_person_web_claims.py',
        'claims_removed': len(stats['removed_high_risk']),
        'removal_reasons': [c['reason'] for c in stats['removed_high_risk']],
    })
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(profile, f, indent=2, ensure_ascii=False)
    return stats
def main():
    """CLI entry point: triage web claims across person profile files.

    By default this is a dry run that only reports what would change;
    pass --execute to actually rewrite profiles. A JSON log of the run
    is always written into the profile directory.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Clean up web claims from PERSON profiles')
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Do not modify files, just report (default: True)')
    parser.add_argument('--execute', action='store_true',
                        help='Actually modify files (overrides --dry-run)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Process only N files')
    parser.add_argument('--file', type=str, default=None,
                        help='Process a specific file')
    # Generalized: the profile directory used to be hard-coded to one
    # developer's machine; it is now configurable with the old default.
    parser.add_argument('--dir', type=str,
                        default='/Users/kempersc/apps/glam/data/person',
                        help='Directory containing person profile JSON files '
                             '(default: /Users/kempersc/apps/glam/data/person)')
    args = parser.parse_args()
    # Safety: files are only written when --execute is given explicitly.
    # --dry-run exists purely so invocations are self-documenting.
    dry_run = not args.execute

    person_dir = Path(args.dir)
    if args.file:
        files = [person_dir / args.file]
    else:
        files = sorted(person_dir.glob('ID_*.json'))
        if args.limit:
            files = files[:args.limit]

    print(f"{'DRY RUN - ' if dry_run else ''}Processing {len(files)} files...")
    print("=" * 80)
    total_stats = {
        'files_processed': 0,
        'files_with_claims': 0,
        'files_modified': 0,
        'claims_removed': 0,
        'claims_flagged_medium': 0,
        'claims_flagged_linkedin': 0,
        'claims_kept_safe': 0,
        'claims_unknown': 0,
    }
    removal_log = []
    for file_path in files:
        stats = process_profile(file_path, dry_run=dry_run)
        total_stats['files_processed'] += 1
        if stats.get('error'):
            print(f"ERROR: {file_path.name}: {stats['error']}")
            continue
        if stats['claims_before'] > 0:
            total_stats['files_with_claims'] += 1
        if stats['removed_high_risk']:
            total_stats['files_modified'] += 1
            total_stats['claims_removed'] += len(stats['removed_high_risk'])
            removal_log.append(stats)
            print(f"\n{file_path.name}:")
            print(f" Removed {len(stats['removed_high_risk'])} high-risk claims:")
            for claim in stats['removed_high_risk']:
                print(f" - {claim['type']}: {claim['value'][:50]}...")
                print(f" Source: {claim['source'][:60]}...")
        total_stats['claims_flagged_medium'] += len(stats['flagged_medium_risk'])
        total_stats['claims_flagged_linkedin'] += len(stats['flagged_linkedin'])
        total_stats['claims_kept_safe'] += len(stats['kept_safe'])
        total_stats['claims_unknown'] += len(stats['unknown'])

    # Print summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"Files with claims: {total_stats['files_with_claims']}")
    print(f"Files modified: {total_stats['files_modified']}")
    print(f"Claims removed (HIGH): {total_stats['claims_removed']}")
    print(f"Claims flagged (MED): {total_stats['claims_flagged_medium']}")
    print(f"Claims flagged (LI): {total_stats['claims_flagged_linkedin']}")
    print(f"Claims kept (SAFE): {total_stats['claims_kept_safe']}")
    print(f"Claims unknown: {total_stats['claims_unknown']}")
    if dry_run:
        print("\n*** DRY RUN - No files were modified ***")
        print("Run with --execute to apply changes")

    # Save removal log (always written, even on dry runs, for auditing).
    log_path = person_dir / '_web_claims_cleanup_log.json'
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump({
            'cleanup_date': datetime.now(timezone.utc).isoformat(),
            'dry_run': dry_run,
            'total_stats': total_stats,
            'removal_details': removal_log,
        }, f, indent=2, ensure_ascii=False)
    print(f"\nCleanup log saved to: {log_path}")


if __name__ == '__main__':
    main()