#!/usr/bin/env python3
"""
Revert auto-merged WCMS data from LinkedIn entity files.

CRITICAL: Only CONFIRMED matches (review_decision == 'match') should have WCMS data.
This script removes wcms_identifiers, wcms_activity, contact_details from files
that were auto-merged without human review.

Usage:
    python scripts/revert_auto_merged_wcms.py --dry-run  # Preview changes
    python scripts/revert_auto_merged_wcms.py            # Apply changes
"""

import argparse
import json
from pathlib import Path
from datetime import datetime, timezone

def _load_confirmed_slugs(candidates_file):
    """Return the set of LinkedIn slugs confirmed as matches by a reviewer.

    Only candidates whose review_decision is exactly 'match' count; entries
    without a linkedin_slug are skipped.
    """
    # encoding pinned: candidate data may contain non-ASCII names
    with open(candidates_file, encoding='utf-8') as f:
        data = json.load(f)
    return {
        c['linkedin_slug']
        for c in data.get('candidates', [])
        if c.get('review_decision') == 'match' and c.get('linkedin_slug')
    }


def _strip_wcms(entity_data):
    """Remove auto-merged WCMS fields from an entity dict, in place.

    Drops wcms_identifiers / wcms_activity / contact_details, removes the
    'wcms' entry from data_sources, and records a revert note under
    extraction_metadata so the change is auditable.
    """
    entity_data.pop('wcms_identifiers', None)
    entity_data.pop('wcms_activity', None)
    entity_data.pop('contact_details', None)

    # Remove 'wcms' from data_sources if present
    if 'data_sources' in entity_data:
        entity_data['data_sources'] = [
            s for s in entity_data['data_sources'] if s != 'wcms'
        ]

    # Add revert note
    entity_data.setdefault('extraction_metadata', {})['wcms_reverted'] = {
        'reverted_at': datetime.now(timezone.utc).isoformat(),
        'reason': 'Auto-merged without human review - reverted per data quality rules'
    }


def main():
    """Revert WCMS data from entity files auto-merged without human review.

    Entity files whose slug is NOT in the confirmed-match set have their
    WCMS fields stripped and are rewritten in place; confirmed matches are
    left untouched. Pass --dry-run to preview which files would change.

    Paths are relative to the working directory the script is run from.
    """
    parser = argparse.ArgumentParser(description='Revert auto-merged WCMS data')
    parser.add_argument('--dry-run', action='store_true', help='Preview without making changes')
    args = parser.parse_args()

    # Paths
    candidates_file = Path('data/entity_resolution/entity_resolution_candidates.json')
    entity_dir = Path('data/custodian/person/entity')

    # Load confirmed matches (review_decision == 'match')
    print(f"Loading candidates from {candidates_file}...")
    confirmed_slugs = _load_confirmed_slugs(candidates_file)
    print(f"Confirmed match slugs: {len(confirmed_slugs)}")

    # Process entity files
    files_checked = 0
    files_with_wcms = 0
    files_confirmed = 0
    files_reverted = 0
    files_errors = 0

    for f in sorted(entity_dir.glob('*.json')):
        # Underscore-prefixed files are not entity records
        if f.name.startswith('_'):
            continue

        files_checked += 1

        try:
            with open(f, encoding='utf-8') as fp:
                entity_data = json.load(fp)

            # Check if file has WCMS data
            if not entity_data.get('wcms_identifiers'):
                continue

            files_with_wcms += 1

            # Extract slug from filename (remove timestamp)
            slug = f.stem.rsplit('_', 1)[0]

            if slug in confirmed_slugs:
                files_confirmed += 1
                continue  # Keep WCMS data for confirmed matches

            # This file was auto-merged - REVERT IT
            files_reverted += 1

            if args.dry_run:
                print(f"  Would revert: {f.name}")
                continue

            _strip_wcms(entity_data)

            # Write back; utf-8 required because ensure_ascii=False may
            # emit non-ASCII characters regardless of the locale default
            with open(f, 'w', encoding='utf-8') as fp:
                json.dump(entity_data, fp, indent=2, ensure_ascii=False)

        except Exception as e:
            # Best-effort batch job: record the failure and keep going
            files_errors += 1
            print(f"  Error processing {f.name}: {e}")

    # Summary
    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f"  Files checked: {files_checked:,}")
    print(f"  Files with WCMS data: {files_with_wcms:,}")
    print(f"  Files from confirmed matches (KEPT): {files_confirmed:,}")
    print(f"  Files reverted (auto-merged): {files_reverted:,}")
    print(f"  Errors: {files_errors}")

    if args.dry_run:
        print("\nRun without --dry-run to apply changes")
# Script entry point: run the revert when executed directly.
if __name__ == '__main__':
    main()