#!/usr/bin/env python3
"""
Revert auto-merged WCMS data from LinkedIn entity files.

CRITICAL: Only CONFIRMED matches (review_decision == 'match') should have WCMS data.
This script removes wcms_identifiers, wcms_activity, contact_details from files
that were auto-merged without human review.

Usage:
    python scripts/revert_auto_merged_wcms.py --dry-run  # Preview changes
    python scripts/revert_auto_merged_wcms.py            # Apply changes
"""
import argparse
import json
from datetime import datetime, timezone
from pathlib import Path


def _load_confirmed_slugs(candidates_file: Path) -> set:
    """Return the LinkedIn slugs whose candidate has review_decision == 'match'.

    Only these slugs are allowed to keep WCMS data; every other entity file
    carrying WCMS fields was auto-merged and must be reverted.
    """
    print(f"Loading candidates from {candidates_file}...")
    # Explicit utf-8: entity data may contain non-ASCII names, and the platform
    # default encoding (e.g. cp1252 on Windows) would fail to decode them.
    with open(candidates_file, encoding='utf-8') as f:
        data = json.load(f)
    return {
        c['linkedin_slug']
        for c in data.get('candidates', [])
        if c.get('review_decision') == 'match' and c.get('linkedin_slug')
    }


def _strip_wcms(entity_data: dict) -> None:
    """Remove WCMS-derived fields from *entity_data* in place and record a revert note."""
    entity_data.pop('wcms_identifiers', None)
    entity_data.pop('wcms_activity', None)
    entity_data.pop('contact_details', None)
    # Drop 'wcms' from data_sources if present, keeping the others in order.
    if 'data_sources' in entity_data:
        entity_data['data_sources'] = [
            s for s in entity_data['data_sources'] if s != 'wcms'
        ]
    # Leave an audit trail of when and why the data was removed.
    entity_data.setdefault('extraction_metadata', {})['wcms_reverted'] = {
        'reverted_at': datetime.now(timezone.utc).isoformat(),
        'reason': 'Auto-merged without human review - reverted per data quality rules'
    }


def main():
    """Scan entity files and revert WCMS data on any file not human-confirmed.

    With --dry-run, prints what would be reverted without writing; the
    'Files reverted' count in the summary includes dry-run previews.
    """
    parser = argparse.ArgumentParser(description='Revert auto-merged WCMS data')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview without making changes')
    args = parser.parse_args()

    # Paths (relative to repository root)
    candidates_file = Path('data/entity_resolution/entity_resolution_candidates.json')
    entity_dir = Path('data/custodian/person/entity')

    confirmed_slugs = _load_confirmed_slugs(candidates_file)
    print(f"Confirmed match slugs: {len(confirmed_slugs)}")

    files_checked = 0
    files_with_wcms = 0
    files_confirmed = 0
    files_reverted = 0
    files_errors = 0

    for f in sorted(entity_dir.glob('*.json')):
        # Underscore-prefixed files are index/metadata files, not entities.
        if f.name.startswith('_'):
            continue
        files_checked += 1
        try:
            with open(f, encoding='utf-8') as fp:
                entity_data = json.load(fp)

            # Only files carrying WCMS data are candidates for reverting.
            if not entity_data.get('wcms_identifiers'):
                continue
            files_with_wcms += 1

            # Filename format is <slug>_<timestamp>.json; drop the timestamp.
            slug = f.stem.rsplit('_', 1)[0]
            if slug in confirmed_slugs:
                files_confirmed += 1
                continue  # Keep WCMS data for confirmed matches

            # This file was auto-merged - REVERT IT
            files_reverted += 1
            if args.dry_run:
                print(f" Would revert: {f.name}")
                continue

            _strip_wcms(entity_data)

            # Write back with utf-8 to match ensure_ascii=False; without an
            # explicit encoding this raised UnicodeEncodeError on platforms
            # whose default codec cannot represent the entity's characters.
            with open(f, 'w', encoding='utf-8') as fp:
                json.dump(entity_data, fp, indent=2, ensure_ascii=False)

        except Exception as e:
            # Best-effort batch job: record the failure and keep going.
            files_errors += 1
            print(f" Error processing {f.name}: {e}")

    # Summary
    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Files checked: {files_checked:,}")
    print(f" Files with WCMS data: {files_with_wcms:,}")
    print(f" Files from confirmed matches (KEPT): {files_confirmed:,}")
    print(f" Files reverted (auto-merged): {files_reverted:,}")
    print(f" Errors: {files_errors}")

    if args.dry_run:
        print(f"\nRun without --dry-run to apply changes")


if __name__ == '__main__':
    main()