glam/scripts/revert_auto_merged_wcms.py
kempersc 7424b85352 Add new slots for heritage custodian entities
- Introduced setpoint_max, setpoint_min, setpoint_tolerance, setpoint_type, setpoint_unit, setpoint_value, temperature_target, track_id, typical_http_methods, typical_metadata_standard, typical_response_formats, typical_scope, typical_technical_feature, unit_code, unit_symbol, unit_type, wikidata_entity, wikidata_equivalent, and wikidata_id slots.
- Each slot includes a unique identifier, name, title, description, and annotations for custodian types and specificity score.
2026-01-16 01:04:38 +01:00

121 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Revert auto-merged WCMS data from LinkedIn entity files.
CRITICAL: Only CONFIRMED matches (review_decision == 'match') should have WCMS data.
This script removes wcms_identifiers, wcms_activity, contact_details from files
that were auto-merged without human review.
Usage:
python scripts/revert_auto_merged_wcms.py --dry-run # Preview changes
python scripts/revert_auto_merged_wcms.py # Apply changes
"""
import argparse
import json
from pathlib import Path
from datetime import datetime, timezone
def _load_confirmed_slugs(candidates_file: Path) -> set:
    """Return the set of LinkedIn slugs whose review_decision is 'match'.

    Only these slugs are allowed to keep WCMS data; everything else was
    auto-merged without human review.
    """
    # encoding='utf-8' so the read does not depend on the platform locale
    with open(candidates_file, encoding='utf-8') as f:
        data = json.load(f)
    return {
        c['linkedin_slug']
        for c in data.get('candidates', [])
        if c.get('review_decision') == 'match' and c.get('linkedin_slug')
    }


def _revert_entity(entity_data: dict) -> None:
    """Strip auto-merged WCMS fields from an entity dict, in place.

    Removes wcms_identifiers / wcms_activity / contact_details, drops the
    'wcms' entry from data_sources, and records an audit note under
    extraction_metadata.wcms_reverted.
    """
    entity_data.pop('wcms_identifiers', None)
    entity_data.pop('wcms_activity', None)
    entity_data.pop('contact_details', None)
    # Remove 'wcms' from data_sources if present
    if 'data_sources' in entity_data:
        entity_data['data_sources'] = [
            s for s in entity_data['data_sources'] if s != 'wcms'
        ]
    # Audit trail: when and why the WCMS data was removed
    entity_data.setdefault('extraction_metadata', {})['wcms_reverted'] = {
        'reverted_at': datetime.now(timezone.utc).isoformat(),
        'reason': 'Auto-merged without human review - reverted per data quality rules'
    }


def main():
    """Revert auto-merged WCMS data from LinkedIn entity files.

    Entities whose LinkedIn slug is a CONFIRMED match (review_decision ==
    'match') keep their WCMS fields; any other entity file carrying
    wcms_identifiers is reverted in place. With --dry-run, only reports
    what would change.
    """
    parser = argparse.ArgumentParser(description='Revert auto-merged WCMS data')
    parser.add_argument('--dry-run', action='store_true', help='Preview without making changes')
    args = parser.parse_args()

    # Paths (relative to the repo root — run the script from there)
    candidates_file = Path('data/entity_resolution/entity_resolution_candidates.json')
    entity_dir = Path('data/custodian/person/entity')

    # Load confirmed matches (review_decision == 'match')
    print(f"Loading candidates from {candidates_file}...")
    confirmed_slugs = _load_confirmed_slugs(candidates_file)
    print(f"Confirmed match slugs: {len(confirmed_slugs)}")

    # Process entity files
    files_checked = 0
    files_with_wcms = 0
    files_confirmed = 0
    files_reverted = 0
    files_errors = 0
    for f in sorted(entity_dir.glob('*.json')):
        if f.name.startswith('_'):
            continue  # skip index/meta files like _index.json
        files_checked += 1
        try:
            # Explicit UTF-8: entity files contain non-ASCII names and are
            # written below with ensure_ascii=False
            entity_data = json.loads(f.read_text(encoding='utf-8'))
            # Check if file has WCMS data
            if not entity_data.get('wcms_identifiers'):
                continue
            files_with_wcms += 1
            # Extract slug from filename (filename is <slug>_<timestamp>.json)
            slug = f.stem.rsplit('_', 1)[0]
            if slug in confirmed_slugs:
                files_confirmed += 1
                continue  # Keep WCMS data for confirmed matches
            # This file was auto-merged - REVERT IT
            files_reverted += 1
            if args.dry_run:
                print(f" Would revert: {f.name}")
                continue
            _revert_entity(entity_data)
            # Write back. encoding='utf-8' is required here: with
            # ensure_ascii=False, json.dump emits raw non-ASCII characters,
            # which would raise UnicodeEncodeError (and truncate the file)
            # under a non-UTF-8 default locale such as Windows cp1252.
            with open(f, 'w', encoding='utf-8') as fp:
                json.dump(entity_data, fp, indent=2, ensure_ascii=False)
        except Exception as e:
            # Best-effort batch job: record the failure and keep going
            files_errors += 1
            print(f" Error processing {f.name}: {e}")

    # Summary
    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Files checked: {files_checked:,}")
    print(f" Files with WCMS data: {files_with_wcms:,}")
    print(f" Files from confirmed matches (KEPT): {files_confirmed:,}")
    print(f" Files reverted (auto-merged): {files_reverted:,}")
    print(f" Errors: {files_errors}")
    if args.dry_run:
        print(f"\nRun without --dry-run to apply changes")
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()