#!/usr/bin/env python3
"""
Revert auto-merged WCMS data from LinkedIn entity files.

CRITICAL: Only CONFIRMED matches (review_decision == 'match') should have WCMS data.
This script removes wcms_identifiers, wcms_activity, contact_details from files
that were auto-merged without human review.

Usage:
    python scripts/revert_auto_merged_wcms.py --dry-run  # Preview changes
    python scripts/revert_auto_merged_wcms.py            # Apply changes
"""

import argparse
import json
from pathlib import Path
from datetime import datetime, timezone

def _load_confirmed_slugs(candidates_file):
    """Return the set of LinkedIn slugs confirmed as matches by a reviewer.

    Only candidates whose review_decision is exactly 'match' count; entries
    without a linkedin_slug are skipped.
    """
    # encoding pinned: candidate data may contain non-ASCII names
    with open(candidates_file, encoding='utf-8') as f:
        data = json.load(f)
    return {
        c['linkedin_slug']
        for c in data.get('candidates', [])
        if c.get('review_decision') == 'match' and c.get('linkedin_slug')
    }


def _strip_wcms(entity_data):
    """Remove auto-merged WCMS fields from an entity dict, in place.

    Drops wcms_identifiers / wcms_activity / contact_details, removes the
    'wcms' entry from data_sources, and records a revert note under
    extraction_metadata so the change is auditable.
    """
    entity_data.pop('wcms_identifiers', None)
    entity_data.pop('wcms_activity', None)
    entity_data.pop('contact_details', None)

    # Remove 'wcms' from data_sources if present
    if 'data_sources' in entity_data:
        entity_data['data_sources'] = [
            s for s in entity_data['data_sources'] if s != 'wcms'
        ]

    # Add revert note
    entity_data.setdefault('extraction_metadata', {})['wcms_reverted'] = {
        'reverted_at': datetime.now(timezone.utc).isoformat(),
        'reason': 'Auto-merged without human review - reverted per data quality rules'
    }


def main():
    """Revert WCMS data from entity files auto-merged without human review.

    Entity files whose slug is NOT in the confirmed-match set have their
    WCMS fields stripped and are rewritten in place; confirmed matches are
    left untouched. Pass --dry-run to preview which files would change.

    Paths are relative to the working directory the script is run from.
    """
    parser = argparse.ArgumentParser(description='Revert auto-merged WCMS data')
    parser.add_argument('--dry-run', action='store_true', help='Preview without making changes')
    args = parser.parse_args()

    # Paths
    candidates_file = Path('data/entity_resolution/entity_resolution_candidates.json')
    entity_dir = Path('data/custodian/person/entity')

    # Load confirmed matches (review_decision == 'match')
    print(f"Loading candidates from {candidates_file}...")
    confirmed_slugs = _load_confirmed_slugs(candidates_file)
    print(f"Confirmed match slugs: {len(confirmed_slugs)}")

    # Process entity files
    files_checked = 0
    files_with_wcms = 0
    files_confirmed = 0
    files_reverted = 0
    files_errors = 0

    for f in sorted(entity_dir.glob('*.json')):
        # Underscore-prefixed files are not entity records
        if f.name.startswith('_'):
            continue

        files_checked += 1

        try:
            with open(f, encoding='utf-8') as fp:
                entity_data = json.load(fp)

            # Check if file has WCMS data
            if not entity_data.get('wcms_identifiers'):
                continue

            files_with_wcms += 1

            # Extract slug from filename (remove timestamp)
            slug = f.stem.rsplit('_', 1)[0]

            if slug in confirmed_slugs:
                files_confirmed += 1
                continue  # Keep WCMS data for confirmed matches

            # This file was auto-merged - REVERT IT
            files_reverted += 1

            if args.dry_run:
                print(f"  Would revert: {f.name}")
                continue

            _strip_wcms(entity_data)

            # Write back; utf-8 required because ensure_ascii=False may
            # emit non-ASCII characters regardless of the locale default
            with open(f, 'w', encoding='utf-8') as fp:
                json.dump(entity_data, fp, indent=2, ensure_ascii=False)

        except Exception as e:
            # Best-effort batch job: record the failure and keep going
            files_errors += 1
            print(f"  Error processing {f.name}: {e}")

    # Summary
    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f"  Files checked: {files_checked:,}")
    print(f"  Files with WCMS data: {files_with_wcms:,}")
    print(f"  Files from confirmed matches (KEPT): {files_confirmed:,}")
    print(f"  Files reverted (auto-merged): {files_reverted:,}")
    print(f"  Errors: {files_errors}")

    if args.dry_run:
        print("\nRun without --dry-run to apply changes")
# Script entry point: run the revert when executed directly.
if __name__ == '__main__':
    main()