#!/usr/bin/env python3 """ Clean person data files: 1. Remove "is open to work" and similar suffixes from names 2. Filter out organization entries mistakenly added as staff 3. Add job_seeking_status metadata field Usage: python scripts/clean_person_data.py --dry-run # Preview changes python scripts/clean_person_data.py # Apply changes """ import argparse import json import os import re from pathlib import Path from datetime import datetime, timezone from typing import Any # Patterns to remove from names and add as metadata JOB_SEEKING_PATTERNS = [ r'\s+is open to work\s*$', r'\s+is hiring\s*$', r'\s+looking for work\s*$', r'\s+seeking opportunities\s*$', r'\s+actively seeking\s*$', r'\s+open for opportunities\s*$', r'\s+#OpenToWork\s*$', r'\s+#Hiring\s*$', ] # Compiled regex for job seeking detection JOB_SEEKING_REGEX = re.compile('|'.join(JOB_SEEKING_PATTERNS), re.IGNORECASE) def is_organization_entry(name: str, custodian_name: str) -> bool: """Check if a staff entry is actually the organization itself.""" if not name or not custodian_name: return False # Normalize for comparison name_lower = name.lower().strip() custodian_lower = custodian_name.lower().strip() # Direct match if name_lower == custodian_lower: return True # Check if name contains the custodian name (for bilingual names) # e.g., "ACP/ ICA- Archival Community for Palestine / التجمع الارشيفي - فلسطين" if custodian_lower in name_lower or name_lower in custodian_lower: # Additional check: organization names typically don't have human name patterns human_name_indicators = [' at ', ' from ', ' with ', ' based in '] if not any(ind in name_lower for ind in human_name_indicators): return True return False def clean_name(name: str) -> tuple[str, str | None]: """ Clean job seeking status from name. Returns (cleaned_name, status_type or None). status_type is 'hiring' or 'open_to_work'. """ if not name: return name, None # Check for job seeking patterns match = JOB_SEEKING_REGEX.search(name) if match: matched_text = match.group(0).lower() cleaned = JOB_SEEKING_REGEX.sub('', name).strip() # Determine status type if 'hiring' in matched_text: return cleaned, 'hiring' else: return cleaned, 'open_to_work' return name, None def clean_entity_file(filepath: Path, dry_run: bool = False) -> dict: """Clean a person entity JSON file.""" with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) changes = [] status_overall = None # Clean source_staff_info.name if 'source_staff_info' in data and 'name' in data['source_staff_info']: orig_name = data['source_staff_info']['name'] cleaned, status = clean_name(orig_name) if cleaned != orig_name: changes.append(f"source_staff_info.name: '{orig_name}' -> '{cleaned}'") status_overall = status_overall or status if not dry_run: data['source_staff_info']['name'] = cleaned data['source_staff_info']['job_seeking_status'] = status # Clean profile_data.name if 'profile_data' in data and 'name' in data['profile_data']: orig_name = data['profile_data']['name'] cleaned, status = clean_name(orig_name) if cleaned != orig_name: changes.append(f"profile_data.name: '{orig_name}' -> '{cleaned}'") status_overall = status_overall or status if not dry_run: data['profile_data']['name'] = cleaned data['profile_data']['job_seeking_status'] = status # Clean profile_data.full_name (some entity files use this instead of name) if 'profile_data' in data and 'full_name' in data['profile_data']: orig_name = data['profile_data']['full_name'] cleaned, status = clean_name(orig_name) if cleaned != orig_name: changes.append(f"profile_data.full_name: '{orig_name}' -> '{cleaned}'") status_overall = status_overall or status if not dry_run: data['profile_data']['full_name'] = cleaned data['profile_data']['job_seeking_status'] = status # Clean profile_data.last_name (may also contain the suffix) if 'profile_data' in data and 'last_name' in data['profile_data']: orig_name = data['profile_data']['last_name'] cleaned, status = clean_name(orig_name) if cleaned != orig_name: changes.append(f"profile_data.last_name: '{orig_name}' -> '{cleaned}'") status_overall = status_overall or status if not dry_run: data['profile_data']['last_name'] = cleaned if changes and not dry_run: data['_cleaning_metadata'] = { 'cleaned_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), 'changes_applied': changes } with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) return {'file': str(filepath), 'changes': changes} def clean_staff_file(filepath: Path, dry_run: bool = False) -> dict: """Clean a staff list JSON file, removing org entries and cleaning names.""" with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) changes = [] custodian_name = data.get('custodian_metadata', {}).get('custodian_name', '') if 'staff' not in data: return {'file': str(filepath), 'changes': []} original_count = len(data['staff']) filtered_staff = [] for staff in data['staff']: name = staff.get('name', '') # Check if this is the organization itself if is_organization_entry(name, custodian_name): changes.append(f"REMOVED ORG ENTRY: '{name}'") continue # Clean job seeking from name cleaned, status = clean_name(name) if cleaned != name: changes.append(f"CLEANED NAME: '{name}' -> '{cleaned}' (status: {status})") if not dry_run: staff['name'] = cleaned staff['job_seeking_status'] = status # Also clean headline field if present if 'headline' in staff: headline = staff['headline'] cleaned_headline, _ = clean_name(headline) if cleaned_headline != headline: changes.append(f"CLEANED HEADLINE: '{headline}' -> '{cleaned_headline}'") if not dry_run: staff['headline'] = cleaned_headline filtered_staff.append(staff) if not dry_run and changes: data['staff'] = filtered_staff if 'staff_analysis' in data: data['staff_analysis']['total_staff_extracted'] = len(filtered_staff) data['_cleaning_metadata'] = { 'cleaned_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), 'original_staff_count': original_count, 'final_staff_count': len(filtered_staff), 'changes_applied': changes } with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) return {'file': str(filepath), 'changes': changes} def clean_generic_json_file(filepath: Path, dry_run: bool = False) -> dict: """Clean any JSON file by recursively finding and cleaning name-like fields and keys.""" with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) changes = [] updates = [] # Store updates to apply after iteration key_renames = [] # Store (obj, old_key, new_key) tuples for key renaming def clean_recursive(obj, path=""): """Recursively clean name fields and keys in any JSON structure.""" nonlocal changes, updates, key_renames if isinstance(obj, dict): keys = list(obj.keys()) # Create a copy of keys to avoid mutation during iteration for key in keys: value = obj[key] current_path = f"{path}.{key}" if path else key # Clean dictionary keys that look like dirty names (e.g., in name_to_slug mappings) if isinstance(key, str): cleaned_key, _ = clean_name(key) if cleaned_key != key: changes.append(f"RENAMED KEY at {path}: '{key}' -> '{cleaned_key}'") if not dry_run: key_renames.append((obj, key, cleaned_key)) # Clean name-like field values if key in ('name', 'full_name', 'headline', 'last_name') and isinstance(value, str): cleaned, status = clean_name(value) if cleaned != value: changes.append(f"{current_path}: '{value}' -> '{cleaned}'") if not dry_run: updates.append((obj, key, cleaned, status if key == 'name' else None)) elif isinstance(value, (dict, list)): clean_recursive(value, current_path) elif isinstance(obj, list): for i, item in enumerate(obj): current_path = f"{path}[{i}]" if isinstance(item, str): # Check if this looks like a name with job seeking text cleaned, _ = clean_name(item) if cleaned != item: changes.append(f"{current_path}: '{item}' -> '{cleaned}'") if not dry_run: obj[i] = cleaned elif isinstance(item, (dict, list)): clean_recursive(item, current_path) clean_recursive(data) # Apply key renames after iteration is complete (to avoid dict mutation during iteration) for obj, old_key, new_key in key_renames: if old_key in obj: # Check key still exists obj[new_key] = obj.pop(old_key) # Apply value updates after iteration is complete for obj, key, cleaned, status in updates: obj[key] = cleaned if status: obj['job_seeking_status'] = status if changes and not dry_run: data['_cleaning_metadata'] = { 'cleaned_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), 'changes_applied': changes } with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) return {'file': str(filepath), 'changes': changes} def main(): parser = argparse.ArgumentParser(description='Clean person data files') parser.add_argument('--dry-run', action='store_true', help='Preview changes without applying') args = parser.parse_args() base_path = Path('/Users/kempersc/apps/glam/data/custodian/person') # Track statistics total_files = 0 files_with_changes = 0 total_changes = 0 org_entries_removed = 0 names_cleaned = 0 # Clean entity files entity_dir = base_path / 'entity' if entity_dir.exists(): for filepath in entity_dir.glob('*.json'): result = clean_entity_file(filepath, args.dry_run) total_files += 1 if result['changes']: files_with_changes += 1 total_changes += len(result['changes']) names_cleaned += sum(1 for c in result['changes'] if 'name' in c.lower()) if not args.dry_run or args.dry_run: print(f"\n{filepath.name}:") for change in result['changes']: print(f" - {change}") # Clean staff files from both directories # 1. affiliated/parsed/ - legacy parsed staff files # 2. bu/ - current staff files used by Qdrant sync staff_dirs = [ base_path / 'affiliated' / 'parsed', base_path / 'bu', ] for staff_dir in staff_dirs: if staff_dir.exists(): print(f"\nProcessing staff directory: {staff_dir}") for filepath in staff_dir.glob('*_staff_*.json'): result = clean_staff_file(filepath, args.dry_run) total_files += 1 if result['changes']: files_with_changes += 1 total_changes += len(result['changes']) org_entries_removed += sum(1 for c in result['changes'] if 'REMOVED ORG' in c) names_cleaned += sum(1 for c in result['changes'] if 'CLEANED NAME' in c or 'CLEANED HEADLINE' in c) if not args.dry_run or args.dry_run: print(f"\n{filepath.name}:") for change in result['changes']: print(f" - {change}") # Clean other JSON files in bu/ directory (not matching *_staff_*.json) bu_dir = base_path / 'bu' if bu_dir.exists(): print(f"\nProcessing other JSON files in bu/:") for filepath in bu_dir.glob('*.json'): if '_staff_' not in filepath.name: # Skip staff files already processed result = clean_generic_json_file(filepath, args.dry_run) total_files += 1 if result['changes']: files_with_changes += 1 total_changes += len(result['changes']) names_cleaned += len(result['changes']) if not args.dry_run or args.dry_run: print(f"\n{filepath.name}:") for change in result['changes']: print(f" - {change}") # Print summary print("\n" + "=" * 60) print("SUMMARY") print("=" * 60) print(f"Total files scanned: {total_files}") print(f"Files with changes: {files_with_changes}") print(f"Total changes: {total_changes}") print(f" - Names cleaned (job seeking removed): {names_cleaned}") print(f" - Organization entries removed: {org_entries_removed}") if args.dry_run: print("\n[DRY RUN - No changes applied. Run without --dry-run to apply.]") else: print("\n[Changes applied successfully]") if __name__ == '__main__': main()