# glam/scripts/clean_person_data.py

#!/usr/bin/env python3
"""
Clean person data files:
1. Remove "is open to work" and similar suffixes from names
2. Filter out organization entries mistakenly added as staff
3. Add job_seeking_status metadata field

Usage:
    python scripts/clean_person_data.py --dry-run  # Preview changes
    python scripts/clean_person_data.py            # Apply changes
"""

import argparse
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

# Name suffixes that advertise job-seeking status. Every alternative requires
# leading whitespace and is anchored to the end of the string, so only a
# trailing suffix (optionally followed by whitespace) is matched.
JOB_SEEKING_PATTERNS = [
    r'\s+is open to work\s*$',
    r'\s+looking for work\s*$',
    r'\s+seeking opportunities\s*$',
    r'\s+actively seeking\s*$',
    r'\s+open for opportunities\s*$',
    r'\s+#OpenToWork\s*$',
]

# One case-insensitive matcher combining all of the alternatives above
JOB_SEEKING_REGEX = re.compile(
    '|'.join(JOB_SEEKING_PATTERNS),
    flags=re.IGNORECASE,
)


def is_organization_entry(name: str, custodian_name: str) -> bool:
    """Check if a staff entry is actually the organization itself.

    Both values are lower-cased and stripped before comparison.

    Args:
        name: Display name of the staff entry.
        custodian_name: Name of the organization the staff list belongs to.

    Returns:
        True when the normalized name equals the normalized custodian name,
        or when one contains the other and the staff name holds none of the
        human-profile indicator phrases (' at ', ' from ', ' with ',
        ' based in '). False otherwise, including when either value is
        missing, empty, or whitespace-only.
    """
    if not name or not custodian_name:
        return False

    # Normalize for comparison
    name_lower = name.lower().strip()
    custodian_lower = custodian_name.lower().strip()

    # Whitespace-only input normalizes to ''. The empty string is a substring
    # of every string, so without this guard a blank custodian name would
    # flag (and remove) nearly every staff entry, and a blank staff name
    # would always be treated as the organization.
    if not name_lower or not custodian_lower:
        return False

    # Direct match
    if name_lower == custodian_lower:
        return True

    # Containment in either direction (covers bilingual names such as
    # "ACP/ ICA- Archival Community for Palestine / التجمع الارشيفي - فلسطين").
    # NOTE(review): the reverse check also matches a short personal name that
    # happens to be a substring of the custodian name (e.g. "Art" inside
    # "Art Museum") — confirm this is intended.
    if custodian_lower in name_lower or name_lower in custodian_lower:
        # Phrases like "Curator at X" indicate a person's headline rather
        # than the organization's own name.
        human_name_indicators = (' at ', ' from ', ' with ', ' based in ')
        return not any(ind in name_lower for ind in human_name_indicators)

    return False


def clean_name(name: str) -> tuple[str, bool]:
    """
    Strip a trailing job-seeking suffix from a name.

    Returns (cleaned_name, is_job_seeking). When no suffix is present, or the
    name is empty/None, the name is returned unchanged with False.
    """
    if not name:
        return name, False

    # subn reports how many suffixes were removed, so one pass both detects
    # and strips them.
    without_suffix, removed = JOB_SEEKING_REGEX.subn('', name)
    if removed:
        return without_suffix.strip(), True

    return name, False


def clean_entity_file(filepath: Path, dry_run: bool = False) -> dict:
    """Clean a person entity JSON file.

    Strips job-seeking suffixes from ``source_staff_info.name`` and
    ``profile_data.name``. When a name changes and ``dry_run`` is False, the
    cleaned name and a ``job_seeking_status`` field are stored in that
    section, a ``_cleaning_metadata`` record is added, and the file is
    rewritten in place.

    Args:
        filepath: Path of the JSON file to clean.
        dry_run: When True, only report the changes; the file is not written.

    Returns:
        A dict with ``'file'`` (the path as a string) and ``'changes'`` (a
        list of human-readable change descriptions, empty when nothing
        needed cleaning).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    changes = []

    # Both sections carry a display name and are cleaned identically.
    for section_key in ('source_staff_info', 'profile_data'):
        section = data.get(section_key) if isinstance(data, dict) else None
        # A section that is absent, JSON null, or not an object has no name
        # to clean; skipping it avoids a TypeError on the membership test.
        if not isinstance(section, dict) or 'name' not in section:
            continue

        orig_name = section['name']
        # Only strings can carry a suffix; the regex would raise on others.
        if not isinstance(orig_name, str):
            continue

        cleaned, is_seeking = clean_name(orig_name)
        if cleaned == orig_name:
            continue

        changes.append(f"{section_key}.name: '{orig_name}' -> '{cleaned}'")
        if not dry_run:
            section['name'] = cleaned
            section['job_seeking_status'] = 'open_to_work' if is_seeking else None

    if changes and not dry_run:
        # datetime.utcnow() is deprecated; take an aware UTC time and drop
        # tzinfo so the stored value keeps the same "<naive ISO>Z" format.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        data['_cleaning_metadata'] = {
            'cleaned_date': now_utc.isoformat() + 'Z',
            'changes_applied': changes,
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return {'file': str(filepath), 'changes': changes}


def clean_staff_file(filepath: Path, dry_run: bool = False) -> dict:
    """Clean a staff list JSON file, removing org entries and cleaning names."""
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    changes = []
    custodian_name = data.get('custodian_metadata', {}).get('custodian_name', '')

    if 'staff' not in data:
        return {'file': str(filepath), 'changes': []}

    original_count = len(data['staff'])
    filtered_staff = []

    for staff in data['staff']:
        name = staff.get('name', '')

        # Check if this is the organization itself
        if is_organization_entry(name, custodian_name):
            changes.append(f"REMOVED ORG ENTRY: '{name}'")
            continue

        # Clean job seeking from name
        cleaned, is_seeking = clean_name(name)
        if cleaned != name:
            changes.append(f"CLEANED NAME: '{name}' -> '{cleaned}'")
            if not dry_run:
                staff['name'] = cleaned
                staff['job_seeking_status'] = 'open_to_work' if is_seeking else None

        filtered_staff.append(staff)

    if not dry_run and changes:
        data['staff'] = filtered_staff
        data['staff_analysis']['total_staff_extracted'] = len(filtered_staff)
        data['_cleaning_metadata'] = {
            'cleaned_date': datetime.utcnow().isoformat() + 'Z',
            'original_staff_count': original_count,
            'final_staff_count': len(filtered_staff),
            'changes_applied': changes
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return {'file': str(filepath), 'changes': changes}


def _print_file_changes(filename: str, changes: list) -> None:
    """Print one file's name followed by its list of changes."""
    print(f"\n{filename}:")
    for change in changes:
        print(f"  - {change}")


def main():
    """Run the cleaner over all entity and staff files and print a summary.

    Command-line options:
        --dry-run    Report the changes without writing any file.
        --base-path  Root directory of the person data; defaults to the
                     original hard-coded location.

    Entity files are read from ``<base>/entity/*.json`` and staff files from
    ``<base>/affiliated/parsed/*_staff_*.json``. Directories that do not
    exist are skipped.
    """
    parser = argparse.ArgumentParser(description='Clean person data files')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without applying')
    parser.add_argument(
        '--base-path',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian/person'),
        help='Root directory of the person data (default: %(default)s)',
    )
    args = parser.parse_args()

    base_path = args.base_path

    # Track statistics
    total_files = 0
    files_with_changes = 0
    total_changes = 0
    org_entries_removed = 0
    names_cleaned = 0

    # Clean entity files (sorted so the report order is deterministic)
    entity_dir = base_path / 'entity'
    if entity_dir.exists():
        for filepath in sorted(entity_dir.glob('*.json')):
            result = clean_entity_file(filepath, args.dry_run)
            total_files += 1
            changes = result['changes']
            if not changes:
                continue
            files_with_changes += 1
            total_changes += len(changes)
            names_cleaned += sum(1 for c in changes if 'name' in c.lower())
            # Changes are listed in both dry-run and apply mode.
            _print_file_changes(filepath.name, changes)

    # Clean staff files
    parsed_dir = base_path / 'affiliated' / 'parsed'
    if parsed_dir.exists():
        for filepath in sorted(parsed_dir.glob('*_staff_*.json')):
            result = clean_staff_file(filepath, args.dry_run)
            total_files += 1
            changes = result['changes']
            if not changes:
                continue
            files_with_changes += 1
            total_changes += len(changes)
            org_entries_removed += sum(1 for c in changes if 'REMOVED ORG' in c)
            names_cleaned += sum(1 for c in changes if 'CLEANED NAME' in c)
            _print_file_changes(filepath.name, changes)

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files scanned: {total_files}")
    print(f"Files with changes: {files_with_changes}")
    print(f"Total changes: {total_changes}")
    print(f"  - Names cleaned (job seeking removed): {names_cleaned}")
    print(f"  - Organization entries removed: {org_entries_removed}")

    if args.dry_run:
        print("\n[DRY RUN - No changes applied. Run without --dry-run to apply.]")
    else:
        print("\n[Changes applied successfully]")


if __name__ == '__main__':
    main()