#!/usr/bin/env python3
"""
Clean person data files:
1. Remove "is open to work" and similar suffixes from names
2. Filter out organization entries mistakenly added as staff
3. Add job_seeking_status metadata field

Usage:
    python scripts/clean_person_data.py --dry-run   # Preview changes
    python scripts/clean_person_data.py             # Apply changes
"""

import argparse
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

# Trailing suffixes the scraping source appends to person names; each is
# stripped from the name and recorded as job-seeking metadata instead.
JOB_SEEKING_PATTERNS = [
    r'\s+is open to work\s*$',
    r'\s+is hiring\s*$',
    r'\s+looking for work\s*$',
    r'\s+seeking opportunities\s*$',
    r'\s+actively seeking\s*$',
    r'\s+open for opportunities\s*$',
    r'\s+#OpenToWork\s*$',
    r'\s+#Hiring\s*$',
]

# One case-insensitive alternation, compiled once at import time.
JOB_SEEKING_REGEX = re.compile('|'.join(JOB_SEEKING_PATTERNS), re.IGNORECASE)

def is_organization_entry(name: str, custodian_name: str) -> bool:
    """Return True when a staff entry's name is really the organization itself.

    Compares the entry name with the custodian (organization) name
    case-insensitively: an exact match, or containment in either direction
    (which covers bilingual organization names), marks the entry as an org.
    """
    if not name or not custodian_name:
        return False

    entry = name.lower().strip()
    org = custodian_name.lower().strip()

    if entry == org:
        return True

    # Containment in either direction handles bilingual listings such as
    # "ACP/ ICA- Archival Community for Palestine / التجمع الارشيفي - فلسطين".
    # Connectors like " at " or " based in " suggest a real person affiliated
    # with the org, so entries containing them are kept.
    person_markers = (' at ', ' from ', ' with ', ' based in ')
    if org in entry or entry in org:
        if not any(marker in entry for marker in person_markers):
            return True

    return False

def clean_name(name: str) -> tuple[str, str | None]:
    """Strip a trailing job-seeking suffix from *name*.

    Returns a ``(cleaned_name, status)`` pair where *status* is
    ``'hiring'``, ``'open_to_work'``, or ``None`` when nothing matched.
    """
    if not name:
        return name, None

    match = JOB_SEEKING_REGEX.search(name)
    if match is None:
        return name, None

    cleaned = JOB_SEEKING_REGEX.sub('', name).strip()
    # "hiring" anywhere in the matched suffix (covers "#Hiring" too) means
    # the person is recruiting; everything else means they seek work.
    status = 'hiring' if 'hiring' in match.group(0).lower() else 'open_to_work'
    return cleaned, status

def _clean_name_field(section: dict, key: str, label: str,
                      changes: list, dry_run: bool,
                      record_status: bool = True) -> None:
    """Clean one name-like field of *section* in place.

    Appends a human-readable description to *changes* when the value was
    modified.  Unless *dry_run*, writes the cleaned value back and, when
    *record_status*, stores the detected job_seeking_status beside it.
    """
    original = section[key]
    cleaned, status = clean_name(original)
    if cleaned == original:
        return
    changes.append(f"{label}: '{original}' -> '{cleaned}'")
    if not dry_run:
        section[key] = cleaned
        if record_status:
            section['job_seeking_status'] = status


def clean_entity_file(filepath: Path, dry_run: bool = False) -> dict:
    """Clean a person entity JSON file.

    Strips job-seeking suffixes from the known name fields, records the
    status as metadata and, unless *dry_run*, rewrites the file with a
    ``_cleaning_metadata`` audit block appended.

    Returns ``{'file': <path>, 'changes': <list of descriptions>}``.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    changes: list = []

    # The same clean-compare-record-write sequence was previously duplicated
    # for all four fields; it now lives in _clean_name_field.
    if 'source_staff_info' in data and 'name' in data['source_staff_info']:
        _clean_name_field(data['source_staff_info'], 'name',
                          'source_staff_info.name', changes, dry_run)

    if 'profile_data' in data:
        profile = data['profile_data']
        if isinstance(profile, dict):  # guard against malformed files
            if 'name' in profile:
                _clean_name_field(profile, 'name', 'profile_data.name',
                                  changes, dry_run)
            # Some entity files use full_name instead of name.
            if 'full_name' in profile:
                _clean_name_field(profile, 'full_name', 'profile_data.full_name',
                                  changes, dry_run)
            # last_name may also carry the suffix; no status is recorded for it.
            if 'last_name' in profile:
                _clean_name_field(profile, 'last_name', 'profile_data.last_name',
                                  changes, dry_run, record_status=False)

    if changes and not dry_run:
        data['_cleaning_metadata'] = {
            'cleaned_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'changes_applied': changes,
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return {'file': str(filepath), 'changes': changes}

def clean_staff_file(filepath: Path, dry_run: bool = False) -> dict:
    """Clean a staff-list JSON file.

    Drops entries that duplicate the organization itself, strips
    job-seeking suffixes from names and headlines, and (unless *dry_run*)
    rewrites the file with updated counts and a ``_cleaning_metadata``
    audit block.

    Returns ``{'file': <path>, 'changes': <list of descriptions>}``.
    """
    with open(filepath, 'r', encoding='utf-8') as fh:
        data = json.load(fh)

    if 'staff' not in data:
        return {'file': str(filepath), 'changes': []}

    changes: list = []
    org_name = data.get('custodian_metadata', {}).get('custodian_name', '')
    before = len(data['staff'])
    kept = []

    for entry in data['staff']:
        entry_name = entry.get('name', '')

        # Skip entries that are really the organization, not a person.
        if is_organization_entry(entry_name, org_name):
            changes.append(f"REMOVED ORG ENTRY: '{entry_name}'")
            continue

        cleaned, status = clean_name(entry_name)
        if cleaned != entry_name:
            changes.append(f"CLEANED NAME: '{entry_name}' -> '{cleaned}' (status: {status})")
            if not dry_run:
                entry['name'] = cleaned
                entry['job_seeking_status'] = status

        # The headline may carry the same suffix; no status is recorded for it.
        if 'headline' in entry:
            old_headline = entry['headline']
            new_headline, _ = clean_name(old_headline)
            if new_headline != old_headline:
                changes.append(f"CLEANED HEADLINE: '{old_headline}' -> '{new_headline}'")
                if not dry_run:
                    entry['headline'] = new_headline

        kept.append(entry)

    if changes and not dry_run:
        data['staff'] = kept
        if 'staff_analysis' in data:
            data['staff_analysis']['total_staff_extracted'] = len(kept)
        data['_cleaning_metadata'] = {
            'cleaned_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'original_staff_count': before,
            'final_staff_count': len(kept),
            'changes_applied': changes,
        }
        with open(filepath, 'w', encoding='utf-8') as fh:
            json.dump(data, fh, indent=2, ensure_ascii=False)

    return {'file': str(filepath), 'changes': changes}

def clean_generic_json_file(filepath: Path, dry_run: bool = False) -> dict:
    """Clean any JSON file by recursively scrubbing name-like fields and keys.

    Walks the whole structure, stripping job-seeking suffixes from the
    values of name-like fields, from dictionary keys (e.g. name_to_slug
    maps) and from bare strings inside lists.  Unless *dry_run*, the
    cleaned structure is written back with a ``_cleaning_metadata`` audit
    block (only when the document root is an object).

    Returns ``{'file': <path>, 'changes': <list of descriptions>}``.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    changes: list = []
    updates: list = []      # deferred (obj, key, cleaned, status) value writes
    key_renames: list = []  # deferred (obj, old_key, new_key) key rewrites

    # Field names whose string values should be cleaned.
    name_keys = ('name', 'full_name', 'headline', 'last_name')

    def clean_recursive(obj, path=""):
        """Collect cleaning actions for *obj*; dict mutations are deferred."""
        # NOTE: the lists above are only mutated (append), never rebound,
        # so no nonlocal declaration is needed.
        if isinstance(obj, dict):
            # Iterate a snapshot of the keys; the dict itself is only
            # modified after the walk completes.
            for key in list(obj.keys()):
                value = obj[key]
                current_path = f"{path}.{key}" if path else key

                # Keys themselves may be dirty names (name_to_slug mappings).
                if isinstance(key, str):
                    cleaned_key, _ = clean_name(key)
                    if cleaned_key != key:
                        changes.append(f"RENAMED KEY at {path}: '{key}' -> '{cleaned_key}'")
                        if not dry_run:
                            key_renames.append((obj, key, cleaned_key))

                if key in name_keys and isinstance(value, str):
                    cleaned, status = clean_name(value)
                    if cleaned != value:
                        changes.append(f"{current_path}: '{value}' -> '{cleaned}'")
                        if not dry_run:
                            # Only the 'name' field carries a status flag.
                            updates.append((obj, key, cleaned,
                                            status if key == 'name' else None))
                elif isinstance(value, (dict, list)):
                    clean_recursive(value, current_path)
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = f"{path}[{i}]"
                if isinstance(item, str):
                    # A bare string in a list may itself be a dirty name.
                    cleaned, _ = clean_name(item)
                    if cleaned != item:
                        changes.append(f"{current_path}: '{item}' -> '{cleaned}'")
                        if not dry_run:
                            obj[i] = cleaned
                elif isinstance(item, (dict, list)):
                    clean_recursive(item, current_path)

    clean_recursive(data)

    # Apply key renames after the walk (avoids mutating dicts mid-iteration).
    for obj, old_key, new_key in key_renames:
        # Skip when the key vanished or the cleaned key already exists --
        # overwriting an existing key would silently drop its value.
        if old_key in obj and new_key not in obj:
            obj[new_key] = obj.pop(old_key)

    # Apply deferred value updates.
    for obj, key, cleaned, status in updates:
        obj[key] = cleaned
        if status:
            obj['job_seeking_status'] = status

    if changes and not dry_run:
        # A list-rooted document cannot carry the metadata block; previously
        # this assignment raised TypeError for such files.
        if isinstance(data, dict):
            data['_cleaning_metadata'] = {
                'cleaned_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
                'changes_applied': changes,
            }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return {'file': str(filepath), 'changes': changes}

def main():
    """CLI entry point: clean all person data files under the base path."""
    parser = argparse.ArgumentParser(description='Clean person data files')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without applying')
    # Previously hard-coded; kept as the default for backward compatibility.
    parser.add_argument('--base-path',
                        default='/Users/kempersc/apps/glam/data/custodian/person',
                        help='Root directory containing the person data files')
    args = parser.parse_args()

    base_path = Path(args.base_path)

    # Statistics accumulated across all processed files.
    total_files = 0
    files_with_changes = 0
    total_changes = 0
    org_entries_removed = 0
    names_cleaned = 0

    def report(filepath: Path, result: dict) -> None:
        """Print the per-file change list (shown for dry runs and real runs)."""
        print(f"\n{filepath.name}:")
        for change in result['changes']:
            print(f"  - {change}")

    # 1. Entity files.
    entity_dir = base_path / 'entity'
    if entity_dir.exists():
        for filepath in entity_dir.glob('*.json'):
            result = clean_entity_file(filepath, args.dry_run)
            total_files += 1
            if result['changes']:
                files_with_changes += 1
                total_changes += len(result['changes'])
                names_cleaned += sum(1 for c in result['changes'] if 'name' in c.lower())
                report(filepath, result)

    # 2. Staff files from both directories:
    #    affiliated/parsed/ -- legacy parsed staff files
    #    bu/                -- current staff files used by Qdrant sync
    staff_dirs = [
        base_path / 'affiliated' / 'parsed',
        base_path / 'bu',
    ]
    for staff_dir in staff_dirs:
        if staff_dir.exists():
            print(f"\nProcessing staff directory: {staff_dir}")
            for filepath in staff_dir.glob('*_staff_*.json'):
                result = clean_staff_file(filepath, args.dry_run)
                total_files += 1
                if result['changes']:
                    files_with_changes += 1
                    total_changes += len(result['changes'])
                    org_entries_removed += sum(
                        1 for c in result['changes'] if 'REMOVED ORG' in c)
                    names_cleaned += sum(
                        1 for c in result['changes']
                        if 'CLEANED NAME' in c or 'CLEANED HEADLINE' in c)
                    report(filepath, result)

    # 3. Remaining JSON files in bu/ (anything not matching *_staff_*.json).
    bu_dir = base_path / 'bu'
    if bu_dir.exists():
        print(f"\nProcessing other JSON files in bu/:")
        for filepath in bu_dir.glob('*.json'):
            if '_staff_' in filepath.name:
                continue  # staff files were already processed above
            result = clean_generic_json_file(filepath, args.dry_run)
            total_files += 1
            if result['changes']:
                files_with_changes += 1
                total_changes += len(result['changes'])
                names_cleaned += len(result['changes'])
                report(filepath, result)

    # Summary.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files scanned: {total_files}")
    print(f"Files with changes: {files_with_changes}")
    print(f"Total changes: {total_changes}")
    print(f"  - Names cleaned (job seeking removed): {names_cleaned}")
    print(f"  - Organization entries removed: {org_entries_removed}")

    if args.dry_run:
        print("\n[DRY RUN - No changes applied. Run without --dry-run to apply.]")
    else:
        print("\n[Changes applied successfully]")


if __name__ == '__main__':
    main()
|