glam/scripts/add_person_observations_to_custodians.py
2025-12-15 22:31:41 +01:00

198 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
Add person_observations sections to matched custodian YAML files.
This script reads the matching results from custodian_yaml_matches_final.json
and adds person_observations sections to custodian YAML files that don't have them.
Per Rule 27 (Person-Custodian Data Architecture):
- Custodian YAML files store only references and affiliation provenance
- Person entity files (data/custodian/person/entity/) store full profile data
"""
import json
import yaml
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
def load_staff_json(slug: str, parsed_dir: Path) -> dict | None:
"""Load staff JSON file for a given custodian slug."""
# Try different file patterns
patterns = [
f"{slug}_staff_*.json",
f"{slug.replace('-', '_')}_staff_*.json",
]
for pattern in patterns:
matches = list(parsed_dir.glob(pattern))
if matches:
# Get the most recent file
latest = max(matches, key=lambda p: p.stat().st_mtime)
with open(latest, 'r', encoding='utf-8') as f:
return json.load(f)
return None
def build_person_observations(staff_data: dict, custodian_slug: str) -> dict:
    """Build a person_observations structure from parsed staff JSON data.

    Per Rule 27 the custodian YAML keeps only abbreviated per-person
    records plus affiliation provenance; full profile data lives in the
    person entity files.

    Args:
        staff_data: Parsed staff JSON; 'custodian_metadata', 'staff' and
            'staff_analysis' keys are all optional.
        custodian_slug: LinkedIn company slug, used to build source URLs
            and fallback person IDs.

    Returns:
        Dict with 'observation_metadata' and 'staff' keys.
    """
    metadata = staff_data.get('custodian_metadata', {})
    staff_list = staff_data.get('staff', [])
    analysis = staff_data.get('staff_analysis', {})

    source_url = f"https://www.linkedin.com/company/{custodian_slug}/people/"
    # Take a single timestamp for the whole observation set so per-person
    # provenance can never disagree with the metadata timestamp (the
    # original called datetime.now() per entry and could straddle a second).
    retrieved_at = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    # Count staff entries that carry a LinkedIn profile URL.
    staff_with_linkedin = sum(1 for s in staff_list if s.get('linkedin_url'))

    observation_metadata = {
        'retrieval_agent': 'linkedin_html_parser',
        'retrieval_timestamp': retrieved_at,
        'source_url': source_url,
        'html_file': None,
        # Prefer the count reported by LinkedIn; fall back to what we parsed.
        'staff_count_total': metadata.get('associated_members', len(staff_list)),
        'staff_count_extracted': len(staff_list),
        'staff_count_with_linkedin': staff_with_linkedin,
        'staff_count_heritage_relevant': analysis.get('heritage_relevant_count', 0),
    }

    # Abbreviated staff list for the custodian YAML — full data in entity files.
    staff_observations = []
    for i, staff in enumerate(staff_list):
        person_id = staff.get('staff_id', f"{custodian_slug}_staff_{i:04d}")
        # Extract the profile slug from a .../in/<slug>/ LinkedIn URL.
        linkedin_url = staff.get('linkedin_url', '')
        linkedin_slug = ''
        if linkedin_url and '/in/' in linkedin_url:
            linkedin_slug = linkedin_url.split('/in/')[-1].rstrip('/')
        observation = {
            'person_id': person_id,
            'person_name': staff.get('name', 'Unknown'),
            'role_title': staff.get('headline', ''),
            'heritage_relevant': staff.get('heritage_relevant', False),
            'heritage_type': staff.get('heritage_type'),
            'current': True,
            'affiliation_provenance': {
                'source_url': source_url,
                'retrieved_on': retrieved_at,
                'retrieval_agent': 'linkedin_html_parser',
            },
        }
        if linkedin_url:
            observation['linkedin_profile_url'] = linkedin_url
        if linkedin_slug:
            # Entity files may not exist yet; the path is resolved in a
            # later pass, so only the placeholder key is written here.
            observation['linkedin_profile_path'] = None
        staff_observations.append(observation)

    return {
        'observation_metadata': observation_metadata,
        'staff': staff_observations,
    }
def add_person_observations_to_yaml(yaml_path: Path, person_observations: dict) -> bool:
    """Append a person_observations section to a custodian YAML file.

    Args:
        yaml_path: Path to the custodian YAML file (modified in place).
        person_observations: Structure built by build_person_observations.

    Returns:
        True when the section was written, False when the file already
        contained a person_observations key and was left untouched.
    """
    raw = yaml_path.read_text(encoding='utf-8')
    # Cheap text scan avoids re-serializing files that are already done.
    if 'person_observations:' in raw:
        print(" ⚠️ Already has person_observations, skipping")
        return False
    document = yaml.safe_load(raw)
    document['person_observations'] = person_observations
    with open(yaml_path, 'w', encoding='utf-8') as out:
        yaml.dump(document, out, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return True
def main():
    """Add person_observations sections to all matched custodian YAMLs.

    Reads match results from custodian_yaml_matches_final.json, then for
    each matched custodian loads its staff JSON, builds the observation
    structure, and writes it into the custodian YAML. Prints a per-file
    status line and a final processed/skipped/errors summary.
    """
    # Allow the repo root to be overridden so the script isn't tied to one
    # developer's machine; the default preserves the original behavior.
    base_dir = Path(os.environ.get('GLAM_BASE_DIR', '/Users/kempersc/apps/glam'))
    parsed_dir = base_dir / 'data/custodian/person/affiliated/parsed'
    matches_file = parsed_dir / 'custodian_yaml_matches_final.json'

    # Load matching results
    with open(matches_file, 'r', encoding='utf-8') as f:
        matches = json.load(f)
    matched = matches['matched']
    print(f"Found {len(matched)} matched custodians")
    print("=" * 60)

    processed = 0
    skipped = 0
    errors = 0
    for m in matched:
        custodian = m['custodian']
        yaml_file = Path(m['yaml_file'])
        slug = custodian['slug']
        name = custodian['name']
        staff_count = custodian['staff_count']
        heritage_count = custodian['heritage_count']
        print(f"\n{name} ({staff_count} staff, {heritage_count} heritage)")
        print(f" YAML: {yaml_file.name}")

        if not yaml_file.exists():
            print(f" ❌ YAML file not found")
            errors += 1
            continue

        # Pre-check here to count skips without touching the YAML writer
        # (add_person_observations_to_yaml repeats the check defensively).
        with open(yaml_file, 'r', encoding='utf-8') as f:
            if 'person_observations:' in f.read():
                print(f" ⚠️ Already has person_observations")
                skipped += 1
                continue

        staff_data = load_staff_json(slug, parsed_dir)
        if not staff_data:
            print(f" ❌ Staff JSON not found for slug: {slug}")
            errors += 1
            continue
        print(f" Staff JSON: {slug}_staff_*.json ({len(staff_data.get('staff', []))} entries)")

        person_observations = build_person_observations(staff_data, slug)
        if add_person_observations_to_yaml(yaml_file, person_observations):
            print(f" ✅ Added person_observations")
            processed += 1
        else:
            skipped += 1

    print("\n" + "=" * 60)
    print(f"Summary:")
    print(f" Processed: {processed}")
    print(f" Skipped (already done): {skipped}")
    print(f" Errors: {errors}")


if __name__ == '__main__':
    main()