#!/usr/bin/env python3
"""
Add person_observations sections to matched custodian YAML files.

This script reads the matching results from custodian_yaml_matches_final.json
and adds person_observations sections to custodian YAML files that don't
have them.

Per Rule 27 (Person-Custodian Data Architecture):
- Custodian YAML files store only references and affiliation provenance
- Person entity files (data/custodian/person/entity/) store full profile data
"""

import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml


def _utc_now_iso() -> str:
    """Return the current UTC time formatted as an ISO-8601 'Z' timestamp."""
    return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')


def load_staff_json(slug: str, parsed_dir: Path) -> dict | None:
    """Load the staff JSON file for a given custodian slug.

    Tries both the hyphenated and underscored form of the slug; when several
    files match a pattern, the most recently modified one wins.

    Args:
        slug: Custodian slug (typically hyphenated, e.g. 'some-museum').
        parsed_dir: Directory containing ``*_staff_*.json`` files.

    Returns:
        The parsed JSON dict, or None if no matching file exists.
    """
    patterns = [
        f"{slug}_staff_*.json",
        f"{slug.replace('-', '_')}_staff_*.json",
    ]
    for pattern in patterns:
        matches = list(parsed_dir.glob(pattern))
        if matches:
            # Multiple runs may leave several files; take the newest by mtime.
            latest = max(matches, key=lambda p: p.stat().st_mtime)
            with open(latest, 'r', encoding='utf-8') as f:
                return json.load(f)
    return None


def build_person_observations(staff_data: dict, custodian_slug: str) -> dict:
    """Build a person_observations structure from staff JSON data.

    The custodian YAML gets an abbreviated per-person record only; full
    profile data lives in the person entity files (per Rule 27).

    Args:
        staff_data: Parsed staff JSON ('custodian_metadata', 'staff',
            'staff_analysis' keys).
        custodian_slug: Slug used to reconstruct the LinkedIn company URL.

    Returns:
        Dict with 'observation_metadata' and 'staff' keys.
    """
    metadata = staff_data.get('custodian_metadata', {})
    staff_list = staff_data.get('staff', [])
    analysis = staff_data.get('staff_analysis', {})

    # One timestamp per build so observation_metadata and every per-person
    # provenance record agree (previously datetime.now() was re-evaluated
    # per entry and could straddle a second boundary).
    retrieved_at = _utc_now_iso()
    source_url = f"https://www.linkedin.com/company/{custodian_slug}/people/"

    staff_with_linkedin = sum(1 for s in staff_list if s.get('linkedin_url'))

    observation_metadata = {
        'retrieval_agent': 'linkedin_html_parser',
        'retrieval_timestamp': retrieved_at,
        'source_url': source_url,
        'html_file': None,
        'staff_count_total': metadata.get('associated_members', len(staff_list)),
        'staff_count_extracted': len(staff_list),
        'staff_count_with_linkedin': staff_with_linkedin,
        'staff_count_heritage_relevant': analysis.get('heritage_relevant_count', 0),
    }

    # Build staff list (abbreviated for custodian YAML - full data in entity files)
    staff_observations = []
    for i, staff in enumerate(staff_list):
        person_id = staff.get('staff_id', f"{custodian_slug}_staff_{i:04d}")

        # Extract LinkedIn slug from URL (text after '/in/', trailing '/'
        # stripped). NOTE(review): query strings, if present, are kept —
        # confirm upstream URLs are clean.
        linkedin_url = staff.get('linkedin_url', '')
        linkedin_slug = ''
        if linkedin_url and '/in/' in linkedin_url:
            linkedin_slug = linkedin_url.split('/in/')[-1].rstrip('/')

        observation = {
            'person_id': person_id,
            'person_name': staff.get('name', 'Unknown'),
            'role_title': staff.get('headline', ''),
            'heritage_relevant': staff.get('heritage_relevant', False),
            'heritage_type': staff.get('heritage_type'),
            'current': True,
            'affiliation_provenance': {
                'source_url': source_url,
                'retrieved_on': retrieved_at,
                'retrieval_agent': 'linkedin_html_parser',
            },
        }

        if linkedin_url:
            observation['linkedin_profile_url'] = linkedin_url

        if linkedin_slug:
            # Placeholder - entity files may not exist yet.
            observation['linkedin_profile_path'] = None

        staff_observations.append(observation)

    return {
        'observation_metadata': observation_metadata,
        'staff': staff_observations,
    }


def add_person_observations_to_yaml(yaml_path: Path, person_observations: dict) -> bool:
    """Add a person_observations section to a custodian YAML file.

    Args:
        yaml_path: Path to the custodian YAML file (must exist).
        person_observations: Structure from build_person_observations().

    Returns:
        True if the section was added; False if one was already present.

    NOTE(review): yaml.safe_load + yaml.dump rewrites the whole file, so any
    comments or hand formatting in the original YAML are lost — confirm that
    is acceptable for these files.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Cheap substring guard: never add the section twice.
    if 'person_observations:' in content:
        print("  ⚠️  Already has person_observations, skipping")
        return False

    data = yaml.safe_load(content)
    data['person_observations'] = person_observations

    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False)

    return True


def main():
    """Process all matched custodians, adding person_observations sections."""
    base_dir = Path('/Users/kempersc/apps/glam')
    parsed_dir = base_dir / 'data/custodian/person/affiliated/parsed'
    matches_file = parsed_dir / 'custodian_yaml_matches_final.json'

    with open(matches_file, 'r', encoding='utf-8') as f:
        matches = json.load(f)

    matched = matches['matched']
    print(f"Found {len(matched)} matched custodians")
    print("=" * 60)

    processed = 0
    skipped = 0
    errors = 0

    for m in matched:
        custodian = m['custodian']
        yaml_file = Path(m['yaml_file'])
        slug = custodian['slug']
        name = custodian['name']
        staff_count = custodian['staff_count']
        heritage_count = custodian['heritage_count']

        print(f"\n{name} ({staff_count} staff, {heritage_count} heritage)")
        print(f"  YAML: {yaml_file.name}")

        if not yaml_file.exists():
            print("  ❌ YAML file not found")
            errors += 1
            continue

        staff_data = load_staff_json(slug, parsed_dir)
        if not staff_data:
            print(f"  ❌ Staff JSON not found for slug: {slug}")
            errors += 1
            continue

        print(f"  Staff JSON: {slug}_staff_*.json "
              f"({len(staff_data.get('staff', []))} entries)")

        person_observations = build_person_observations(staff_data, slug)

        # add_person_observations_to_yaml re-checks for an existing section
        # itself, so no separate pre-read of the file is needed here.
        if add_person_observations_to_yaml(yaml_file, person_observations):
            print("  ✅ Added person_observations")
            processed += 1
        else:
            skipped += 1

    print("\n" + "=" * 60)
    print("Summary:")
    print(f"  Processed: {processed}")
    print(f"  Skipped (already done): {skipped}")
    print(f"  Errors: {errors}")


if __name__ == '__main__':
    main()