#!/usr/bin/env python3
"""
Add person_observations sections to matched custodian YAML files.

This script reads the matching results from custodian_yaml_matches_final.json
and adds person_observations sections to custodian YAML files that don't have them.

Per Rule 27 (Person-Custodian Data Architecture):
- Custodian YAML files store only references and affiliation provenance
- Person entity files (data/custodian/person/entity/) store full profile data
"""

import json
import yaml
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

def load_staff_json(slug: str, parsed_dir: Path) -> dict | None:
|
|
"""Load staff JSON file for a given custodian slug."""
|
|
# Try different file patterns
|
|
patterns = [
|
|
f"{slug}_staff_*.json",
|
|
f"{slug.replace('-', '_')}_staff_*.json",
|
|
]
|
|
|
|
for pattern in patterns:
|
|
matches = list(parsed_dir.glob(pattern))
|
|
if matches:
|
|
# Get the most recent file
|
|
latest = max(matches, key=lambda p: p.stat().st_mtime)
|
|
with open(latest, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
|
|
return None
|
|
|
|
|
|
def build_person_observations(staff_data: dict, custodian_slug: str) -> dict:
    """Build a person_observations structure from parsed staff JSON data.

    Per Rule 27, the custodian YAML carries only references and affiliation
    provenance; full profile data lives in person entity files.

    Args:
        staff_data: Parsed staff JSON with optional keys
            'custodian_metadata', 'staff', and 'staff_analysis'.
        custodian_slug: LinkedIn company slug, used for the source URL and
            as the fallback prefix for generated person IDs.

    Returns:
        Dict with 'observation_metadata' (counts + provenance) and 'staff'
        (one abbreviated observation per staff entry).
    """
    metadata = staff_data.get('custodian_metadata', {})
    staff_list = staff_data.get('staff', [])
    analysis = staff_data.get('staff_analysis', {})

    # Compute shared values once so every record in a single run carries an
    # identical timestamp and URL (previously recomputed per record).
    source_url = f"https://www.linkedin.com/company/{custodian_slug}/people/"
    retrieved_at = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    staff_with_linkedin = sum(1 for s in staff_list if s.get('linkedin_url'))

    observation_metadata = {
        'retrieval_agent': 'linkedin_html_parser',
        'retrieval_timestamp': retrieved_at,
        'source_url': source_url,
        'html_file': None,
        # associated_members may exceed what was extracted from the HTML.
        'staff_count_total': metadata.get('associated_members', len(staff_list)),
        'staff_count_extracted': len(staff_list),
        'staff_count_with_linkedin': staff_with_linkedin,
        'staff_count_heritage_relevant': analysis.get('heritage_relevant_count', 0),
    }

    # Build the abbreviated staff list (full data lives in entity files).
    staff_observations = []
    for i, staff in enumerate(staff_list):
        person_id = staff.get('staff_id', f"{custodian_slug}_staff_{i:04d}")

        # Extract the LinkedIn profile slug from the URL, if present.
        linkedin_url = staff.get('linkedin_url', '')
        linkedin_slug = ''
        if linkedin_url and '/in/' in linkedin_url:
            linkedin_slug = linkedin_url.split('/in/')[-1].rstrip('/')

        observation = {
            'person_id': person_id,
            'person_name': staff.get('name', 'Unknown'),
            'role_title': staff.get('headline', ''),
            'heritage_relevant': staff.get('heritage_relevant', False),
            'heritage_type': staff.get('heritage_type'),
            'current': True,
            'affiliation_provenance': {
                'source_url': source_url,
                'retrieved_on': retrieved_at,
                'retrieval_agent': 'linkedin_html_parser',
            },
        }

        if linkedin_url:
            observation['linkedin_profile_url'] = linkedin_url

        if linkedin_slug:
            # Placeholder - entity files may not exist yet.
            observation['linkedin_profile_path'] = None

        staff_observations.append(observation)

    return {
        'observation_metadata': observation_metadata,
        'staff': staff_observations,
    }
|
def add_person_observations_to_yaml(yaml_path: Path, person_observations: dict) -> bool:
    """Add a person_observations section to a custodian YAML file.

    The file is rewritten in place. If the parsed document already contains
    a ``person_observations`` key, the file is left untouched.

    Args:
        yaml_path: Path to the custodian YAML file (must exist).
        person_observations: Structure from ``build_person_observations``.

    Returns:
        True if the section was added, False if it was already present.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty or scalar-only document parses to None/str; normalize so we
    # can always attach a mapping key.
    if not isinstance(data, dict):
        data = {}

    # Check the parsed mapping rather than a raw substring, so a YAML
    # comment mentioning "person_observations:" cannot cause a false skip.
    if 'person_observations' in data:
        print(" ⚠️ Already has person_observations, skipping")
        return False

    data['person_observations'] = person_observations

    # sort_keys=False preserves the original key ordering from the file.
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return True
|
def main():
    """Add person_observations to every matched custodian YAML file.

    Reads matching results from custodian_yaml_matches_final.json, then for
    each matched custodian loads its staff JSON, builds the observation
    structure, and writes it into the custodian YAML. Prints a per-file
    status line and a final summary of processed/skipped/error counts.

    The project base directory defaults to the original hard-coded path but
    can be overridden via argv[1] or the GLAM_BASE_DIR environment variable.
    """
    if len(sys.argv) > 1:
        base_dir = Path(sys.argv[1])
    else:
        base_dir = Path(os.environ.get('GLAM_BASE_DIR', '/Users/kempersc/apps/glam'))
    parsed_dir = base_dir / 'data/custodian/person/affiliated/parsed'
    matches_file = parsed_dir / 'custodian_yaml_matches_final.json'

    # Load matching results produced by the upstream matching script.
    with open(matches_file, 'r', encoding='utf-8') as f:
        matches = json.load(f)

    matched = matches['matched']

    print(f"Found {len(matched)} matched custodians")
    print("=" * 60)

    processed = 0
    skipped = 0
    errors = 0

    for m in matched:
        custodian = m['custodian']
        yaml_file = Path(m['yaml_file'])
        slug = custodian['slug']
        name = custodian['name']
        staff_count = custodian['staff_count']
        heritage_count = custodian['heritage_count']

        print(f"\n{name} ({staff_count} staff, {heritage_count} heritage)")
        print(f" YAML: {yaml_file.name}")

        if not yaml_file.exists():
            print(" ❌ YAML file not found")
            errors += 1
            continue

        # Cheap pre-check so already-done files are counted as skipped
        # without parsing YAML or loading the staff JSON.
        with open(yaml_file, 'r', encoding='utf-8') as f:
            if 'person_observations:' in f.read():
                print(" ⚠️ Already has person_observations")
                skipped += 1
                continue

        staff_data = load_staff_json(slug, parsed_dir)
        if not staff_data:
            print(f" ❌ Staff JSON not found for slug: {slug}")
            errors += 1
            continue

        print(f" Staff JSON: {slug}_staff_*.json ({len(staff_data.get('staff', []))} entries)")

        person_observations = build_person_observations(staff_data, slug)

        # add_person_observations_to_yaml re-checks and returns False if
        # the section appeared between our pre-check and the write.
        if add_person_observations_to_yaml(yaml_file, person_observations):
            print(" ✅ Added person_observations")
            processed += 1
        else:
            skipped += 1

    print("\n" + "=" * 60)
    print("Summary:")
    print(f" Processed: {processed}")
    print(f" Skipped (already done): {skipped}")
    print(f" Errors: {errors}")
|
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()