#!/usr/bin/env python3
"""
Create basic entity profiles for privacy-restricted LinkedIn profiles.

These profiles cannot be extracted via Exa API (403 SOURCE_NOT_AVAILABLE),
but we have basic information from the LinkedIn "People" search results.

This script creates minimal entity profiles using the data we already have
from the parsed staff files.
"""

import json
import os
from datetime import datetime, timezone
from pathlib import Path

# Project root used when main() is called without an explicit base_dir.
DEFAULT_BASE_DIR = Path("/Users/kempersc/apps/glam")


def get_existing_slugs(entity_dir: Path) -> set[str]:
    """Return the slugs that already have an entity profile in *entity_dir*.

    Entity files are named ``{slug}_{timestamp}.json``; the slug is everything
    before the last underscore of the file stem.
    """
    return {path.stem.rsplit("_", 1)[0] for path in entity_dir.glob("*.json")}


def create_basic_entity(profile: dict, timestamp: str) -> dict:
    """Build a minimal entity profile from one parsed staff record.

    Args:
        profile: Parsed staff record. The keys read here are ``source_file``,
            ``staff_id``, ``linkedin_url``, ``name``, ``headline``,
            ``heritage_type``, ``custodian`` and ``custodian_slug``; all are
            optional.
        timestamp: Value stored as ``extraction_date`` and as ``observed_on``
            of the affiliation.

    Returns:
        A JSON-serialisable dict with the sections ``extraction_metadata``,
        ``source_staff_info``, ``profile_data``, ``heritage_relevance`` and
        ``affiliations``. ``affiliations`` is empty when the record has no
        custodian.
    """
    name = profile.get("name")
    headline = profile.get("headline")
    linkedin_url = profile.get("linkedin_url")
    heritage_type = profile.get("heritage_type")
    custodian = profile.get("custodian")
    custodian_slug = profile.get("custodian_slug")

    # Use `or` rather than a .get() default: a key that is present but None
    # (or empty) must also fall back, otherwise the text reads "None".
    rationale = (
        "Identified as heritage-relevant from LinkedIn search results for "
        f"{custodian or 'unknown custodian'}. "
        f"Heritage type: {heritage_type or 'unknown'}."
    )

    affiliations = []
    if custodian:
        affiliations.append(
            {
                "custodian_name": custodian,
                "custodian_slug": custodian_slug,
                "role_title": headline,
                "heritage_relevant": True,
                "heritage_type": heritage_type,
                "current": True,
                "observed_on": timestamp,
                "source": "linkedin_people_search",
            }
        )

    return {
        "extraction_metadata": {
            "source_file": profile.get("source_file", "missing_entity_profiles.json"),
            "staff_id": profile.get("staff_id"),
            "extraction_date": timestamp,
            "extraction_method": "fallback_basic",
            "extraction_agent": "",
            "linkedin_url": linkedin_url,
            "cost_usd": 0,
            "notes": "Basic profile created from LinkedIn search results. Full extraction failed due to privacy restrictions (403 SOURCE_NOT_AVAILABLE).",
        },
        "source_staff_info": {
            "name": name,
            "headline": headline,
            "heritage_type": heritage_type,
            "custodian": custodian,
            "custodian_slug": custodian_slug,
        },
        "profile_data": {
            "name": name,
            "headline": headline,
            "linkedin_url": linkedin_url,
            "location": None,
            "about": None,
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
        },
        "heritage_relevance": {
            "is_heritage_relevant": True,
            "heritage_types": [heritage_type] if heritage_type else [],
            "rationale": rationale,
        },
        "affiliations": affiliations,
    }


def main(base_dir: Path = DEFAULT_BASE_DIR) -> None:
    """Write a basic entity file for every missing profile without one.

    Reads ``missing_entity_profiles.json`` under *base_dir*, skips records
    that have no slug or whose slug already has an entity file, and writes
    ``{slug}_{timestamp}.json`` into the entity directory for the rest.
    Progress and a final summary are printed to stdout.

    Args:
        base_dir: Project root containing the ``data/custodian/person`` tree.
            Defaults to the original hard-coded location.
    """
    base_dir = Path(base_dir)
    entity_dir = base_dir / "data/custodian/person/entity"
    missing_file = (
        base_dir / "data/custodian/person/affiliated/parsed/missing_entity_profiles.json"
    )

    # Load missing profiles. The encoding is explicit so that names with
    # non-ASCII characters decode the same way on every platform.
    print(f"Loading missing profiles from {missing_file}...")
    with open(missing_file, encoding="utf-8") as f:
        data = json.load(f)

    all_profiles = data.get("missing_heritage_profiles", [])
    print(f"Total heritage profiles in list: {len(all_profiles)}")

    # Get existing slugs
    existing_slugs = get_existing_slugs(entity_dir)
    print(f"Existing entity profiles: {len(existing_slugs)}")

    # Filter to only those needing basic profiles
    remaining = [
        p for p in all_profiles
        if p.get("slug") and p.get("slug") not in existing_slugs
    ]
    print(f"Profiles needing basic entities: {len(remaining)}")

    if not remaining:
        print("No profiles need basic entities. Done!")
        return

    # One instant for the whole run, so the timestamp in the filename and the
    # extraction_date stored inside the file always agree.
    now = datetime.now(timezone.utc)
    file_stamp = now.strftime("%Y%m%dT%H%M%SZ")
    extraction_date = now.isoformat()

    created = 0
    errors = 0

    for profile in remaining:
        slug = profile["slug"]  # guaranteed non-empty by the filter above
        output_file = entity_dir / f"{slug}_{file_stamp}.json"
        try:
            entity = create_basic_entity(profile, extraction_date)
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(entity, f, indent=2)
        except (OSError, TypeError, ValueError) as e:
            # Best effort: report the failure and carry on with the rest.
            print(f"  Error creating profile for {slug}: {e}")
            errors += 1
            continue

        created += 1
        if created % 500 == 0:
            print(f"  Created {created}/{len(remaining)} basic profiles...")

    print("\nDone!")
    print(f"  Created: {created} basic entity profiles")
    print(f"  Errors: {errors}")
    print(f"  Total entities now: {len(existing_slugs) + created}")


if __name__ == "__main__":
    main()