glam/scripts/create_basic_entity_profiles.py
2025-12-14 17:09:55 +01:00

139 lines
5 KiB
Python

#!/usr/bin/env python3
"""
Create basic entity profiles for privacy-restricted LinkedIn profiles.
These profiles cannot be extracted via Exa API (403 SOURCE_NOT_AVAILABLE),
but we have basic information from the LinkedIn "People" search results.
This script creates minimal entity profiles using the data we already have
from the parsed staff files.
"""
import json
import os
from datetime import datetime, timezone
from pathlib import Path
def get_existing_slugs(entity_dir: Path) -> set[str]:
    """Collect the slugs that already have an entity profile on disk.

    Every ``*.json`` entry directly inside *entity_dir* is considered. File
    names are expected to look like ``{slug}_{timestamp}.json``; the slug is
    the part of the file stem before its last underscore. A stem without any
    underscore is used as the slug unchanged.

    Args:
        entity_dir: Directory holding the entity profile JSON files.

    Returns:
        The set of distinct slugs found.
    """
    return {
        # Split from the right so underscores inside the slug itself survive.
        json_path.stem.rsplit("_", 1)[0]
        for json_path in entity_dir.glob("*.json")
    }
def create_basic_entity(profile: dict, timestamp: str) -> dict:
    """Create a basic entity profile from parsed staff data.

    Only the keys read below are used from *profile*; every one of them is
    optional and is looked up with ``dict.get``.

    Args:
        profile: Parsed staff record. Keys read: ``source_file``,
            ``staff_id``, ``linkedin_url``, ``name``, ``headline``,
            ``heritage_type``, ``custodian`` and ``custodian_slug``.
        timestamp: Value stored as ``extraction_date`` and as the
            affiliation's ``observed_on``.

    Returns:
        A dict with the sections ``extraction_metadata``,
        ``source_staff_info``, ``profile_data``, ``heritage_relevance`` and
        ``affiliations``. ``affiliations`` is empty when the profile has no
        (truthy) custodian, and ``heritage_types`` is empty when it has no
        (truthy) heritage type.
    """
    name = profile.get("name")
    headline = profile.get("headline")
    linkedin_url = profile.get("linkedin_url")
    heritage_type = profile.get("heritage_type")
    custodian = profile.get("custodian")
    custodian_slug = profile.get("custodian_slug")

    # Use ``or`` rather than a ``.get`` default: a key that is present but
    # None/empty must also fall back, otherwise the rationale would read
    # "... for None. Heritage type: None."
    custodian_label = custodian or "unknown custodian"
    heritage_label = heritage_type or "unknown"

    affiliations = []
    if custodian:
        affiliations.append(
            {
                "custodian_name": custodian,
                "custodian_slug": custodian_slug,
                "role_title": headline,
                "heritage_relevant": True,
                "heritage_type": heritage_type,
                "current": True,
                "observed_on": timestamp,
                "source": "linkedin_people_search",
            }
        )

    return {
        "extraction_metadata": {
            "source_file": profile.get("source_file", "missing_entity_profiles.json"),
            "staff_id": profile.get("staff_id"),
            "extraction_date": timestamp,
            "extraction_method": "fallback_basic",
            "extraction_agent": "",
            "linkedin_url": linkedin_url,
            "cost_usd": 0,
            "notes": "Basic profile created from LinkedIn search results. Full extraction failed due to privacy restrictions (403 SOURCE_NOT_AVAILABLE).",
        },
        "source_staff_info": {
            "name": name,
            "headline": headline,
            "heritage_type": heritage_type,
            "custodian": custodian,
            "custodian_slug": custodian_slug,
        },
        "profile_data": {
            "name": name,
            "headline": headline,
            "linkedin_url": linkedin_url,
            # Not available from the search results; left empty on purpose.
            "location": None,
            "about": None,
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
        },
        "heritage_relevance": {
            "is_heritage_relevant": True,
            "heritage_types": [heritage_type] if heritage_type else [],
            "rationale": f"Identified as heritage-relevant from LinkedIn search results for {custodian_label}. Heritage type: {heritage_label}.",
        },
        "affiliations": affiliations,
    }
def main(base_dir: Path = Path("/Users/kempersc/apps/glam")) -> None:
    """Write a basic entity profile for every listed slug that lacks one.

    Reads ``missing_heritage_profiles`` from
    ``data/custodian/person/affiliated/parsed/missing_entity_profiles.json``
    under *base_dir*, skips entries without a slug or whose slug already has
    a file in ``data/custodian/person/entity``, and writes one
    ``{slug}_{timestamp}.json`` file per remaining slug. Progress and a final
    summary are printed to stdout.

    Args:
        base_dir: Root of the project checkout. Defaults to the path the
            script was originally hard-coded to.

    Raises:
        OSError: If the input file cannot be read or the entity directory
            cannot be created.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    base_dir = Path(base_dir)
    entity_dir = base_dir / "data/custodian/person/entity"
    missing_file = base_dir / "data/custodian/person/affiliated/parsed/missing_entity_profiles.json"

    # Load missing profiles
    print(f"Loading missing profiles from {missing_file}...")
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(missing_file, encoding="utf-8") as f:
        data = json.load(f)
    all_profiles = data.get("missing_heritage_profiles", [])
    print(f"Total heritage profiles in list: {len(all_profiles)}")

    # Get existing slugs
    existing_slugs = get_existing_slugs(entity_dir)
    print(f"Existing entity profiles: {len(existing_slugs)}")

    # Filter to only those needing basic profiles. Keying by slug collapses
    # duplicate entries so each output file is written and counted once; the
    # last entry for a slug wins, which is the record that previously ended
    # up on disk after the earlier ones were overwritten.
    remaining = {
        p["slug"]: p
        for p in all_profiles
        if p.get("slug") and p["slug"] not in existing_slugs
    }
    print(f"Profiles needing basic entities: {len(remaining)}")
    if not remaining:
        print("No profiles need basic entities. Done!")
        return

    # One clock reading per run keeps the filename timestamp and the
    # extraction_date stored inside each file consistent with each other.
    run_time = datetime.now(timezone.utc)
    timestamp = run_time.strftime("%Y%m%dT%H%M%SZ")
    extraction_date = run_time.isoformat()

    # Without the directory every single write below would fail.
    entity_dir.mkdir(parents=True, exist_ok=True)

    # Create basic entities
    created = 0
    errors = 0
    for slug, profile in remaining.items():
        try:
            entity = create_basic_entity(profile, extraction_date)
            # Serialize before touching the filesystem: a failure here must
            # not leave an empty/truncated file that a later run would
            # mistake for an existing profile.
            payload = json.dumps(entity, indent=2)
            output_file = entity_dir / f"{slug}_{timestamp}.json"
            output_file.write_text(payload, encoding="utf-8")
        except Exception as e:
            # Deliberate best-effort: report the failing profile, keep going.
            print(f"  Error creating profile for {slug}: {e}")
            errors += 1
            continue
        created += 1
        if created % 500 == 0:
            print(f"  Created {created}/{len(remaining)} basic profiles...")

    print("\nDone!")
    print(f"  Created: {created} basic entity profiles")
    print(f"  Errors: {errors}")
    print(f"  Total entities now: {len(existing_slugs) + created}")
# Run the profile creation only when executed as a script, not on import.
if __name__ == "__main__":
    main()