139 lines
5 KiB
Python
139 lines
5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Create basic entity profiles for privacy-restricted LinkedIn profiles.

These profiles cannot be extracted via the Exa API (403 SOURCE_NOT_AVAILABLE),
but we have basic information from the LinkedIn "People" search results.

This script creates minimal entity profiles using the data we already have
from the parsed staff files.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
def get_existing_slugs(entity_dir: Path) -> set[str]:
    """Collect the slugs of all entity profiles already present in a directory.

    Entity files are named ``{slug}_{timestamp}.json``. The slug is taken to
    be everything before the last underscore of the file stem; a stem that
    contains no underscore is used as the slug unchanged.

    Args:
        entity_dir: Directory whose ``*.json`` files are inspected
            (non-recursively).

    Returns:
        The set of distinct slugs found.
    """
    return {
        entity_path.stem.rsplit("_", 1)[0]
        for entity_path in entity_dir.glob("*.json")
    }
|
|
|
|
|
|
def create_basic_entity(profile: dict, timestamp: str) -> dict:
    """Create a basic entity profile from parsed staff data.

    Only the fields available from the LinkedIn "People" search results are
    filled in; everything that would normally come from a full profile
    extraction (location, about, experience, ...) is left empty.

    Args:
        profile: Parsed staff record. The keys read are ``name``,
            ``headline``, ``linkedin_url``, ``heritage_type``, ``custodian``,
            ``custodian_slug``, ``staff_id`` and ``source_file``; all are
            optional.
        timestamp: Value stored as the extraction date and as the
            ``observed_on`` date of the affiliation.

    Returns:
        A JSON-serialisable dict with the sections ``extraction_metadata``,
        ``source_staff_info``, ``profile_data``, ``heritage_relevance`` and
        ``affiliations``. ``affiliations`` is empty when the profile has no
        custodian.
    """
    name = profile.get("name")
    headline = profile.get("headline")
    linkedin_url = profile.get("linkedin_url")
    heritage_type = profile.get("heritage_type")
    custodian = profile.get("custodian")
    custodian_slug = profile.get("custodian_slug")

    # Use truthiness rather than dict.get defaults for the human-readable
    # text: a key that is present but null/empty would otherwise be rendered
    # as "None" (or nothing) in the rationale.
    custodian_label = custodian or "unknown custodian"
    heritage_label = heritage_type or "unknown"

    affiliations = []
    if custodian:
        affiliations.append(
            {
                "custodian_name": custodian,
                "custodian_slug": custodian_slug,
                "role_title": headline,
                "heritage_relevant": True,
                "heritage_type": heritage_type,
                "current": True,
                "observed_on": timestamp,
                "source": "linkedin_people_search",
            }
        )

    return {
        "extraction_metadata": {
            "source_file": profile.get("source_file", "missing_entity_profiles.json"),
            "staff_id": profile.get("staff_id"),
            "extraction_date": timestamp,
            "extraction_method": "fallback_basic",
            "extraction_agent": "",
            "linkedin_url": linkedin_url,
            "cost_usd": 0,
            "notes": "Basic profile created from LinkedIn search results. Full extraction failed due to privacy restrictions (403 SOURCE_NOT_AVAILABLE).",
        },
        "source_staff_info": {
            "name": name,
            "headline": headline,
            "heritage_type": heritage_type,
            "custodian": custodian,
            "custodian_slug": custodian_slug,
        },
        "profile_data": {
            "name": name,
            "headline": headline,
            "linkedin_url": linkedin_url,
            "location": None,
            "about": None,
            "experience": [],
            "education": [],
            "skills": [],
            "languages": [],
        },
        "heritage_relevance": {
            "is_heritage_relevant": True,
            "heritage_types": [heritage_type] if heritage_type else [],
            "rationale": f"Identified as heritage-relevant from LinkedIn search results for {custodian_label}. Heritage type: {heritage_label}.",
        },
        "affiliations": affiliations,
    }
|
|
|
|
|
|
def main():
    """Write fallback entity files for heritage profiles that have none yet.

    Loads ``missing_entity_profiles.json`` from the hard-coded project
    directory, skips every profile whose slug already has an entity file
    (as reported by ``get_existing_slugs``), and writes one
    ``{slug}_{timestamp}.json`` file per remaining profile using
    ``create_basic_entity``. Progress and a final summary are printed to
    stdout. A failure on a single profile is reported and counted without
    aborting the run.
    """
    base_dir = Path("/Users/kempersc/apps/glam")
    entity_dir = base_dir / "data/custodian/person/entity"
    missing_file = base_dir / "data/custodian/person/affiliated/parsed/missing_entity_profiles.json"

    # Load missing profiles
    print(f"Loading missing profiles from {missing_file}...")
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, so non-ASCII names load the same way everywhere.
    with open(missing_file, encoding="utf-8") as f:
        data = json.load(f)

    all_profiles = data.get("missing_heritage_profiles", [])
    print(f"Total heritage profiles in list: {len(all_profiles)}")

    # Get existing slugs
    existing_slugs = get_existing_slugs(entity_dir)
    print(f"Existing entity profiles: {len(existing_slugs)}")

    # Filter to only those needing basic profiles. Profiles without a slug
    # are dropped here, so every entry in `remaining` has a usable slug.
    remaining = [p for p in all_profiles if p.get("slug") and p.get("slug") not in existing_slugs]
    print(f"Profiles needing basic entities: {len(remaining)}")

    if not remaining:
        print("No profiles need basic entities. Done!")
        return

    # Make sure the output directory exists so the writes below cannot all
    # fail merely because it is missing.
    entity_dir.mkdir(parents=True, exist_ok=True)

    # Create basic entities. The filename timestamp is fixed once per run;
    # the extraction date stored inside each file is taken per profile.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    created = 0
    errors = 0

    for profile in remaining:
        slug = profile.get("slug")

        try:
            entity = create_basic_entity(profile, datetime.now(timezone.utc).isoformat())

            # Save entity file
            output_file = entity_dir / f"{slug}_{timestamp}.json"
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(entity, f, indent=2)

            created += 1

            if created % 500 == 0:
                print(f"  Created {created}/{len(remaining)} basic profiles...")

        except Exception as e:
            # Deliberate best-effort boundary: one bad profile must not stop
            # the batch; the failure is reported and counted instead.
            print(f"  Error creating profile for {slug}: {e}")
            errors += 1

    print("\nDone!")
    print(f"  Created: {created} basic entity profiles")
    print(f"  Errors: {errors}")
    print(f"  Total entities now: {len(existing_slugs) + created}")
|
|
|
|
|
|
# Script entry point: run the batch only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|