#!/usr/bin/env python3 """ Review Linkup Enrichment Results Interactive script to review profiles enriched via Linkup API and assign proper heritage scores and experience data. Usage: python scripts/review_timeline_enrichments.py [--limit N] """ import json import sys import argparse from datetime import datetime, timezone from pathlib import Path ENTITY_DIR = Path("data/custodian/person/entity") LINKUP_DIR = Path("data/custodian/web/linkedin") def find_enriched_profiles(limit: int = None) -> list[Path]: """Find profiles that have Linkup enrichment but need manual review.""" candidates = [] for json_file in ENTITY_DIR.glob("*_20251214T*.json"): try: with open(json_file) as f: data = json.load(f) # Check if has Linkup enrichment that needs review linkup = data.get("timeline_enrichment", {}) if not linkup: continue # Skip if already manually reviewed (confidence > 0.5) if linkup.get("confidence_score", 0) > 0.6: continue # Skip if already marked non-heritage with rationale heritage = data.get("heritage_relevance", {}) if not heritage.get("is_heritage_relevant", True) and "RECLASSIFIED" in heritage.get("rationale", ""): continue candidates.append(json_file) if limit and len(candidates) >= limit: break except (json.JSONDecodeError, KeyError) as e: continue return candidates def load_linkup_results(profile_data: dict) -> dict | None: """Load the Linkup search results for a profile.""" linkup = profile_data.get("timeline_enrichment", {}) results_file = linkup.get("results_file") if results_file and Path(results_file).exists(): with open(results_file) as f: return json.load(f) return None def display_profile_summary(data: dict, results: dict | None): """Display profile and search results for review.""" source = data.get("source_staff_info", {}) profile = data.get("profile_data", {}) print("\n" + "=" * 70) print(f"NAME: {source.get('name', 'Unknown')}") print(f"CUSTODIAN: {source.get('custodian', 'Unknown')}") print(f"HEADLINE: {source.get('headline', 'N/A')}") print(f"HERITAGE TYPE: {source.get('heritage_type', 'N/A')}") print("=" * 70) if results: print("\nLINKUP SEARCH RESULTS:") print("-" * 40) search_results = results.get("results", {}).get("results", []) for i, result in enumerate(search_results[:5], 1): print(f"\n[{i}] {result.get('name', 'N/A')}") print(f" URL: {result.get('url', 'N/A')}") content = result.get('content', '')[:200] if content: print(f" {content}...") else: print("\n[No Linkup results available]") print("\n" + "-" * 70) def get_user_input(prompt: str, valid_options: list = None, allow_empty: bool = False) -> str: """Get validated user input.""" while True: response = input(prompt).strip() if allow_empty and not response: return response if valid_options and response.lower() not in [o.lower() for o in valid_options]: print(f"Invalid option. Choose from: {', '.join(valid_options)}") continue return response def review_profile(json_file: Path) -> dict | None: """Interactively review a single profile.""" with open(json_file) as f: data = json.load(f) results = load_linkup_results(data) display_profile_summary(data, results) # Ask if heritage relevant print("\nIs this profile HERITAGE RELEVANT?") print(" y = Yes (museum, archive, library, etc. professional)") print(" n = No (non-heritage organization or support role)") print(" s = Skip (come back later)") print(" q = Quit review") choice = get_user_input("Choice [y/n/s/q]: ", ["y", "n", "s", "q"]) if choice == "q": return None if choice == "s": return {"status": "skipped"} is_heritage = choice == "y" if not is_heritage: # Mark as non-heritage rationale = get_user_input("Rationale for non-heritage classification: ") data["heritage_relevance"] = { "is_heritage_relevant": False, "heritage_types": [], "score": 0.0, "rationale": f"RECLASSIFIED AS NON-HERITAGE. {rationale}" } data["timeline_enrichment"]["confidence_score"] = 1.0 data["timeline_enrichment"]["notes"] = f"Manual review on {datetime.now().strftime('%Y-%m-%d')}: Non-heritage" else: # Get heritage score print("\nHeritage Score (0.0-1.0):") print(" 0.9-1.0 = Senior professional (curator, director, archivist)") print(" 0.7-0.8 = Mid-level heritage role (specialist, researcher)") print(" 0.5-0.6 = Entry-level or support role (assistant, intern)") print(" 0.3-0.4 = Heritage-adjacent (IT, admin in heritage org)") score_str = get_user_input("Score [0.0-1.0]: ") try: score = float(score_str) score = max(0.0, min(1.0, score)) except ValueError: score = 0.5 # Get heritage types print("\nHeritage Types (comma-separated: A=Archive, M=Museum, L=Library, D=Digital, etc.):") types_str = get_user_input("Types: ", allow_empty=True) types = [t.strip().upper() for t in types_str.split(",") if t.strip()] # Get rationale rationale = get_user_input("Brief rationale: ") data["heritage_relevance"] = { "is_heritage_relevant": True, "heritage_types": types, "score": score, "rationale": rationale } data["timeline_enrichment"]["confidence_score"] = 0.8 data["timeline_enrichment"]["notes"] = f"Manual review on {datetime.now().strftime('%Y-%m-%d')}: {rationale}" # Save updated profile with open(json_file, "w") as f: json.dump(data, f, indent=2) print(f"\n✓ Updated: {json_file}") return {"status": "updated", "is_heritage": is_heritage} def main(): parser = argparse.ArgumentParser(description="Review Linkup enrichment results") parser.add_argument("--limit", type=int, default=10, help="Maximum profiles to review") args = parser.parse_args() print("LinkedIn Profile Enrichment Review") print("=" * 50) candidates = find_enriched_profiles(limit=args.limit) print(f"Found {len(candidates)} profiles needing review") if not candidates: print("No profiles to review.") return reviewed = 0 skipped = 0 for json_file in candidates: result = review_profile(json_file) if result is None: # User quit break elif result.get("status") == "skipped": skipped += 1 else: reviewed += 1 print("\n" + "=" * 50) print(f"Reviewed: {reviewed}") print(f"Skipped: {skipped}") if __name__ == "__main__": main()