glam/scripts/review_linkup_enrichments.py

#!/usr/bin/env python3
"""
Review Linkup Enrichment Results

Interactive script to review profiles enriched via Linkup API and assign
proper heritage scores and experience data.

Usage:
    python scripts/review_linkup_enrichments.py [--limit N]
"""

import json
import sys
import argparse
from datetime import datetime, timezone
from pathlib import Path

# Directory globbed by find_enriched_profiles() for profile JSON files.
# Relative path, so it is resolved against the current working directory.
ENTITY_DIR = Path("data/custodian/person/entity")
# Not referenced by any function visible in this script; presumably the
# location of the Linkup search-result files -- TODO confirm before relying on it.
LINKUP_DIR = Path("data/custodian/web/linkedin")


def find_enriched_profiles(limit: int | None = None) -> list[Path]:
    """Find profiles that have Linkup enrichment but still need manual review.

    Scans ``ENTITY_DIR`` for ``*_20251214T*.json`` files (in sorted order, so
    repeated runs with the same ``limit`` pick the same files) and keeps a
    file when all of the following hold:

    * it contains a non-empty ``timeline_enrichment`` object;
    * its ``confidence_score`` is not above 0.6 (a missing or non-numeric
      score counts as "needs review");
    * it is not already marked non-heritage with a rationale containing
      ``"RECLASSIFIED"``.

    Files that cannot be read or parsed are reported on stderr and skipped.

    Args:
        limit: Maximum number of paths to return. ``None`` or ``0`` means
            no limit.

    Returns:
        Paths of the profile files that still need review.
    """
    candidates: list[Path] = []

    # glob() yields files in arbitrary order; sort for reproducible batches.
    for json_file in sorted(ENTITY_DIR.glob("*_20251214T*.json")):
        try:
            with open(json_file, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping unreadable profile {json_file}: {exc}", file=sys.stderr)
            continue

        if not isinstance(data, dict):
            continue

        # Only profiles carrying Linkup enrichment are candidates.
        linkup = data.get("timeline_enrichment")
        if not isinstance(linkup, dict) or not linkup:
            continue

        # Skip if already manually reviewed (confidence > 0.6).
        confidence = linkup.get("confidence_score")
        if isinstance(confidence, (int, float)) and confidence > 0.6:
            continue

        # Skip if already marked non-heritage with a reclassification rationale.
        heritage = data.get("heritage_relevance")
        if isinstance(heritage, dict):
            rationale = heritage.get("rationale") or ""
            if not heritage.get("is_heritage_relevant", True) and "RECLASSIFIED" in str(rationale):
                continue

        candidates.append(json_file)

        if limit and len(candidates) >= limit:
            break

    return candidates


def load_linkup_results(profile_data: dict) -> dict | None:
    """Load the Linkup search results for a profile."""
    linkup = profile_data.get("timeline_enrichment", {})
    results_file = linkup.get("results_file")

    if results_file and Path(results_file).exists():
        with open(results_file) as f:
            return json.load(f)
    return None


def display_profile_summary(data: dict, results: dict | None):
    """Display profile and search results for review."""
    source = data.get("source_staff_info", {})
    profile = data.get("profile_data", {})

    print("\n" + "=" * 70)
    print(f"NAME: {source.get('name', 'Unknown')}")
    print(f"CUSTODIAN: {source.get('custodian', 'Unknown')}")
    print(f"HEADLINE: {source.get('headline', 'N/A')}")
    print(f"HERITAGE TYPE: {source.get('heritage_type', 'N/A')}")
    print("=" * 70)

    if results:
        print("\nLINKUP SEARCH RESULTS:")
        print("-" * 40)
        search_results = results.get("results", {}).get("results", [])
        for i, result in enumerate(search_results[:5], 1):
            print(f"\n[{i}] {result.get('name', 'N/A')}")
            print(f"    URL: {result.get('url', 'N/A')}")
            content = result.get('content', '')[:200]
            if content:
                print(f"    {content}...")
    else:
        print("\n[No Linkup results available]")

    print("\n" + "-" * 70)


def get_user_input(prompt: str, valid_options: list = None, allow_empty: bool = False) -> str:
    """Get validated user input."""
    while True:
        response = input(prompt).strip()
        if allow_empty and not response:
            return response
        if valid_options and response.lower() not in [o.lower() for o in valid_options]:
            print(f"Invalid option. Choose from: {', '.join(valid_options)}")
            continue
        return response


def review_profile(json_file: Path) -> dict | None:
    """Interactively review a single profile and save the reviewer's verdict.

    Loads the profile, shows its summary and Linkup results, then asks
    whether it is heritage relevant. For "yes" or "no" the profile's
    ``heritage_relevance`` entry is replaced, ``timeline_enrichment`` gets a
    new ``confidence_score`` (0.8 for heritage, 1.0 for non-heritage) and a
    dated ``notes`` entry, and the file is rewritten in place.

    Args:
        json_file: Path of the profile JSON file to review.

    Returns:
        ``None`` if the reviewer chose to quit, ``{"status": "skipped"}`` if
        the profile was skipped (file untouched), otherwise
        ``{"status": "updated", "is_heritage": <bool>}``.
    """
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    results = load_linkup_results(data)
    display_profile_summary(data, results)

    # Ask if heritage relevant
    print("\nIs this profile HERITAGE RELEVANT?")
    print("  y = Yes (museum, archive, library, etc. professional)")
    print("  n = No (non-heritage organization or support role)")
    print("  s = Skip (come back later)")
    print("  q = Quit review")

    # Normalise case before comparing so that "Y", "S" or "Q" can never fall
    # through to the non-heritage branch below.
    choice = get_user_input("Choice [y/n/s/q]: ", ["y", "n", "s", "q"]).lower()

    if choice == "q":
        return None
    if choice == "s":
        return {"status": "skipped"}

    is_heritage = choice == "y"

    if is_heritage:
        # Get heritage score
        print("\nHeritage Score (0.0-1.0):")
        print("  0.9-1.0 = Senior professional (curator, director, archivist)")
        print("  0.7-0.8 = Mid-level heritage role (specialist, researcher)")
        print("  0.5-0.6 = Entry-level or support role (assistant, intern)")
        print("  0.3-0.4 = Heritage-adjacent (IT, admin in heritage org)")

        score_str = get_user_input("Score [0.0-1.0]: ")
        try:
            score = float(score_str)
            if score != score:  # NaN would otherwise clamp to 1.0 unnoticed
                raise ValueError(score_str)
            score = max(0.0, min(1.0, score))
        except ValueError:
            score = 0.5
            print(f"Could not parse score {score_str!r}; using {score}.")

        # Get heritage types
        print("\nHeritage Types (comma-separated: A=Archive, M=Museum, L=Library, D=Digital, etc.):")
        types_str = get_user_input("Types: ", allow_empty=True)
        types = [t.strip().upper() for t in types_str.split(",") if t.strip()]

        # Get rationale
        rationale = get_user_input("Brief rationale: ")

        relevance = {
            "is_heritage_relevant": True,
            "heritage_types": types,
            "score": score,
            "rationale": rationale
        }
        confidence = 0.8
        note = rationale
    else:
        # Mark as non-heritage
        rationale = get_user_input("Rationale for non-heritage classification: ")
        relevance = {
            "is_heritage_relevant": False,
            "heritage_types": [],
            "score": 0.0,
            "rationale": f"RECLASSIFIED AS NON-HERITAGE. {rationale}"
        }
        confidence = 1.0
        note = "Non-heritage"

    # Tolerate a missing or null enrichment entry rather than failing after
    # the reviewer has already answered every prompt.
    enrichment = data.get("timeline_enrichment")
    if not isinstance(enrichment, dict):
        enrichment = data["timeline_enrichment"] = {}

    data["heritage_relevance"] = relevance
    enrichment["confidence_score"] = confidence
    enrichment["notes"] = f"Manual review on {datetime.now().strftime('%Y-%m-%d')}: {note}"

    # Save updated profile
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    print(f"\n✓ Updated: {json_file}")
    return {"status": "updated", "is_heritage": is_heritage}


def main():
    """Command-line entry point: review up to ``--limit`` pending profiles.

    Collects the profiles that still need review, walks through them one by
    one with ``review_profile`` and stops early if the reviewer quits. A
    short reviewed/skipped tally is printed at the end.
    """
    cli = argparse.ArgumentParser(description="Review Linkup enrichment results")
    cli.add_argument("--limit", type=int, default=10, help="Maximum profiles to review")
    options = cli.parse_args()

    print("LinkedIn Profile Enrichment Review")
    print("=" * 50)

    pending = find_enriched_profiles(limit=options.limit)
    print(f"Found {len(pending)} profiles needing review")

    if len(pending) == 0:
        print("No profiles to review.")
        return

    tally = {"reviewed": 0, "skipped": 0}

    for profile_path in pending:
        outcome = review_profile(profile_path)

        # None signals that the reviewer asked to quit.
        if outcome is None:
            break

        bucket = "skipped" if outcome.get("status") == "skipped" else "reviewed"
        tally[bucket] += 1

    print("\n" + "=" * 50)
    print(f"Reviewed: {tally['reviewed']}")
    print(f"Skipped: {tally['skipped']}")

# Start the interactive review only when run as a script, not on import.
if __name__ == "__main__":
    main()