glam/scripts/review_linkup_enrichments.py
2026-01-02 02:11:04 +01:00

216 lines
7.2 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Review Linkup Enrichment Results
Interactive script to review profiles enriched via Linkup API and assign
proper heritage scores and experience data.
Usage:
python scripts/review_linkup_enrichments.py [--limit N]
"""
import json
import sys
import argparse
from datetime import datetime, timezone
from pathlib import Path
# Directory that find_enriched_profiles() globs for profile JSON files.
# The path is relative, so it is resolved against the current working directory.
ENTITY_DIR = Path("data/custodian/person/entity")
# Not referenced anywhere in the visible part of this script; presumably the
# location of stored Linkup/LinkedIn web results -- TODO confirm or remove.
LINKUP_DIR = Path("data/custodian/web/linkedin")
def find_enriched_profiles(limit: int = None) -> list[Path]:
"""Find profiles that have Linkup enrichment but need manual review."""
candidates = []
for json_file in ENTITY_DIR.glob("*_20251214T*.json"):
try:
with open(json_file) as f:
data = json.load(f)
# Check if has Linkup enrichment that needs review
linkup = data.get("timeline_enrichment", {})
if not linkup:
continue
# Skip if already manually reviewed (confidence > 0.5)
if linkup.get("confidence_score", 0) > 0.6:
continue
# Skip if already marked non-heritage with rationale
heritage = data.get("heritage_relevance", {})
if not heritage.get("is_heritage_relevant", True) and "RECLASSIFIED" in heritage.get("rationale", ""):
continue
candidates.append(json_file)
if limit and len(candidates) >= limit:
break
except (json.JSONDecodeError, KeyError) as e:
continue
return candidates
def load_linkup_results(profile_data: dict) -> dict | None:
"""Load the Linkup search results for a profile."""
linkup = profile_data.get("timeline_enrichment", {})
results_file = linkup.get("results_file")
if results_file and Path(results_file).exists():
with open(results_file) as f:
return json.load(f)
return None
def display_profile_summary(data: dict, results: dict | None):
"""Display profile and search results for review."""
source = data.get("source_staff_info", {})
profile = data.get("profile_data", {})
print("\n" + "=" * 70)
print(f"NAME: {source.get('name', 'Unknown')}")
print(f"CUSTODIAN: {source.get('custodian', 'Unknown')}")
print(f"HEADLINE: {source.get('headline', 'N/A')}")
print(f"HERITAGE TYPE: {source.get('heritage_type', 'N/A')}")
print("=" * 70)
if results:
print("\nLINKUP SEARCH RESULTS:")
print("-" * 40)
search_results = results.get("results", {}).get("results", [])
for i, result in enumerate(search_results[:5], 1):
print(f"\n[{i}] {result.get('name', 'N/A')}")
print(f" URL: {result.get('url', 'N/A')}")
content = result.get('content', '')[:200]
if content:
print(f" {content}...")
else:
print("\n[No Linkup results available]")
print("\n" + "-" * 70)
def get_user_input(prompt: str, valid_options: list = None, allow_empty: bool = False) -> str:
"""Get validated user input."""
while True:
response = input(prompt).strip()
if allow_empty and not response:
return response
if valid_options and response.lower() not in [o.lower() for o in valid_options]:
print(f"Invalid option. Choose from: {', '.join(valid_options)}")
continue
return response
def _parse_heritage_score(raw: str, default: float = 0.5) -> float | None:
    """Turn typed score text into a float clamped to [0.0, 1.0], or None if unusable."""
    import math  # local import: only this helper needs it

    text = raw.strip()
    if not text:
        # An empty answer keeps the long-standing fallback score.
        return default
    try:
        value = float(text)
    except ValueError:
        return None
    # float() accepts "nan", which would slip through the clamp below as 1.0.
    if math.isnan(value):
        return None
    return max(0.0, min(1.0, value))


def review_profile(json_file: Path) -> dict | None:
    """Interactively review a single profile and persist the verdict.

    Shows the profile summary and its Linkup results, asks the reviewer for a
    heritage verdict, then rewrites the profile's ``heritage_relevance`` and
    the ``confidence_score`` / ``notes`` fields of ``timeline_enrichment``.

    Args:
        json_file: Path of the profile JSON file to review.

    Returns:
        ``None`` when the reviewer chooses to quit, ``{"status": "skipped"}``
        when the profile is skipped (the file is left untouched), otherwise
        ``{"status": "updated", "is_heritage": bool}`` after the file has been
        rewritten.
    """
    json_file = Path(json_file)
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    results = load_linkup_results(data)
    display_profile_summary(data, results)

    # Ask if heritage relevant
    print("\nIs this profile HERITAGE RELEVANT?")
    print(" y = Yes (museum, archive, library, etc. professional)")
    print(" n = No (non-heritage organization or support role)")
    print(" s = Skip (come back later)")
    print(" q = Quit review")
    # Lower-case the answer: the prompt validates case-insensitively, and an
    # uppercase "Y" or "Q" must not fall through to the non-heritage branch.
    choice = get_user_input("Choice [y/n/s/q]: ", ["y", "n", "s", "q"]).lower()
    if choice == "q":
        return None
    if choice == "s":
        return {"status": "skipped"}

    is_heritage = choice == "y"
    review_date = datetime.now().strftime("%Y-%m-%d")

    # Make sure there is an enrichment object to annotate.
    enrichment = data.get("timeline_enrichment")
    if not isinstance(enrichment, dict):
        enrichment = {}
        data["timeline_enrichment"] = enrichment

    if not is_heritage:
        # Mark as non-heritage
        rationale = get_user_input("Rationale for non-heritage classification: ")
        data["heritage_relevance"] = {
            "is_heritage_relevant": False,
            "heritage_types": [],
            "score": 0.0,
            "rationale": f"RECLASSIFIED AS NON-HERITAGE. {rationale}",
        }
        enrichment["confidence_score"] = 1.0
        enrichment["notes"] = f"Manual review on {review_date}: Non-heritage"
    else:
        # Get heritage score
        print("\nHeritage Score (0.0-1.0):")
        print(" 0.9-1.0 = Senior professional (curator, director, archivist)")
        print(" 0.7-0.8 = Mid-level heritage role (specialist, researcher)")
        print(" 0.5-0.6 = Entry-level or support role (assistant, intern)")
        print(" 0.3-0.4 = Heritage-adjacent (IT, admin in heritage org)")
        # Re-prompt on a typo instead of silently recording 0.5.
        while True:
            score = _parse_heritage_score(get_user_input("Score [0.0-1.0]: ", allow_empty=True))
            if score is not None:
                break
            print("Invalid score. Enter a number between 0.0 and 1.0.")

        # Get heritage types
        print("\nHeritage Types (comma-separated: A=Archive, M=Museum, L=Library, D=Digital, etc.):")
        types_str = get_user_input("Types: ", allow_empty=True)
        types = [t.strip().upper() for t in types_str.split(",") if t.strip()]

        # Get rationale
        rationale = get_user_input("Brief rationale: ")
        data["heritage_relevance"] = {
            "is_heritage_relevant": True,
            "heritage_types": types,
            "score": score,
            "rationale": rationale,
        }
        enrichment["confidence_score"] = 0.8
        enrichment["notes"] = f"Manual review on {review_date}: {rationale}"

    # Save updated profile. Serialise first and swap the file in with a rename,
    # so a failure or Ctrl-C mid-write cannot leave a truncated profile behind.
    payload = json.dumps(data, indent=2)
    tmp_file = json_file.with_name(json_file.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    tmp_file.replace(json_file)

    print(f"\n✓ Updated: {json_file}")
    return {"status": "updated", "is_heritage": is_heritage}
def main() -> None:
    """Run the interactive review session from the command line.

    Parses ``--limit`` (default 10), collects the profiles that still need
    review, walks the reviewer through them one by one, and prints how many
    were reviewed and skipped. The session ends early when the reviewer quits,
    or when input is interrupted (Ctrl-C) or exhausted (EOF).
    """
    parser = argparse.ArgumentParser(description="Review Linkup enrichment results")
    parser.add_argument("--limit", type=int, default=10, help="Maximum profiles to review")
    args = parser.parse_args()

    print("LinkedIn Profile Enrichment Review")
    print("=" * 50)

    candidates = find_enriched_profiles(limit=args.limit)
    print(f"Found {len(candidates)} profiles needing review")
    if not candidates:
        print("No profiles to review.")
        return

    reviewed = 0
    skipped = 0
    try:
        for json_file in candidates:
            result = review_profile(json_file)
            if result is None:  # User quit
                break
            if result.get("status") == "skipped":
                skipped += 1
            else:
                reviewed += 1
    except (KeyboardInterrupt, EOFError):
        # Treat Ctrl-C / end of input at a prompt like "quit", so the summary
        # below is still printed instead of a traceback.
        print("\nReview interrupted.")

    print("\n" + "=" * 50)
    print(f"Reviewed: {reviewed}")
    print(f"Skipped: {skipped}")
# Start the interactive review only when executed as a script, not on import.
if __name__ == "__main__":
    main()