#!/usr/bin/env python3
"""
Review Linkup Enrichment Results

Interactive script to review profiles enriched via the Linkup API and record
a manual verdict for each one: heritage relevance, heritage score, heritage
types and a short rationale.

Usage:
    python scripts/review_timeline_enrichments.py [--limit N]

``--limit`` caps the number of profiles offered for review (default: 10).
"""
|
|
|
|
import json
|
|
import sys
|
|
import argparse
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Directory scanned by find_enriched_profiles() for profile JSON files.
ENTITY_DIR = Path("data") / "custodian" / "person" / "entity"
# NOTE(review): not referenced by any function visible in this file;
# presumably the location of stored Linkup/LinkedIn web results - confirm.
LINKUP_DIR = Path("data") / "custodian" / "web" / "linkedin"
|
|
|
|
|
|
def find_enriched_profiles(limit: int | None = None) -> list[Path]:
    """Find profiles that have Linkup enrichment but still need manual review.

    Scans ``ENTITY_DIR`` for ``*_20251214T*.json`` files (in sorted filename
    order, so repeated runs offer the same profiles) and keeps a file when it

    * holds a non-empty ``timeline_enrichment`` object,
    * has a ``confidence_score`` of at most 0.6 (``review_profile`` stores
      0.8 or 1.0 after a manual review, so a higher value means the profile
      was already reviewed), and
    * is not already marked non-heritage with a ``RECLASSIFIED`` rationale.

    Unreadable or malformed files are reported on stderr and skipped.

    Args:
        limit: Maximum number of candidates to return. ``None`` or ``0``
            disables the cap.

    Returns:
        Paths of the profiles awaiting review.
    """
    reviewed_threshold = 0.6
    candidates: list[Path] = []

    for json_file in sorted(ENTITY_DIR.glob("*_20251214T*.json")):
        try:
            with open(json_file, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping unreadable profile {json_file}: {exc}", file=sys.stderr)
            continue

        if not isinstance(data, dict):
            continue

        # Only profiles carrying Linkup enrichment are candidates.
        linkup = data.get("timeline_enrichment")
        if not isinstance(linkup, dict) or not linkup:
            continue

        # Skip if already manually reviewed (confidence > 0.6).
        confidence = linkup.get("confidence_score")
        if isinstance(confidence, (int, float)) and confidence > reviewed_threshold:
            continue

        # Skip if already marked non-heritage with a reclassification rationale.
        heritage = data.get("heritage_relevance")
        if not isinstance(heritage, dict):
            heritage = {}
        rationale = heritage.get("rationale") or ""
        if (
            not heritage.get("is_heritage_relevant", True)
            and "RECLASSIFIED" in str(rationale)
        ):
            continue

        candidates.append(json_file)
        if limit and len(candidates) >= limit:
            break

    return candidates
|
|
|
|
|
|
def load_linkup_results(profile_data: dict) -> dict | None:
|
|
"""Load the Linkup search results for a profile."""
|
|
linkup = profile_data.get("timeline_enrichment", {})
|
|
results_file = linkup.get("results_file")
|
|
|
|
if results_file and Path(results_file).exists():
|
|
with open(results_file) as f:
|
|
return json.load(f)
|
|
return None
|
|
|
|
|
|
def display_profile_summary(data: dict, results: dict | None):
|
|
"""Display profile and search results for review."""
|
|
source = data.get("source_staff_info", {})
|
|
profile = data.get("profile_data", {})
|
|
|
|
print("\n" + "=" * 70)
|
|
print(f"NAME: {source.get('name', 'Unknown')}")
|
|
print(f"CUSTODIAN: {source.get('custodian', 'Unknown')}")
|
|
print(f"HEADLINE: {source.get('headline', 'N/A')}")
|
|
print(f"HERITAGE TYPE: {source.get('heritage_type', 'N/A')}")
|
|
print("=" * 70)
|
|
|
|
if results:
|
|
print("\nLINKUP SEARCH RESULTS:")
|
|
print("-" * 40)
|
|
search_results = results.get("results", {}).get("results", [])
|
|
for i, result in enumerate(search_results[:5], 1):
|
|
print(f"\n[{i}] {result.get('name', 'N/A')}")
|
|
print(f" URL: {result.get('url', 'N/A')}")
|
|
content = result.get('content', '')[:200]
|
|
if content:
|
|
print(f" {content}...")
|
|
else:
|
|
print("\n[No Linkup results available]")
|
|
|
|
print("\n" + "-" * 70)
|
|
|
|
|
|
def get_user_input(prompt: str, valid_options: list = None, allow_empty: bool = False) -> str:
|
|
"""Get validated user input."""
|
|
while True:
|
|
response = input(prompt).strip()
|
|
if allow_empty and not response:
|
|
return response
|
|
if valid_options and response.lower() not in [o.lower() for o in valid_options]:
|
|
print(f"Invalid option. Choose from: {', '.join(valid_options)}")
|
|
continue
|
|
return response
|
|
|
|
|
|
def review_profile(json_file: Path) -> dict | None:
    """Interactively review a single profile and save the verdict.

    Shows the profile summary and Linkup results, asks whether the profile is
    heritage relevant and, depending on the answer, collects a rationale or a
    score, heritage types and rationale. The verdict is stored under
    ``heritage_relevance``; ``timeline_enrichment`` receives a
    ``confidence_score`` (1.0 for non-heritage, 0.8 for heritage) and a dated
    ``notes`` entry. The file is rewritten in place.

    Args:
        json_file: Path of the profile JSON file to review.

    Returns:
        ``None`` if the reviewer chose to quit, ``{"status": "skipped"}`` if
        the profile was skipped (file left untouched), otherwise
        ``{"status": "updated", "is_heritage": <bool>}``.
    """
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    results = load_linkup_results(data)
    display_profile_summary(data, results)

    # Ask if heritage relevant
    print("\nIs this profile HERITAGE RELEVANT?")
    print(" y = Yes (museum, archive, library, etc. professional)")
    print(" n = No (non-heritage organization or support role)")
    print(" s = Skip (come back later)")
    print(" q = Quit review")

    # Validation is case-insensitive, so normalise before comparing;
    # otherwise "Y" or "Q" would fall through to the non-heritage branch.
    choice = get_user_input("Choice [y/n/s/q]: ", ["y", "n", "s", "q"]).lower()

    if choice == "q":
        return None
    if choice == "s":
        return {"status": "skipped"}

    is_heritage = choice == "y"

    if not is_heritage:
        # Mark as non-heritage
        rationale = get_user_input("Rationale for non-heritage classification: ")
        relevance = {
            "is_heritage_relevant": False,
            "heritage_types": [],
            "score": 0.0,
            "rationale": f"RECLASSIFIED AS NON-HERITAGE. {rationale}",
        }
        confidence = 1.0
        note_detail = "Non-heritage"
    else:
        # Get heritage score
        print("\nHeritage Score (0.0-1.0):")
        print(" 0.9-1.0 = Senior professional (curator, director, archivist)")
        print(" 0.7-0.8 = Mid-level heritage role (specialist, researcher)")
        print(" 0.5-0.6 = Entry-level or support role (assistant, intern)")
        print(" 0.3-0.4 = Heritage-adjacent (IT, admin in heritage org)")
        score = _prompt_heritage_score()

        # Get heritage types
        print("\nHeritage Types (comma-separated: A=Archive, M=Museum, L=Library, D=Digital, etc.):")
        types_str = get_user_input("Types: ", allow_empty=True)
        types = [t.strip().upper() for t in types_str.split(",") if t.strip()]

        # Get rationale
        rationale = get_user_input("Brief rationale: ")

        relevance = {
            "is_heritage_relevant": True,
            "heritage_types": types,
            "score": score,
            "rationale": rationale,
        }
        confidence = 0.8
        note_detail = rationale

    data["heritage_relevance"] = relevance

    # Tolerate a missing or null enrichment object instead of raising.
    enrichment = data.get("timeline_enrichment")
    if not isinstance(enrichment, dict):
        enrichment = {}
        data["timeline_enrichment"] = enrichment
    enrichment["confidence_score"] = confidence
    enrichment["notes"] = f"Manual review on {datetime.now().strftime('%Y-%m-%d')}: {note_detail}"

    # Serialise before opening for writing so a serialisation error cannot
    # leave a truncated profile on disk.
    payload = json.dumps(data, indent=2)
    with open(json_file, "w", encoding="utf-8") as f:
        f.write(payload)

    print(f"\n✓ Updated: {json_file}")
    return {"status": "updated", "is_heritage": is_heritage}


def _prompt_heritage_score() -> float:
    """Prompt until the reviewer enters a number between 0.0 and 1.0."""
    while True:
        raw = get_user_input("Score [0.0-1.0]: ")
        try:
            score = float(raw)
        except ValueError:
            score = None
        # The range test also rejects NaN, which would not be valid JSON.
        if score is not None and 0.0 <= score <= 1.0:
            return score
        print("Invalid score. Enter a number between 0.0 and 1.0.")
|
|
|
|
|
|
def main():
    """Command-line entry point for the interactive review session.

    Parses ``--limit`` (default 10), collects the profiles that still need
    review, walks the reviewer through them until the list is exhausted or
    the reviewer quits, and finally prints how many were reviewed and skipped.
    """
    cli = argparse.ArgumentParser(description="Review Linkup enrichment results")
    cli.add_argument("--limit", type=int, default=10, help="Maximum profiles to review")
    options = cli.parse_args()

    banner = "=" * 50
    print("LinkedIn Profile Enrichment Review")
    print(banner)

    pending = find_enriched_profiles(limit=options.limit)
    print(f"Found {len(pending)} profiles needing review")

    if pending:
        tally = {"reviewed": 0, "skipped": 0}

        for profile_path in pending:
            outcome = review_profile(profile_path)
            if outcome is None:
                # Reviewer asked to quit; keep the counts gathered so far.
                break
            bucket = "skipped" if outcome.get("status") == "skipped" else "reviewed"
            tally[bucket] += 1

        print("\n" + banner)
        print(f"Reviewed: {tally['reviewed']}")
        print(f"Skipped: {tally['skipped']}")
    else:
        print("No profiles to review.")


# Start the review session only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|