glam/scripts/enrich_ppids_linkup.py

#!/usr/bin/env python3
"""
PPID Enrichment via Linkup Web Search (Rule 34 & 44 Compliant)

Uses Linkup search to find birth years and biographical data from:
- Academic profiles (university pages, ResearchGate, Academia.edu)
- News articles and press releases
- Institutional websites
- Wikipedia, Wikidata

Per Rule 34: Linkup is the preferred web scraper.
Per Rule 44: Birth dates use EDTF notation with web search enrichment.
Per Rule 45: All inferred data includes explicit provenance.

Usage:
    python scripts/enrich_ppids_linkup.py [--limit N] [--dry-run]
"""

import json
import os
import re
import time
import argparse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
import httpx

# Linkup API configuration
LINKUP_API_URL = "https://api.linkup.so/v1/search"


def get_linkup_api_key() -> str:
    """Get Linkup API key from environment."""
    # Try .env file first
    env_path = Path(__file__).parent.parent / ".env"
    if env_path.exists():
        with open(env_path) as f:
            for line in f:
                if line.startswith("LINKUP_API_KEY="):
                    return line.strip().split("=", 1)[1].strip('"\'')

    # Fall back to environment variable
    key = os.environ.get("LINKUP_API_KEY", "")
    if not key:
        raise ValueError("LINKUP_API_KEY not found in .env or environment")
    return key


def search_linkup(query: str, api_key: str, depth: str = "standard") -> Dict[str, Any]:
    """Execute Linkup search query.

    Returns dict with 'answer' (synthesized response) and 'sources' (list of source URLs).
    The MCP tool returns 'results' but the API returns 'answer' + 'sources'.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "q": query,
        "depth": depth,
        "outputType": "sourcedAnswer"
    }

    try:
        with httpx.Client(timeout=30.0) as client:
            response = client.post(LINKUP_API_URL, headers=headers, json=payload)
            response.raise_for_status()
            return response.json()
    except Exception as e:
        return {"error": str(e)}


def extract_birth_year_from_text(text: str, name: str) -> Optional[Tuple[int, str, float]]:
    """
    Extract birth year from text mentioning the person.
    Returns (year, source_snippet, confidence) or None.
    """
    if not text or not name:
        return None

    # Get name parts for matching
    name_parts = name.lower().split()
    last_name = name_parts[-1] if name_parts else ""

    # Patterns to find birth year (ordered by specificity)
    patterns = [
        # "born on 11 February 1948" or "born December 3, 1951"
        (r'born\s+(?:on\s+)?(?:\d{1,2}\s+)?\w+\s+(?:\d{1,2},?\s+)?(\d{4})', 0.95),
        # "was born in 1955" or "born in Amsterdam in 1955"
        (r'(?:was\s+)?born\s+(?:in\s+\w+\s+)?in\s+(\d{4})', 0.95),
        # "geboren in 1955" (Dutch)
        (r'geboren\s+(?:in\s+)?(\d{4})', 0.95),
        # "Name (born 1951)"
        (r'\(born\s+(\d{4})\)', 0.95),
        # "Name (1951)" - common Wikipedia format
        (r'\((\d{4})\)', 0.90),
        # "born in 1951"
        (r'born\s+(?:in\s+)?(\d{4})', 0.90),
        # "Name, born in New York City, USA, in 1951"
        (r'born\s+in\s+[\w\s,]+,?\s+in\s+(\d{4})', 0.85),
        # Fallback: just find a year after "born"
        (r'born.*?(\d{4})', 0.80),
    ]

    for pattern, confidence in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            year = int(match.group(1))
            if 1920 <= year <= 2010:  # Reasonable birth year range
                # Get context around match
                start = max(0, match.start() - 50)
                end = min(len(text), match.end() + 50)
                snippet = text[start:end].strip()
                return (year, snippet, confidence)

    return None


def search_person_birth_year(name: str, affiliations: List[str], api_key: str) -> Optional[Dict[str, Any]]:
    """
    Search for person's birth year using Linkup.
    The API returns 'answer' (synthesized) and 'sources' (URLs).
    """
    # Build search query with context
    affiliation_context = ""
    if affiliations:
        # Use first heritage-related affiliation
        for aff in affiliations[:2]:
            if any(keyword in aff.lower() for keyword in
                   ['museum', 'archive', 'library', 'university', 'heritage', 'curator']):
                affiliation_context = aff
                break
        if not affiliation_context and affiliations:
            affiliation_context = affiliations[0]

    # Search queries to try
    queries = [
        f'"{name}" born biography {affiliation_context}',
        f'"{name}" biography age born year',
    ]

    for query in queries:
        result = search_linkup(query, api_key)

        if "error" in result:
            continue

        # The API returns 'answer' field with synthesized response
        answer = result.get("answer", "")
        if answer:
            birth_info = extract_birth_year_from_text(answer, name)
            if birth_info:
                year, snippet, confidence = birth_info
                # Get first source URL if available
                sources = result.get("sources", [])
                source_url = sources[0].get("url", "") if sources else ""
                source_name = sources[0].get("name", "") if sources else ""

                return {
                    "birth_year": year,
                    "edtf": str(year),
                    "source_snippet": snippet,
                    "source_url": source_url,
                    "source_title": source_name,
                    "confidence": confidence,
                    "search_query": query,
                    "source_type": "linkup_answer"
                }

        # Rate limit
        time.sleep(0.5)

    return None


def enrich_ppid_file(filepath: Path, api_key: str, dry_run: bool = False) -> Dict[str, Any]:
    """
    Enrich a single PPID file with Linkup search data.
    Returns enrichment result.
    """
    with open(filepath) as f:
        data = json.load(f)

    # Skip if already has confirmed birth year
    birth_date = data.get("birth_date", {})
    if birth_date.get("edtf") and birth_date.get("edtf") != "XXXX":
        if not birth_date.get("edtf", "").endswith("X"):
            return {"status": "skipped", "reason": "already_has_birth_year"}

    # Get name
    name_data = data.get("name", {})
    full_name = name_data.get("full_name") or name_data.get("display_name", "")
    if not full_name or full_name == "LinkedIn Member":
        return {"status": "skipped", "reason": "no_name"}

    # Skip if not heritage relevant
    heritage = data.get("heritage_relevance", {})
    if not heritage.get("is_heritage_relevant"):
        return {"status": "skipped", "reason": "not_heritage_relevant"}

    # Get affiliations for context
    affiliations = []
    for aff in data.get("affiliations", []):
        if isinstance(aff, dict):
            org = aff.get("organization") or aff.get("company", "")
            if org:
                affiliations.append(org)

    # Also check profile_data
    profile = data.get("profile_data", {})
    headline = profile.get("headline", "")
    if headline:
        affiliations.insert(0, headline)

    if not affiliations:
        return {"status": "skipped", "reason": "no_affiliations"}

    # Search for birth year
    result = search_person_birth_year(full_name, affiliations, api_key)

    if not result:
        return {"status": "not_found", "name": full_name}

    # Build enrichment data with provenance (Rule 45)
    timestamp = datetime.now(timezone.utc).isoformat()

    enrichment = {
        "web_search_enrichment": {
            "birth_year_discovery": {
                "value": result["birth_year"],
                "edtf": result["edtf"],
                "confidence": result["confidence"],
                "provenance": {
                    "statement_created_at": timestamp,
                    "source_archived_at": timestamp,  # Search result is ephemeral
                    "retrieval_agent": "enrich_ppids_linkup.py",
                    "method": "linkup_web_search",
                    "search_query": result["search_query"],
                    "source_url": result.get("source_url", ""),
                    "source_title": result.get("source_title", ""),
                    "source_snippet": result["source_snippet"],
                    "source_type": result["source_type"]
                }
            }
        }
    }

    if not dry_run:
        # Merge with existing data
        if "web_search_enrichment" not in data:
            data["web_search_enrichment"] = {}
        data["web_search_enrichment"]["birth_year_discovery"] = enrichment["web_search_enrichment"]["birth_year_discovery"]

        # Update birth_date if we found a specific year (better than XXXX or decade)
        current_birth = data.get("birth_date", {}).get("edtf", "XXXX")
        if current_birth == "XXXX" or current_birth.endswith("X"):
            if result["confidence"] >= 0.80:
                data["birth_date"] = {
                    "edtf": result["edtf"],
                    "precision": "year",
                    "source": "web_search_enrichment",
                    "confidence": result["confidence"]
                }

        # Save
        with open(filepath, "w") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return {
        "status": "enriched",
        "name": full_name,
        "birth_year": result["birth_year"],
        "confidence": result["confidence"],
        "source": result.get("source_url", result["source_type"])
    }


def main():
    parser = argparse.ArgumentParser(description="Enrich PPID files using Linkup web search")
    parser.add_argument("--limit", type=int, default=10, help="Maximum files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--min-confidence", type=float, default=0.70, help="Minimum confidence threshold")
    parser.add_argument("--heritage-only", action="store_true", default=True, help="Only process heritage-relevant profiles")
    args = parser.parse_args()

    # Get API key
    try:
        api_key = get_linkup_api_key()
        print(f"✓ Linkup API key loaded")
    except ValueError as e:
        print(f"✗ {e}")
        return

    # Find PPID files
    ppid_dir = Path(__file__).parent.parent / "data" / "person"
    if not ppid_dir.exists():
        print(f"✗ PPID directory not found: {ppid_dir}")
        return

    ppid_files = list(ppid_dir.glob("ID_*.json"))
    print(f"Found {len(ppid_files)} PPID files")

    # Filter to files needing enrichment (unknown or decade-only birth dates)
    candidates = []
    for f in ppid_files:
        try:
            with open(f) as fp:
                data = json.load(fp)

            # Check heritage relevance
            if args.heritage_only:
                heritage = data.get("heritage_relevance", {})
                if not heritage.get("is_heritage_relevant"):
                    continue

            # Check if birth date needs enrichment
            birth = data.get("birth_date", {}).get("edtf", "XXXX")
            if birth == "XXXX" or birth.endswith("X"):
                # Prioritize those with good names
                name = data.get("name", {}).get("full_name", "")
                if name and name != "LinkedIn Member":
                    candidates.append(f)
        except:
            continue

    print(f"Found {len(candidates)} files needing birth year enrichment")

    # Process
    stats = {"enriched": 0, "not_found": 0, "skipped": 0, "errors": 0}
    results = []

    for i, filepath in enumerate(candidates[:args.limit]):
        print(f"\n[{i+1}/{min(len(candidates), args.limit)}] Processing {filepath.name}...")

        try:
            result = enrich_ppid_file(filepath, api_key, args.dry_run)
            stats[result.get("status", "errors")] = stats.get(result.get("status", "errors"), 0) + 1

            if result["status"] == "enriched":
                print(f"  ✓ Found birth year: {result['birth_year']} (confidence: {result['confidence']:.0%})")
                results.append(result)
            elif result["status"] == "not_found":
                print(f"  ✗ No birth year found for {result.get('name', 'unknown')}")
            else:
                print(f"  - Skipped: {result.get('reason', 'unknown')}")

            # Rate limit
            time.sleep(1.0)

        except Exception as e:
            print(f"  ✗ Error: {e}")
            stats["errors"] += 1

    # Summary
    print(f"\n{'='*50}")
    print("ENRICHMENT SUMMARY")
    print(f"{'='*50}")
    print(f"Processed: {sum(stats.values())}")
    print(f"Enriched:  {stats['enriched']}")
    print(f"Not found: {stats['not_found']}")
    print(f"Skipped:   {stats['skipped']}")
    print(f"Errors:    {stats['errors']}")

    if results:
        print(f"\nEnriched profiles:")
        for r in results:
            print(f"  - {r['name']}: born {r['birth_year']} ({r['confidence']:.0%})")


if __name__ == "__main__":
    main()