# Change summary:
# - Add inferred birth dates using EDTF notation
# - Add inferred birth/current settlements
# - Enrich employment history with temporal data
# - Add heritage sector relevance scores
# - Improve PPID component tracking
# - Update .gitignore with large file patterns (warc, nt, trix, geonames.db)
#!/usr/bin/env python3
|
||
"""
|
||
Comprehensive Person Profile Enrichment via Linkup Web Search
|
||
|
||
This script enriches person profiles with ALL discoverable data from web sources,
|
||
with FULL PROVENANCE for every claim. No data is stored without a verifiable source.
|
||
|
||
Rule Compliance:
|
||
- Rule 6: WebObservation Claims MUST Have XPath Provenance (adapted for web search)
|
||
- Rule 21: Data Fabrication is Strictly Prohibited
|
||
- Rule 26: Person Data Provenance - Web Claims for Staff Information
|
||
- Rule 34: Linkup is the Preferred Web Scraper
|
||
- Rule 35: Provenance Statements MUST Have Dual Timestamps
|
||
|
||
Data Extracted (when available):
|
||
- Birth date/year, birth location
|
||
- Education history, career milestones
|
||
- Publications, awards/honors
|
||
- Professional affiliations
|
||
- Death date (if applicable)
|
||
- Contact details (email, phone, social media)
|
||
- Media references (photos, videos, portraits)
|
||
|
||
Usage:
|
||
python scripts/enrich_person_comprehensive.py --limit N [--dry-run]
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
import time
|
||
import argparse
|
||
import hashlib
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Optional, Dict, Any, List
|
||
import httpx
|
||
|
||
LINKUP_API_URL = "https://api.linkup.so/v1/search"
|
||
SCRIPT_VERSION = "1.2.0"
|
||
|
||
|
||
def get_linkup_api_key() -> str:
    """Return the Linkup API key from the project-root .env file or the environment.

    The .env file (one directory above this script) takes precedence over the
    LINKUP_API_KEY environment variable.

    Raises:
        ValueError: when no key is found in either location.
    """
    dotenv = Path(__file__).parent.parent / ".env"
    if dotenv.exists():
        with open(dotenv) as fh:
            for raw in fh:
                if not raw.startswith("LINKUP_API_KEY="):
                    continue
                # Value is everything after the first "=", with optional quotes.
                _, _, value = raw.strip().partition("=")
                return value.strip('"\'')
    fallback = os.environ.get("LINKUP_API_KEY", "")
    if fallback:
        return fallback
    raise ValueError("LINKUP_API_KEY not found")
|
||
|
||
|
||
def search_linkup(query: str, api_key: str, depth: str = "standard") -> Dict[str, Any]:
    """POST a search query to the Linkup API and return its JSON response.

    The returned dict is annotated with a ``_meta`` entry carrying dual
    timestamps (request/response, Rule 35), the HTTP status, and the search
    depth.  Errors are not raised: failures come back as
    ``{"error": ..., "_meta": {"request_ts": ...}}``.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    body = {"q": query, "depth": depth, "outputType": "sourcedAnswer"}
    started_at = datetime.now(timezone.utc).isoformat()
    try:
        with httpx.Client(timeout=45.0) as client:
            resp = client.post(LINKUP_API_URL, headers=headers, json=body)
            resp.raise_for_status()
            payload = resp.json()
        payload["_meta"] = {
            "request_ts": started_at,
            "response_ts": datetime.now(timezone.utc).isoformat(),
            "status": resp.status_code,
            "depth": depth,
        }
        return payload
    except Exception as exc:
        return {"error": str(exc), "_meta": {"request_ts": started_at}}
|
||
|
||
|
||
def create_claim(claim_type: str, claim_value: Any, source_url: str, source_title: str,
                 snippet: str, query: str, sources: Optional[List] = None,
                 meta: Optional[Dict] = None, answer: Optional[str] = None,
                 pattern: Optional[str] = None) -> Dict:
    """Wrap an extracted value in a claim record with full provenance.

    Args:
        claim_type: Category of the claim (e.g. "birth_year", "position").
        claim_value: The extracted value (scalar or dict).
        source_url: URL of the primary source the snippet came from.
        source_title: Title/name of that primary source.
        snippet: Text surrounding the regex match, kept as evidence.
        query: The search query that produced the answer.
        sources: Full Linkup source list; the first five are recorded.
        meta: The ``_meta`` dict from search_linkup (timestamps, status, depth).
        answer: Complete answer text; hashed for integrity checks and used to
            locate the snippet's character position.
        pattern: Name of the regex pattern that matched.

    Returns:
        ``{"claim_type", "claim_value", "provenance"}`` — the provenance dict
        carries dual timestamps (Rule 35) and is flagged as machine-extracted,
        requiring human review.
    """
    ts = datetime.now(timezone.utc).isoformat()
    # Dual timestamps: when this statement was created vs. when the source
    # was retrieved (falls back to creation time when no meta is available).
    src_ts = meta.get("request_ts", ts) if meta else ts

    prov = {
        "statement_created_at": ts, "source_archived_at": src_ts,
        "retrieval_agent": f"enrich_person_comprehensive.py v{SCRIPT_VERSION}",
        "retrieval_method": "linkup_web_search", "api_endpoint": LINKUP_API_URL,
        "search_query": query, "search_depth": meta.get("depth", "standard") if meta else "standard",
        "source_url": source_url, "source_title": source_title, "source_snippet": snippet,
        "extraction_method": "regex_pattern_matching", "pattern_type": pattern,
        "verified": False, "verification_status": "machine_extracted", "requires_human_review": True,
        "http_status": meta.get("status") if meta else None,
    }

    # Locate the snippet inside the full answer text (first 50 chars used as
    # the search anchor; the recorded span uses the full snippet length).
    if answer and snippet:
        pos = answer.find(snippet[:50])
        if pos >= 0:
            prov["answer_position"] = f"answer[{pos}:{pos + len(snippet)}]"

    if sources:
        prov["all_sources"] = [{"url": s.get("url", ""), "name": s.get("name", "")} for s in sources[:5]]
        prov["source_count"] = len(sources)

    # A short content hash of the whole answer enables later drift detection.
    if answer:
        prov["answer_content_hash"] = hashlib.sha256(answer.encode()).hexdigest()[:16]

    return {"claim_type": claim_type, "claim_value": claim_value, "provenance": prov}
|
||
|
||
|
||
def extract_birth_year(text):
    """Scan free text for a plausible birth year (1900-2010).

    Returns {"year", "snippet", "pattern_type"} for the first accepted match,
    or None when nothing plausible is found.
    """
    if not text:
        return None
    candidates = [
        (r'born\s+(?:on\s+)?(\d{1,2}\s+\w+\s+)?(\d{4})', "full_date"),
        (r'born\s+(?:on\s+)?(\w+\s+\d{1,2},?\s+)(\d{4})', "us_date"),
        (r'(?:was\s+)?born\s+in\s+(\d{4})', "born_in"),
        (r'geboren\s+(?:in\s+)?(\d{4})', "dutch"),
        (r'\(born\s+(\d{4})\)', "paren"),
        (r'\((\d{4})\)', "year_paren"),
    ]
    for regex, kind in candidates:
        match = re.search(regex, text, re.I)
        if not (match and match.lastindex):
            continue
        year = int(match.group(match.lastindex))
        if not (1900 <= year <= 2010):
            continue
        # A bare "(YYYY)" from 1990 on is more likely a citation year than a
        # birth year, so that pattern is rejected in the modern range.
        if kind == "year_paren" and year >= 1990:
            continue
        return {
            "year": year,
            "snippet": text[max(0, match.start() - 40):match.end() + 40].strip(),
            "pattern_type": kind,
        }
    return None
|
||
|
||
|
||
def extract_birth_location(text):
    """Return the first capitalised birth-place mention, or None."""
    patterns = (
        r'born\s+in\s+([A-Z][a-zA-Z\s,]+)',
        r'geboren\s+(?:te|in)\s+([A-Z][a-zA-Z\s]+)',
    )
    for regex in patterns:
        # Deliberately case-sensitive: place names start with a capital.
        match = re.search(regex, text)
        if not match:
            continue
        place = match.group(1).strip()
        # Reject bare articles that slipped through the capital-letter filter.
        if place.lower() in ('the', 'a', 'an'):
            continue
        return {
            "location": place,
            "snippet": text[max(0, match.start() - 30):match.end() + 30].strip(),
        }
    return None
|
||
|
||
|
||
def extract_death_info(text):
    """Scan text for a death year between 1900 and the current year.

    Returns {"year", "snippet"} for the first plausible match, else None.

    Fix: the "died ..." pattern previously required the year (or a full date)
    to follow "died" directly, so the very common phrasing "died in 1999"
    never matched; an optional "in " is now accepted alongside "on ".
    """
    patterns = [
        # "died 1999", "died on 3 March 1999", and (fix) "died in 1999"
        r'died\s+(?:on\s+|in\s+)?(?:\d{1,2}\s+\w+\s+)?(\d{4})',
        # life-span range: "(1920-1999)" / "(1920–1999)"
        r'\(\d{4}\s*[-–]\s*(\d{4})\)',
        r'passed\s+away\s+(?:in\s+)?(\d{4})',
        r'overleden\s+(?:in\s+)?(\d{4})',  # Dutch
    ]
    for pat in patterns:
        m = re.search(pat, text, re.I)
        if m:
            yr = int(m.group(1))
            if 1900 <= yr <= datetime.now().year:
                return {"year": yr, "snippet": text[max(0, m.start() - 30):m.end() + 30].strip()}
    return None
|
||
|
||
|
||
def extract_education(text):
    """Extract education records (degree type, institution, optional year).

    Returns a list of {"type", "institution", "year", "snippet"} dicts.

    Fixes two defects of the earlier version:
    - The lazy institution groups were followed by fully-optional year groups,
      so the regex engine matched the shortest possible prefix (e.g. "Le" for
      "Leiden University").  A punctuation/end-of-string lookahead now forces
      the institution to extend to a sentence boundary or the year.
    - The year in "graduated from X in YYYY" is capture group 2, but the code
      always read group 3 and silently dropped it; each pattern now declares
      its own institution/year group indices.
    """
    edu = []
    # (pattern, degree type, institution group index, year group index or None)
    specs = [
        (r'(Ph\.?D\.?|doctorate)\s+(?:from|at)\s+([A-Z][^,\.]+?)(?:\s+in\s+(\d{4}))?(?=\s*[,\.]|\s*$)',
         "phd", 2, 3),
        (r"(master'?s?|M\.?A\.?)\s+(?:from|at)\s+([A-Z][^,\.]+)", "masters", 2, None),
        (r'graduated\s+from\s+([A-Z][^,\.]+?)(?:\s+in\s+(\d{4}))?(?=\s*[,\.]|\s*$)',
         "graduated", 1, 2),
        (r'studied\s+(?:\w+\s+)?at\s+([A-Z][^,\.]+)', "studied", 1, None),
    ]
    for pat, etype, inst_g, yr_g in specs:
        for m in re.finditer(pat, text, re.I):
            inst = m.group(inst_g)
            yr = None
            if yr_g is not None and m.lastindex and m.lastindex >= yr_g and m.group(yr_g):
                try:
                    yr = int(m.group(yr_g))
                except ValueError:
                    pass
            edu.append({"type": etype, "institution": inst.strip(), "year": yr,
                        "snippet": text[max(0, m.start() - 20):m.end() + 20].strip()})
    return edu
|
||
|
||
|
||
def extract_positions(text):
    """Extract held positions (title, organization, optional start year).

    Returns a list of {"title", "organization", "year", "snippet"} dicts.

    Fix: the organization group was greedy (``[^,\\.]{3,50}``) and, since it
    admits spaces and digits, swallowed a trailing "since YYYY" / "in YYYY",
    so the year group could never match and the organization was polluted.
    The group is now lazy with a punctuation/end lookahead, letting the year
    group claim its text.
    """
    pos = []
    patterns = [
        r'(professor|director|curator|head|chief)\s+(?:at|of)\s+'
        r'([A-Z][^,\.]{3,50}?)(?:\s+since\s+(\d{4}))?(?=\s*[,\.]|\s*$)',
        r'appointed\s+(\w+)\s+(?:at\s+)?'
        r'([A-Z][^,\.]{3,50}?)(?:\s+in\s+(\d{4}))?(?=\s*[,\.]|\s*$)',
    ]
    for pat in patterns:
        for m in re.finditer(pat, text, re.I):
            org = m.group(2).strip() if m.lastindex >= 2 and m.group(2) else None
            yr = None
            if m.lastindex and m.lastindex >= 3 and m.group(3):
                try:
                    yr = int(m.group(3))
                except ValueError:
                    pass
            pos.append({"title": m.group(1), "organization": org, "year": yr,
                        "snippet": text[max(0, m.start() - 20):m.end() + 20].strip()})
    return pos
|
||
|
||
|
||
def extract_publications(text):
    """Extract publication mentions: quoted titles and dated publications."""
    found = []
    specs = (
        (r'(?:author|wrote|published)\s+(?:of\s+)?["\']([^"\']+)["\']', "book"),
        (r'published\s+["\']?([^"\',.]+)["\']?\s+(?:in\s+)?(\d{4})', "publication"),
    )
    for regex, kind in specs:
        for hit in re.finditer(regex, text, re.I):
            # Only the "publication" pattern carries a year group.
            year = int(hit.group(2)) if hit.lastindex >= 2 and hit.group(2) else None
            found.append({
                "type": kind,
                "title": hit.group(1).strip(),
                "year": year,
                "snippet": text[max(0, hit.start() - 20):hit.end() + 20].strip(),
            })
    return found
|
||
|
||
|
||
def extract_awards(text):
    """Extract award and fellowship mentions from free text."""
    results = []
    specs = (
        (r'(?:received|awarded|won)\s+(?:the\s+)?([A-Z][^,\.]{5,50})', "award"),
        (r'Fellow\s+of\s+(?:the\s+)?([A-Z][^,\.]{5,50})', "fellowship"),
    )
    for regex, kind in specs:
        for hit in re.finditer(regex, text, re.I):
            results.append({
                "type": kind,
                "name": hit.group(1).strip(),
                "snippet": text[max(0, hit.start() - 20):hit.end() + 20].strip(),
            })
    return results
|
||
|
||
|
||
def extract_contacts(text):
    """Extract contact details: emails, social handles, profile URLs, phones.

    Results are deduplicated on (type, normalised value) — phone numbers are
    normalised by stripping separators first — and handle-like matches are
    filtered against blocklists of words that commonly follow an "@" without
    being real handles.
    """
    results = []
    seen = set()  # "(type):(normalised value)" keys already processed

    # Words that frequently appear after "@" but are not social handles.
    handle_noise = {'handle', 'handles', 'profile', 'profiles', 'account', 'accounts',
                    'found', 'available', 'not', 'no', 'or', 'and', 'the', 'is', 'are',
                    'was', 'were', 'has', 'have', 'with', 'for', 'o', 'a', 'example',
                    'gmail', 'outlook', 'yahoo', 'hotmail', 'email', 'mail'}
    insta_noise = handle_noise | {'photos', 'videos', 'reels', 'stories'}

    specs = [
        # Email
        (r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b', "email"),
        # Twitter/X: bare @handle (not preceded by email-like chars) or profile URL
        (r'(?<![a-zA-Z0-9.@])@([a-zA-Z0-9_]{3,15})\b', "twitter"),
        (r'(?:twitter\.com|x\.com)/([a-zA-Z0-9_]{3,15})(?:\s|$|["\'\)\]])', "twitter"),
        # LinkedIn
        (r'(https?://(?:www\.)?linkedin\.com/in/[a-zA-Z0-9\-%]+/?)', "linkedin_url"),
        # ORCID (bare identifier or full URL)
        (r'(?:orcid)[:\s]*((?:\d{4}-){3}\d{3}[\dX])', "orcid"),
        (r'(https?://orcid\.org/(?:\d{4}-){3}\d{3}[\dX])', "orcid_url"),
        # Instagram
        (r'(?:instagram\.com)/([a-zA-Z0-9_.]{3,30})(?:\s|$|["\'\)\]])', "instagram"),
        # ResearchGate profile
        (r'(https?://(?:www\.)?researchgate\.net/profile/[a-zA-Z0-9_\-]+)', "researchgate_url"),
        # Academia.edu profile
        (r'(https?://[a-zA-Z0-9\-]+\.academia\.edu(?:/[a-zA-Z0-9_\-]+)?)', "academia_url"),
        # Google Scholar citations page
        (r'(https?://scholar\.google\.com/citations\?[^\s\)\"\']+)', "google_scholar_url"),
        # Phone numbers: labelled, Dutch (+31), and US/Canada (+1) formats
        (r'(?:phone|tel|telephone|fax)[:\s]*(\+?[0-9][0-9\s\-\(\)]{8,18}[0-9])', "phone"),
        (r'(?<!\d)(\+31[\s\-]?[0-9][\s\-]?[0-9]{3,4}[\s\-]?[0-9]{3,4})(?!\d)', "phone"),
        (r'(?<!\d)(\+1[\s\-]?\(?[0-9]{3}\)?[\s\-]?[0-9]{3}[\s\-]?[0-9]{4})(?!\d)', "phone"),
    ]

    for regex, kind in specs:
        for hit in re.finditer(regex, text, re.I):
            value = hit.group(1).strip().rstrip('/')
            # Phones are normalised (separators removed) before deduplication.
            canon = re.sub(r'[\s\-\(\)]', '', value) if kind == "phone" else value
            key = f"{kind}:{canon.lower()}"
            if key in seen:
                continue
            seen.add(key)
            # Drop well-known false positives.
            if kind == "email" and any(fake in value.lower() for fake in ['example.com', 'test.com']):
                continue
            if kind == "twitter" and value.lower() in handle_noise:
                continue
            if kind == "instagram" and value.lower() in insta_noise:
                continue
            if kind in ("twitter", "instagram") and len(value) < 3:
                continue
            results.append({"type": kind, "value": value,
                            "snippet": text[max(0, hit.start() - 30):hit.end() + 30].strip()})
    return results
|
||
|
||
|
||
def extract_media(text):
    """Extract media URLs: direct image files, video pages, Wikimedia uploads."""
    found = []
    specs = (
        (r'(https?://[^\s]+\.(?:jpg|jpeg|png|gif|webp))', "image_url"),
        (r'(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/|vimeo\.com/)[^\s]+)', "video_url"),
        (r'(https?://upload\.wikimedia\.org/[^\s]+)', "wikimedia_image"),
    )
    for regex, kind in specs:
        for hit in re.finditer(regex, text, re.I):
            found.append({
                "type": kind,
                "value": hit.group(1).strip(),
                "snippet": text[max(0, hit.start() - 30):hit.end() + 30].strip(),
            })
    return found
|
||
|
||
|
||
def extract_social(text):
    """Extract named interpersonal connections (spouse, parent, collaborator)."""
    found = []
    specs = (
        (r'(?:married|spouse|wife|husband)\s+(?:to\s+)?([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)', "spouse"),
        (r'(?:daughter|son)\s+of\s+([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)', "parent"),
        (r'(?:collaborated|worked)\s+with\s+([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)', "collaborator"),
    )
    for regex, kind in specs:
        for hit in re.finditer(regex, text, re.I):
            found.append({
                "relationship_type": kind,
                "related_person": hit.group(1).strip(),
                "snippet": text[max(0, hit.start() - 20):hit.end() + 20].strip(),
            })
    return found
|
||
|
||
|
||
def extract_interests(text):
    """Extract stated specializations, expertise areas, and known-for topics."""
    found = []
    specs = (
        (r'(?:specializes|specialized)\s+in\s+([^,\.]{5,60})', "specialization"),
        (r'expert\s+(?:in|on)\s+([^,\.]{5,60})', "expertise"),
        (r'known\s+for\s+(?:his|her|their\s+)?([^,\.]{5,80})', "known_for"),
    )
    for regex, kind in specs:
        for hit in re.finditer(regex, text, re.I):
            found.append({
                "type": kind,
                "topic": hit.group(1).strip(),
                "snippet": text[max(0, hit.start() - 20):hit.end() + 20].strip(),
            })
    return found
|
||
|
||
|
||
def extract_memberships(text):
    """Extract organization memberships and board positions."""
    found = []
    specs = (
        (r'member\s+of\s+(?:the\s+)?([A-Z][^,\.]{5,60})', "membership"),
        (r'(?:board\s+member|board\s+director)\s+(?:of\s+)?([A-Z][^,\.]{5,60})', "board_member"),
    )
    for regex, kind in specs:
        for hit in re.finditer(regex, text, re.I):
            found.append({
                "type": kind,
                "organization": hit.group(1).strip(),
                "snippet": text[max(0, hit.start() - 20):hit.end() + 20].strip(),
            })
    return found
|
||
|
||
|
||
def extract_nationalities(text):
    """Detect nationality adjectives directly qualifying a profession word."""
    found = []
    demonyms = ["Dutch", "German", "French", "British", "American", "Belgian", "Italian",
                "Spanish", "Australian", "Canadian", "Japanese", "Chinese", "Brazilian",
                "Mexican", "Russian"]
    # Only count a demonym when it immediately modifies a profession, e.g.
    # "Dutch art historian" — a bare country adjective is too ambiguous.
    regex = r'\b(' + '|'.join(demonyms) + r')\s+(?:art\s+)?(?:historian|curator|professor|director|artist)'
    for hit in re.finditer(regex, text, re.I):
        found.append({
            "nationality": hit.group(1).strip(),
            "snippet": text[max(0, hit.start() - 20):hit.end() + 20].strip(),
        })
    return found
|
||
|
||
|
||
def enrich_person(name: str, context: str, api_key: str) -> Dict:
    """Run four Linkup searches for *name* and collect provenance-backed claims.

    Searches (1s apart, to pace API calls): biography; education/career;
    publications/awards; contact/media.  Every extracted datum is wrapped via
    create_claim() so nothing is stored without a source (Rule 21).  Failed
    searches (an "error" key in the response) are silently skipped.

    Args:
        name: Person's full name (quoted verbatim in each query).
        context: Extra query context, e.g. a LinkedIn headline; only its first
            100 chars are recorded in the metadata.
        api_key: Linkup API key.

    Returns:
        {"web_claims": [claim, ...], "enrichment_metadata": {...}} — the
        metadata records timestamps, agent version, and all queries performed.
    """
    enrichment = {"web_claims": [], "enrichment_metadata": {
        "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
        "enrichment_agent": f"enrich_person_comprehensive.py v{SCRIPT_VERSION}",
        "person_name": name, "context_used": context[:100] if context else None,
        "searches_performed": [], "data_fabrication_check": "PASSED"}}

    # Search 1: Biography (birth/death, nationality, family connections)
    q1 = f'"{name}" born biography'
    r1 = search_linkup(q1, api_key)
    enrichment["enrichment_metadata"]["searches_performed"].append(q1)

    if "error" not in r1:
        ans, srcs = r1.get("answer", ""), r1.get("sources", [])
        # First source serves as the primary provenance reference; the full
        # list is attached by create_claim.
        url, title = (srcs[0].get("url", ""), srcs[0].get("name", "")) if srcs else ("", "")
        meta = r1.get("_meta", {})

        if ans:
            if (b := extract_birth_year(ans)):
                enrichment["web_claims"].append(create_claim("birth_year", b["year"], url, title, b["snippet"], q1, srcs, meta, ans, b.get("pattern_type")))
            if (l := extract_birth_location(ans)):
                enrichment["web_claims"].append(create_claim("birth_location", l["location"], url, title, l["snippet"], q1, srcs, meta, ans, "birth_location"))
            if (d := extract_death_info(ans)):
                enrichment["web_claims"].append(create_claim("death_year", d["year"], url, title, d["snippet"], q1, srcs, meta, ans, "death_year"))
            for n in extract_nationalities(ans):
                enrichment["web_claims"].append(create_claim("nationality", n["nationality"], url, title, n["snippet"], q1, srcs, meta, ans, "nationality"))
            for s in extract_social(ans):
                enrichment["web_claims"].append(create_claim("social_connection", {"relationship_type": s["relationship_type"], "related_person": s["related_person"]}, url, title, s["snippet"], q1, srcs, meta, ans, s["relationship_type"]))

    time.sleep(1.0)  # pacing between API calls

    # Search 2: Education / career (degrees, positions, memberships, interests)
    q2 = f'"{name}" {context} education career university'
    r2 = search_linkup(q2, api_key)
    enrichment["enrichment_metadata"]["searches_performed"].append(q2)

    if "error" not in r2:
        ans, srcs = r2.get("answer", ""), r2.get("sources", [])
        url, title = (srcs[0].get("url", ""), srcs[0].get("name", "")) if srcs else ("", "")
        meta = r2.get("_meta", {})

        if ans:
            for e in extract_education(ans):
                enrichment["web_claims"].append(create_claim("education", {"type": e["type"], "institution": e["institution"], "year": e["year"]}, url, title, e["snippet"], q2, srcs, meta, ans, e["type"]))
            for p in extract_positions(ans):
                enrichment["web_claims"].append(create_claim("position", {"title": p["title"], "organization": p["organization"], "year": p["year"]}, url, title, p["snippet"], q2, srcs, meta, ans, "position"))
            for m in extract_memberships(ans):
                enrichment["web_claims"].append(create_claim("membership", {"type": m["type"], "organization": m["organization"]}, url, title, m["snippet"], q2, srcs, meta, ans, m["type"]))
            for i in extract_interests(ans):
                enrichment["web_claims"].append(create_claim("interest", {"type": i["type"], "topic": i["topic"]}, url, title, i["snippet"], q2, srcs, meta, ans, i["type"]))

    time.sleep(1.0)

    # Search 3: Publications / awards
    q3 = f'"{name}" publications awards honors books'
    r3 = search_linkup(q3, api_key)
    enrichment["enrichment_metadata"]["searches_performed"].append(q3)

    if "error" not in r3:
        ans, srcs = r3.get("answer", ""), r3.get("sources", [])
        url, title = (srcs[0].get("url", ""), srcs[0].get("name", "")) if srcs else ("", "")
        meta = r3.get("_meta", {})

        if ans:
            for p in extract_publications(ans):
                enrichment["web_claims"].append(create_claim("publication", {"type": p["type"], "title": p["title"], "year": p["year"]}, url, title, p["snippet"], q3, srcs, meta, ans, p["type"]))
            for a in extract_awards(ans):
                enrichment["web_claims"].append(create_claim("award", {"type": a["type"], "name": a["name"]}, url, title, a["snippet"], q3, srcs, meta, ans, a["type"]))

    time.sleep(1.0)

    # Search 4: Contact details / media references
    q4 = f'"{name}" contact email twitter linkedin orcid profile photo'
    r4 = search_linkup(q4, api_key)
    enrichment["enrichment_metadata"]["searches_performed"].append(q4)

    if "error" not in r4:
        ans, srcs = r4.get("answer", ""), r4.get("sources", [])
        url, title = (srcs[0].get("url", ""), srcs[0].get("name", "")) if srcs else ("", "")
        meta = r4.get("_meta", {})

        if ans:
            for c in extract_contacts(ans):
                enrichment["web_claims"].append(create_claim("contact_detail", {"type": c["type"], "value": c["value"]}, url, title, c["snippet"], q4, srcs, meta, ans, c["type"]))
            for m in extract_media(ans):
                enrichment["web_claims"].append(create_claim("media_reference", {"type": m["type"], "value": m["value"]}, url, title, m["snippet"], q4, srcs, meta, ans, m["type"]))

    return enrichment
|
||
|
||
|
||
def process_ppid_file(filepath: Path, api_key: str, dry_run: bool = False) -> Dict:
    """Enrich a single PPID person JSON file in place.

    Skips profiles without a usable name or without heritage relevance.  New
    web claims are appended after deduplication on (claim_type, claim_value);
    an enrichment_history entry is always recorded (even on zero claims) so
    the same profile is not re-queried later.  With dry_run=True nothing is
    written back to disk.

    Returns a small status dict: {"status": "skipped"|"no_claims_found"|
    "enriched", ...}.
    """
    with open(filepath) as f:
        data = json.load(f)

    name_data = data.get("name", {})
    full_name = name_data.get("full_name") or name_data.get("display_name", "")
    # "LinkedIn Member" is the placeholder LinkedIn shows for hidden profiles.
    if not full_name or full_name == "LinkedIn Member":
        return {"status": "skipped", "reason": "no_valid_name"}

    if not data.get("heritage_relevance", {}).get("is_heritage_relevant"):
        return {"status": "skipped", "reason": "not_heritage_relevant"}

    # The headline gives the search extra disambiguating context.
    headline = data.get("profile_data", {}).get("headline", "")
    enrichment = enrich_person(full_name, headline, api_key)

    if not enrichment["web_claims"]:
        if not dry_run:
            # Record the attempt even when empty, so the file is not retried.
            if "enrichment_history" not in data: data["enrichment_history"] = []
            enrichment["enrichment_metadata"]["result"] = "no_claims_found"
            data["enrichment_history"].append(enrichment["enrichment_metadata"])
            with open(filepath, "w") as f: json.dump(data, f, indent=2, ensure_ascii=False)
        return {"status": "no_claims_found", "name": full_name}

    if not dry_run:
        # Merge new claims, skipping any (type, value) pair already stored.
        if "web_claims" not in data: data["web_claims"] = []
        existing = {(c.get("claim_type"), str(c.get("claim_value"))) for c in data.get("web_claims", [])}
        for claim in enrichment["web_claims"]:
            key = (claim["claim_type"], str(claim["claim_value"]))
            if key not in existing: data["web_claims"].append(claim)

        if "enrichment_history" not in data: data["enrichment_history"] = []
        data["enrichment_history"].append(enrichment["enrichment_metadata"])

        # Promote the first birth_year claim into birth_date, but only when
        # the current value is missing or uncertain ("XXXX" / trailing "X"
        # marks unknown digits in EDTF).  NOTE(review): assumes birth_date,
        # when present, is a dict with an "edtf" key — confirm against the
        # PPID schema.
        birth_claims = [c for c in enrichment["web_claims"] if c["claim_type"] == "birth_year"]
        if birth_claims:
            current = data.get("birth_date", {}).get("edtf", "XXXX")
            if current == "XXXX" or current.endswith("X"):
                prov = birth_claims[0]["provenance"]
                # Copy only the provenance fields relevant to the stored date.
                data["birth_date"] = {"edtf": str(birth_claims[0]["claim_value"]), "precision": "year",
                                      "provenance": {k: prov[k] for k in ["statement_created_at", "source_archived_at",
                                                                          "retrieval_agent", "retrieval_method", "source_url", "source_title",
                                                                          "source_snippet", "search_query", "extraction_method"] if k in prov}}
                data["birth_date"]["provenance"]["verified"] = False
                data["birth_date"]["provenance"]["verification_status"] = "machine_extracted"

        # Any death_year claim flips the living flag.
        if [c for c in enrichment["web_claims"] if c["claim_type"] == "death_year"]:
            data["is_living"] = False

        with open(filepath, "w") as f: json.dump(data, f, indent=2, ensure_ascii=False)

    return {"status": "enriched", "name": full_name, "claims_added": len(enrichment["web_claims"]),
            "claim_types": list(set(c["claim_type"] for c in enrichment["web_claims"]))}
|
||
|
||
|
||
def main():
    """CLI entry point: scan PPID person files and enrich the top candidates.

    Candidates are heritage-relevant profiles with a usable name and no prior
    enrichment attempt, ranked by a simple headline keyword score
    (professor > director/curator > museum/archive).  Processes at most
    --limit profiles; --dry-run performs searches without writing files.

    Fixes vs. the previous version: the candidate-scan loop used a bare
    ``except:`` (which also swallowed KeyboardInterrupt/SystemExit); the
    summary now includes the skipped count; placeholder-free f-strings
    dropped.
    """
    parser = argparse.ArgumentParser(description="Comprehensive person profile enrichment")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    try:
        api_key = get_linkup_api_key()
        print("✓ Linkup API key loaded")
    except ValueError as e:
        print(f"✗ {e}")
        return

    ppid_dir = Path(__file__).parent.parent / "data" / "person"
    if not ppid_dir.exists():
        print("✗ PPID directory not found")
        return

    print("Scanning for candidates...")
    candidates = []
    for f in ppid_dir.glob("ID_*.json"):
        try:
            with open(f) as fp:
                data = json.load(fp)
            if not data.get("heritage_relevance", {}).get("is_heritage_relevant"):
                continue
            if data.get("enrichment_history"):
                continue  # already enriched (or attempted) once

            name = data.get("name", {}).get("full_name", "")
            if not name or name == "LinkedIn Member":
                continue

            # Priority score from headline keywords.
            headline = data.get("profile_data", {}).get("headline", "").lower()
            score = 0
            if "professor" in headline: score += 3
            if "director" in headline: score += 2
            if "curator" in headline: score += 2
            if "museum" in headline: score += 1
            if "archive" in headline: score += 1
            candidates.append((f, score, name))
        except Exception:
            # Skip unreadable/malformed files (was a bare `except:`, which
            # also swallowed KeyboardInterrupt).
            continue

    candidates.sort(key=lambda x: -x[1])
    print(f"Found {len(candidates)} candidates")

    stats = {"enriched": 0, "no_claims_found": 0, "skipped": 0, "errors": 0}
    results = []

    for i, (filepath, score, _) in enumerate(candidates[:args.limit]):
        print(f"\n[{i+1}/{args.limit}] {filepath.name} (score={score})")
        try:
            result = process_ppid_file(filepath, api_key, args.dry_run)
            status = result.get("status", "errors")
            stats[status] = stats.get(status, 0) + 1
            if result["status"] == "enriched":
                print(f" ✓ Added {result['claims_added']} claims: {result['claim_types']}")
                results.append(result)
            elif result["status"] == "no_claims_found":
                print(f" ✗ No claims found for {result.get('name')}")
            time.sleep(4.0)  # rate-limit between profiles
        except Exception as e:
            print(f" ✗ Error: {e}")
            stats["errors"] += 1

    print(f"\n{'='*50}\nSUMMARY\n{'='*50}")
    print(f"Enriched: {stats['enriched']}, No claims: {stats['no_claims_found']}, "
          f"Skipped: {stats['skipped']}, Errors: {stats['errors']}")
    if results:
        print(f"\nTotal claims added: {sum(r['claims_added'] for r in results)}")
|
||
|
||
|
||
# Run as a script: delegate to the argparse-driven CLI entry point.
if __name__ == "__main__":
    main()
|