glam/scripts/enrich_ppids_web.py
kempersc dd0ee2cf11 feat(scripts): expand university location mappings and add web enrichment
- enrich_ppids.py: Add 40+ Dutch universities and hogescholen to location mapping
- enrich_ppids_web.py: New script for web-based PPID enrichment
- resolve_pending_known_orgs.py: Updates for pending org resolution
2026-01-09 21:10:14 +01:00

579 lines
18 KiB
Python

#!/usr/bin/env python3
"""
PPID Web Enrichment Script
Enriches PPID files with web-sourced claims using Exa AI and Linkup search.
Adds proper provenance statements per Rules 6, 26, and 35.
Enrichment targets:
1. Birth date/year - Search for biographical information
2. Publications - ORCID, Google Scholar, ResearchGate
3. News mentions - Press coverage, interviews
4. Wikidata entity - Authority file linking
5. Institutional affiliations - Verify current roles
All web claims include:
- source_url: Where the data was found
- retrieved_on: ISO 8601 timestamp
- retrieval_agent: Tool used (exa_web_search, linkup_search, etc.)
- claim_type: Type of claim (birth_date, publication, news_mention, etc.)
- claim_value: The extracted value
- provenance: Full provenance chain per Rule 35
Usage:
python scripts/enrich_ppids_web.py --limit 10 --verbose
python scripts/enrich_ppids_web.py --dry-run --sample stefankulk
"""
import json
import os
import re
import sys
import time
import argparse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, List, Any, Tuple
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
def create_web_claim(
    claim_type: str,
    claim_value: str,
    source_url: str,
    retrieval_agent: str,
    confidence: str = "medium",
    notes: Optional[str] = None,
    raw_response: Optional[Dict] = None
) -> Dict[str, Any]:
    """
    Build a web-sourced claim dict with full provenance per Rules 6, 26, and 35.

    Args:
        claim_type: Type of claim (birth_date, publication, news_mention, etc.)
        claim_value: The extracted value
        source_url: URL where the data was found
        retrieval_agent: Tool used (exa_web_search, linkup_search, etc.)
        confidence: Confidence level (high, medium, low, very_low)
        notes: Additional notes about the claim
        raw_response: Raw API response; a 500-char snippet is kept for audit

    Returns:
        Dict with claim structure per Rule 26
    """
    # One timestamp for all provenance fields: API responses are considered
    # archived at the moment the statement is created.
    timestamp = datetime.now(timezone.utc).isoformat()
    provenance: Dict[str, Any] = {
        "statement_created_at": timestamp,
        "source_archived_at": timestamp,
        "retrieval_method": retrieval_agent,
    }
    claim: Dict[str, Any] = {
        "claim_type": claim_type,
        "claim_value": claim_value,
        "source_url": source_url,
        "retrieved_on": timestamp,
        "retrieval_agent": retrieval_agent,
        "confidence": confidence,
        "provenance": provenance,
    }
    if notes:
        claim["notes"] = notes
    if raw_response:
        # Snippet only, not the full response, to keep file size down.
        provenance["response_snippet"] = str(raw_response)[:500]
    return claim
def extract_birth_year_from_text(text: str, full_name: str) -> Optional[Tuple[str, str]]:
    """
    Extract a birth year from free text using several heuristic patterns.

    Fixes over the naive version: word boundaries so "reborn 1990" no longer
    matches the "born YYYY" pattern, and a plausibility check so implausible
    years (e.g. 2999) are rejected.

    Args:
        text: Search-result text to scan.
        full_name: Person's full name; the last name must appear in the text,
            otherwise the text is assumed to be about someone else.

    Returns:
        Tuple of (birth_year_edtf, extraction_note) or None.
    """
    if not text:
        return None

    # Normalize text
    text_lower = text.lower()
    name_parts = full_name.lower().split()
    last_name = name_parts[-1] if name_parts else ""

    # Basic relevance check: skip text that never mentions the last name.
    if last_name and last_name not in text_lower:
        return None

    current_year = datetime.now().year

    def _plausible(year_str: str) -> bool:
        # Reject years that cannot be a modern person's birth year.
        return 1850 <= int(year_str) <= current_year

    # Pattern 1: "born in YYYY" or "born YYYY" (\b avoids matching "reborn").
    born_match = re.search(r'\bborn\s+(?:in\s+)?(\d{4})\b', text_lower)
    if born_match and _plausible(born_match.group(1)):
        year = born_match.group(1)
        return (year, f"Extracted from 'born {year}' pattern")

    # Pattern 2: "(YYYY - )" or "(YYYY-)" lifespan notation indicating birth year.
    birth_dash_match = re.search(r'\((\d{4})\s*[-–—]\s*\)', text)
    if birth_dash_match and _plausible(birth_dash_match.group(1)):
        year = birth_dash_match.group(1)
        return (year, f"Extracted from '({year} - )' lifespan pattern")

    # Pattern 3: "b. YYYY" abbreviation (also "born YYYY" with no gap words).
    b_match = re.search(r'(?:\bb\.|\bborn\b)\s*(\d{4})\b', text_lower)
    if b_match and _plausible(b_match.group(1)):
        year = b_match.group(1)
        return (year, f"Extracted from 'b. {year}' pattern")

    # Pattern 4: age statements "X years old" (or Dutch "X jaar oud").
    age_match = re.search(r'(\d{1,2})\s*(?:years?\s*old|jaar\s*oud)', text_lower)
    if age_match:
        age = int(age_match.group(1))
        if 20 <= age <= 100:  # Reasonable adult age range
            estimated_birth = current_year - age
            # EDTF "~" qualifier marks the year as approximate.
            return (f"{estimated_birth}~", f"Estimated from age {age} (approximate)")

    # Pattern 5: explicit birthday labels (English and Dutch "geboren").
    birthday_match = re.search(
        r'(?:birthday|geboren|date of birth)[:\s]+(?:\w+\s+\d{1,2},?\s+)?(\d{4})',
        text_lower
    )
    if birthday_match and _plausible(birthday_match.group(1)):
        year = birthday_match.group(1)
        return (year, "Extracted from birthday/geboren pattern")

    return None
def extract_publications_from_text(text: str, full_name: str) -> List[Dict[str, str]]:
    """
    Extract publication identifiers (DOIs, ORCID) from search-result text.

    Fixes over the naive version: trailing sentence punctuation is stripped
    from DOIs (the regex's [^\\s]+ tail otherwise captures "." or ")"), and
    duplicate DOIs are only reported once.

    Args:
        text: Search-result text to scan.
        full_name: Person's full name (kept for interface compatibility;
            currently unused by the extraction heuristics).

    Returns:
        List of publication dicts with "type", "value", and "note" keys.
    """
    publications: List[Dict[str, str]] = []
    if not text:
        return publications

    # DOI pattern; [^\s]+ grabs adjacent punctuation, so strip it afterwards.
    seen_dois = set()
    for raw_doi in re.findall(r'10\.\d{4,}/[^\s]+', text):
        doi = raw_doi.strip().rstrip('.,;:)]}>"\'')
        if doi in seen_dois:
            continue
        seen_dois.add(doi)
        publications.append({
            "type": "doi",
            "value": doi,
            "note": "DOI found in search results"
        })
        if len(publications) >= 5:  # Limit to 5 DOIs to keep claims manageable
            break

    # ORCID identifier (final character may be the "X" checksum letter).
    orcid_match = re.search(r'orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])', text)
    if orcid_match:
        publications.append({
            "type": "orcid",
            "value": orcid_match.group(1),
            "note": "ORCID identifier found"
        })
    return publications
def search_birth_date_exa(full_name: str, context_hints: List[str] = None) -> Optional[Dict]:
    """
    Build an Exa AI query spec for finding a person's birth date.

    Note: only the query is constructed here; the actual search is executed
    via MCP tools (exa_web_search_exa) in the calling context.
    """
    terms = [f'"{full_name}"', "born", "birthday"]
    # At most two context hints to keep the query focused.
    terms.extend((context_hints or [])[:2])
    return {
        "query": " ".join(terms),
        "tool": "exa_web_search_exa",
        "status": "pending_mcp_call",
    }
def search_publications_exa(full_name: str, institution: str = None) -> Optional[Dict]:
    """
    Build an Exa AI query spec for finding a person's publications.

    Combines the quoted name, an optional institution, and publication
    keywords; execution happens later via MCP tools.
    """
    terms = [f'"{full_name}"']
    if institution:
        terms.append(institution)
    terms.extend(["publications", "research", "ORCID"])
    return {
        "query": " ".join(terms),
        "tool": "exa_web_search_exa",
        "status": "pending_mcp_call",
    }
def search_news_mentions_exa(full_name: str, institution: str = None) -> Optional[Dict]:
    """
    Build an Exa AI query spec for finding news/press mentions of a person.

    The institution (when known) disambiguates common names; execution
    happens later via MCP tools.
    """
    query = f'"{full_name}"'
    if institution:
        query = f"{query} {institution}"
    return {
        "query": query,
        "tool": "exa_web_search_exa",
        "status": "pending_mcp_call",
    }
def get_person_context(ppid_data: Dict) -> Dict[str, Any]:
    """
    Pull searchable context out of PPID data for building better queries.

    Returns:
        Dict with full_name, institutions, roles, location, linkedin_url,
        and skills keys (institutions/roles deduplicated, capped at 5).
    """
    context: Dict[str, Any] = {
        "full_name": ppid_data.get("name", {}).get("full_name", ""),
        "institutions": [],
        "roles": [],
        "location": None,
        "linkedin_url": None,
        "skills": [],
    }

    profile = ppid_data.get("profile_data", {})
    if profile:
        context["linkedin_url"] = profile.get("linkedin_url")
        context["location"] = profile.get("location")
        context["skills"] = profile.get("skills", [])[:10]  # Top 10 skills

    # Collect employers and job titles from work experience.
    for exp in profile.get("experience", []) or []:
        if not exp:
            continue
        if exp.get("company"):
            context["institutions"].append(exp["company"])
        if exp.get("title"):
            context["roles"].append(exp["title"])

    # Collect schools from education history.
    for edu in profile.get("education", []) or []:
        if edu and edu.get("institution"):
            context["institutions"].append(edu["institution"])

    # Order-preserving dedup, capped to keep downstream queries short.
    context["institutions"] = list(dict.fromkeys(context["institutions"]))[:5]
    context["roles"] = list(dict.fromkeys(context["roles"]))[:5]
    return context
def build_enrichment_queries(ppid_data: Dict) -> List[Dict[str, Any]]:
    """
    Build the list of web-enrichment query specs for one PPID.

    Returns:
        List of query spec dicts (type, query, tool, priority, ...) to be
        executed via MCP tools; empty when the person has no full name.
    """
    context = get_person_context(ppid_data)
    full_name = context["full_name"]
    if not full_name:
        return []

    queries: List[Dict[str, Any]] = []

    # 1. Birth date: only when unknown (EDTF "XXXX") and not attempted before.
    known_birth = ppid_data.get("birth_date", {}).get("edtf", "XXXX")
    already_tried = ppid_data.get("enrichment_metadata", {}) \
        .get("birth_date_search", {}).get("attempted")
    if known_birth == "XXXX" and not already_tried:
        hints = []
        if context["institutions"]:
            hints.append(context["institutions"][0])
        if context["location"]:
            hints.append(context["location"].split(",")[0])  # City part only
        queries.append({
            "type": "birth_date",
            "query": f'"{full_name}" born birthday biography',
            "context_hints": hints,
            "tool": "exa_web_search_exa",
            "priority": "high",
        })

    # 2. Publications: only when the person's roles look academic.
    role_text = " ".join(context["roles"]).lower()
    if any(kw in role_text for kw in
           ("professor", "researcher", "phd", "doctor", "lecturer", "scientist")):
        institution = context["institutions"][0] if context["institutions"] else ""
        queries.append({
            "type": "publications",
            "query": f'"{full_name}" {institution} publications ORCID research',
            "tool": "exa_web_search_exa",
            "priority": "medium",
        })

    # 3. News/press mentions: needs an institution to disambiguate the name.
    if context["institutions"]:
        queries.append({
            "type": "news_mentions",
            "query": f'"{full_name}" {context["institutions"][0]}',
            "tool": "exa_web_search_exa",
            "priority": "low",
        })

    # 4. Wikidata authority-file match: always attempted.
    queries.append({
        "type": "wikidata",
        "query": full_name,
        "tool": "wikidata_search_entity",
        "priority": "medium",
    })
    return queries
def process_search_result(
    result: Dict[str, Any],
    query_type: str,
    full_name: str,
    ppid_data: Dict
) -> List[Dict[str, Any]]:
    """
    Turn one raw search result into zero or more provenance-tagged web claims.

    Args:
        result: Raw search result from Exa/Linkup (dict or plain string).
        query_type: Which enrichment query produced it (birth_date, etc.).
        full_name: Person's full name, used for relevance checks.
        ppid_data: Current PPID data (kept for interface compatibility).

    Returns:
        List of claim dicts built via create_web_claim.
    """
    claims: List[Dict[str, Any]] = []
    if not result:
        return claims

    # Normalize the result into (text, source_url) regardless of its shape.
    text, source_url = "", ""
    if isinstance(result, dict):
        text = result.get("text", "") or result.get("content", "") or ""
        source_url = result.get("url", "") or result.get("source_url", "")
    elif isinstance(result, str):
        text = result

    if query_type == "birth_date":
        birth_info = extract_birth_year_from_text(text, full_name)
        if birth_info:
            year, note = birth_info
            # "~" marks an age-derived estimate -> lower confidence.
            claims.append(create_web_claim(
                claim_type="birth_year",
                claim_value=year,
                source_url=source_url,
                retrieval_agent="exa_web_search_exa",
                confidence="low" if "~" in year else "medium",
                notes=note,
                raw_response={"text_snippet": text[:200]}
            ))
    elif query_type == "publications":
        for pub in extract_publications_from_text(text, full_name):
            claims.append(create_web_claim(
                claim_type=f"identifier_{pub['type']}",
                claim_value=pub["value"],
                source_url=source_url,
                retrieval_agent="exa_web_search_exa",
                confidence="high" if pub["type"] in ("doi", "orcid") else "medium",
                notes=pub.get("note")
            ))
    elif query_type == "news_mentions":
        # Only record the mention when the name actually appears in the text.
        if full_name.lower() in text.lower():
            claims.append(create_web_claim(
                claim_type="news_mention",
                claim_value=text[:500],  # First 500 chars
                source_url=source_url,
                retrieval_agent="exa_web_search_exa",
                confidence="medium",
                notes="News/press mention found"
            ))
    return claims
def enrich_ppid_file(
    filepath: Path,
    dry_run: bool = False,
    verbose: bool = False
) -> Dict[str, Any]:
    """
    Build web-enrichment queries for one PPID file.

    Queries are NOT executed here; they are returned as "pending_queries"
    for execution via MCP tools in the calling context.

    Args:
        filepath: Path to the PPID JSON file.
        dry_run: Accepted for CLI symmetry; this function never writes.
        verbose: Print per-file query details.

    Returns:
        Dict with filepath, query counts, errors, and pending queries.
    """
    stats: Dict[str, Any] = {
        "filepath": str(filepath),
        "queries_built": 0,
        "claims_added": 0,
        "errors": [],
        "pending_queries": [],
    }

    try:
        with open(filepath, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except Exception as exc:
        # Unreadable/corrupt file: record the error and move on.
        stats["errors"].append(f"Failed to read file: {exc}")
        return stats

    pending = build_enrichment_queries(data)
    stats["queries_built"] = len(pending)
    stats["pending_queries"] = pending

    if verbose:
        print(f" Built {len(pending)} queries for {filepath.name}")
        for q in pending:
            print(f" - {q['type']}: {q['query'][:50]}...")
    return stats
def main():
    """
    CLI entry point: scan PPID files, build enrichment queries, and save
    pending queries for later execution via MCP tools.

    Improvements: the person directory is configurable via --person-dir
    (same default as before, so existing invocations are unchanged), file
    globs are sorted so --limit is deterministic across runs, --limit 0 now
    means "zero files" rather than "all files", and query types tolerate
    stray whitespace in the comma-separated list.
    """
    parser = argparse.ArgumentParser(
        description="Enrich PPID files with web-sourced claims (Rule 26 compliant)"
    )
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--limit", type=int, help="Process only N files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--sample", type=str, help="Process specific linkedin_slug")
    parser.add_argument(
        "--query-types",
        type=str,
        default="birth_date,publications,news_mentions,wikidata",
        help="Comma-separated list of query types to run"
    )
    parser.add_argument(
        "--person-dir",
        type=str,
        default="/Users/kempersc/apps/glam/data/person",
        help="Directory containing PPID JSON files"
    )
    args = parser.parse_args()

    person_dir = Path(args.person_dir)

    # Select PPID files; sorted so --limit picks a stable, reproducible subset.
    if args.sample:
        # Find file by linkedin slug (filenames embed the upper-cased slug).
        ppid_files = sorted(person_dir.glob(f"ID_*{args.sample.upper()}*.json"))
        if not ppid_files:
            # Fall back to a case-insensitive substring match on the stem.
            ppid_files = sorted(
                f for f in person_dir.glob("ID_*.json")
                if args.sample.lower() in f.stem.lower()
            )
    else:
        ppid_files = sorted(person_dir.glob("ID_*.json"))
    if args.limit is not None:  # "is not None" so --limit 0 means zero files
        ppid_files = ppid_files[:args.limit]

    print(f"Processing {len(ppid_files)} PPID files for web enrichment...")
    if args.dry_run:
        print("DRY RUN - no changes will be written")

    # Strip stray whitespace so "--query-types a, b" still works.
    query_types = {qt.strip() for qt in args.query_types.split(",") if qt.strip()}
    print(f"Query types: {query_types}")

    # Statistics
    total_stats = {
        "processed": 0,
        "queries_built": 0,
        "by_type": {qt: 0 for qt in query_types},
        "errors": 0,
    }
    all_pending_queries = []

    for i, filepath in enumerate(ppid_files):
        try:
            stats = enrich_ppid_file(filepath, dry_run=args.dry_run, verbose=args.verbose)
            total_stats["processed"] += 1
            total_stats["queries_built"] += stats["queries_built"]
            # Keep only queries of the requested types.
            for q in stats["pending_queries"]:
                if q["type"] in query_types:
                    total_stats["by_type"][q["type"]] += 1
                    all_pending_queries.append({
                        "filepath": stats["filepath"],
                        **q
                    })
            if stats["errors"]:
                total_stats["errors"] += 1
                if args.verbose:
                    print(f" ERROR {filepath.name}: {stats['errors']}")
            if (i + 1) % 100 == 0:
                print(f" Processed {i + 1}/{len(ppid_files)}...")
        except Exception as e:
            # Keep going on per-file failures; they are counted in the summary.
            total_stats["errors"] += 1
            if args.verbose:
                print(f" ERROR {filepath.name}: {e}")

    # Print summary
    print("\n" + "=" * 60)
    print("WEB ENRICHMENT QUERY SUMMARY")
    print("=" * 60)
    print(f"Processed: {total_stats['processed']}")
    print(f"Queries built: {total_stats['queries_built']}")
    print("By query type:")
    for qt, count in total_stats["by_type"].items():
        print(f" - {qt}: {count}")
    print(f"Errors: {total_stats['errors']}")

    # Output pending queries for MCP execution
    if all_pending_queries and not args.dry_run:
        output_file = person_dir.parent / "pending_web_queries.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump({
                "generated_at": datetime.now(timezone.utc).isoformat(),
                "total_queries": len(all_pending_queries),
                "queries": all_pending_queries
            }, f, indent=2, ensure_ascii=False)
        print(f"\nPending queries saved to: {output_file}")
        print("Execute these queries via MCP tools and run --apply-results to add claims.")

    print("\nNote: This script builds queries. Execute via MCP tools:")
    print(" - exa_web_search_exa for birth_date, publications, news_mentions")
    print(" - wikidata_search_entity for wikidata matching")
# Guard the CLI entry point so the module can be imported without side effects.
if __name__ == "__main__":
    main()