feat(api): Add heritage person classification and RAG retry logic

- Add GLAMORCUBESFIXPHDNT heritage type detection for person profiles
- Staged classification: blocklist non-heritage orgs and roles, then match heritage keywords
- Special handling for Digital (D) type: requires heritage org context
- Add career_history heritage_relevant and heritage_type fields
- Add exponential backoff retry for Anthropic API overload errors
- Fix DSPy 3.x async context with dspy.context() wrapper
This commit is contained in:
kempersc 2025-12-15 01:31:54 +01:00
parent 22709cc13e
commit 68c5aa2724
4 changed files with 707 additions and 27 deletions

View file

@ -35,9 +35,12 @@ from decimal import Decimal
from fastapi import FastAPI, HTTPException, Query, APIRouter
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import JSONResponse, Response
from pydantic import BaseModel, Field
import asyncpg
import httpx
import hashlib
from urllib.parse import urlparse
# ============================================================================
@ -155,6 +158,280 @@ class PersonDetail(BaseModel):
source_file: Optional[str]
# ============================================================================
# Heritage Classification (copied from main.py for experience item classification)
# ============================================================================
import re

# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy.
# Matching is case-insensitive substring matching against "role company";
# entries with a trailing space (e.g. 'KB ', 'IT ') approximate a word start.
HERITAGE_KEYWORDS = {
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}

# Role words that immediately disqualify a position (word-boundary matched).
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Organizations that are explicitly NOT heritage institutions
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]

# Heritage organization keywords - organizations that ARE heritage institutions
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]

# Precompiled word-boundary patterns for the two blocklists. Hoisted to
# module level so the ~100 regexes are escaped and compiled once per process
# instead of on every detect_heritage_type() call. Word boundaries avoid
# false positives such as "sharing" matching the 'ing ' bank entry.
_NON_HERITAGE_ORG_PATTERNS = [
    re.compile(r'\b' + re.escape(org.lower().strip()) + r'\b')
    for org in NON_HERITAGE_ORGANIZATIONS
]
_NON_HERITAGE_ROLE_PATTERNS = [
    re.compile(r'\b' + re.escape(keyword.lower().strip()) + r'\b')
    for keyword in NON_HERITAGE_KEYWORDS
]


def detect_heritage_type(role: Optional[str], company: Optional[str]) -> tuple:
    """
    Detect whether a position is heritage-relevant and, if so, which type.

    Classification stages, in order (first hit wins):
      1. Organization blocklist (NON_HERITAGE_ORGANIZATIONS) -> (False, None)
      2. Non-heritage role words (NON_HERITAGE_KEYWORDS)     -> (False, None)
      3. Typed heritage keywords, most specific types first  -> (True, <type>)
      4. 'D' (Digital) keywords, ONLY when the employer matches
         HERITAGE_ORGANIZATION_KEYWORDS — this prevents generic IT workers
         at banks/police from being classified as heritage -> (True, 'D')
      5. Generic cultural vocabulary                         -> (True, None)

    Args:
        role: Job title/role text (may be None).
        company: Company/organization name (may be None).

    Returns:
        Tuple of (heritage_relevant: bool, heritage_type: Optional[str]).
    """
    # Combine role and company for full context; matching is case-insensitive.
    combined = f"{role or ''} {company or ''}".lower()
    if not combined.strip():
        return (False, None)

    # Stage 1: explicitly non-heritage organizations (blocklist).
    for pattern in _NON_HERITAGE_ORG_PATTERNS:
        if pattern.search(combined):
            return (False, None)

    # Stage 2: non-heritage role indicators.
    for pattern in _NON_HERITAGE_ROLE_PATTERNS:
        if pattern.search(combined):
            return (False, None)

    # Stage 3: typed heritage keywords (order matters — more specific types
    # first; 'D' is deliberately excluded and handled separately below).
    type_order = ['A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E']
    for heritage_type in type_order:
        for keyword in HERITAGE_KEYWORDS.get(heritage_type, []):
            if keyword.lower() in combined:
                return (True, heritage_type)

    # Stage 4: 'D' (Digital) — only when the employer itself looks like a
    # heritage organization.
    is_heritage_org = any(
        org_keyword.lower() in combined
        for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS
    )
    if is_heritage_org:
        for keyword in HERITAGE_KEYWORDS.get('D', []):
            if keyword.lower() in combined:
                return (True, 'D')

    # Stage 5: generic heritage terms (relevant, but without a specific type).
    generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
               'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection']
    for keyword in generic:
        if keyword in combined:
            return (True, None)

    return (False, None)
def enrich_experience_with_heritage(experience: List) -> List[Dict]:
    """
    Return the experience list with heritage_relevant/heritage_type added.

    Accepts both dict items and JSON-string items (asyncpg may return jsonb
    array elements as raw strings). Items that cannot be parsed into a dict
    are dropped from the result.

    Args:
        experience: List of experience items (dicts or JSON strings)

    Returns:
        New list of dicts, each with heritage_relevant and heritage_type set.
    """
    if not experience:
        return []

    annotated: List[Dict] = []
    for raw in experience:
        item = raw
        # asyncpg sometimes hands back jsonb array elements as strings.
        if isinstance(item, str):
            try:
                item = json.loads(item)
            except json.JSONDecodeError:
                continue
        # Anything that still isn't a dict can't be classified — drop it.
        if not isinstance(item, dict):
            continue

        # Support both naming schemes used across the data sources.
        job_title = item.get('title') or item.get('role') or ''
        org_name = item.get('company') or item.get('organization') or ''
        relevant, h_type = detect_heritage_type(job_title, org_name)

        # Shallow-copy so the caller's input dicts are never mutated.
        annotated.append({**item, 'heritage_relevant': relevant, 'heritage_type': h_type})

    return annotated
def parse_jsonb_list(data) -> List:
    """
    Parse a jsonb list field from PostgreSQL.

    asyncpg returns jsonb columns in various forms:
    - Sometimes as a proper Python list with dict elements
    - Sometimes as a JSON string that needs parsing
    - Sometimes as a list where each element is a JSON string
    - Sometimes as a list where each element is a Python repr string
      (single quotes)

    This function handles all these cases. Note: the Python-repr fallback
    now applies at the top level too, so a whole-list repr string like
    "[{'a': 1}]" is parsed instead of silently becoming [].

    Args:
        data: Either a list, a string representing a list, or None

    Returns:
        Parsed list with all elements as proper Python objects
        (empty list if None or invalid)
    """
    import ast

    if data is None:
        return []

    # If it's a string, parse the whole thing: JSON first (double quotes),
    # then Python literal syntax — the same fallback order used per-element
    # below, so malformed single-quoted data is handled consistently.
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError:
            try:
                data = ast.literal_eval(data)
            except (ValueError, SyntaxError):
                return []

    # Whatever we parsed (or were given) must be a list.
    if not isinstance(data, list):
        return []

    result = []
    for item in data:
        if not isinstance(item, str):
            result.append(item)
            continue
        # Try JSON first (double quotes)
        try:
            result.append(json.loads(item))
            continue
        except json.JSONDecodeError:
            pass
        # Try Python literal (single quotes) - handles malformed data
        try:
            result.append(ast.literal_eval(item))
            continue
        except (ValueError, SyntaxError):
            pass
        # Keep as string if neither works (e.g., plain skill strings)
        result.append(item)

    return result
# ============================================================================
# Global State
# ============================================================================
@ -2000,11 +2277,11 @@ async def get_person(staff_id: str):
linkedin_url=row['linkedin_url'],
profile_image_url=row['profile_image_url'],
heritage_relevant=row['heritage_relevant'] if row['heritage_relevant'] is not None else True,
heritage_types=row['heritage_types'] if row['heritage_types'] else [],
experience=row['experience'] if row['experience'] else [],
education=row['education'] if row['education'] else [],
skills=row['skills'] if row['skills'] else [],
languages=row['languages'] if row['languages'] else [],
heritage_types=parse_jsonb_list(row['heritage_types']),
experience=enrich_experience_with_heritage(parse_jsonb_list(row['experience'])),
education=parse_jsonb_list(row['education']),
skills=parse_jsonb_list(row['skills']),
languages=parse_jsonb_list(row['languages']),
about=row['about'],
connections=row['connections'],
extraction_date=row['extraction_date'].isoformat() if row['extraction_date'] else None,
@ -2013,6 +2290,148 @@ async def get_person(staff_id: str):
)
# ============================================================================
# Image Proxy (Avoid Hotlinking Issues)
# ============================================================================
# In-memory cache for proxied images (simple TTL-based)
_image_cache: Dict[str, tuple] = {}  # hash -> (content, content_type, timestamp)
IMAGE_CACHE_TTL = 3600  # 1 hour

# Allowed image domains for security
ALLOWED_IMAGE_DOMAINS = {
    # Google Maps
    'lh3.googleusercontent.com',
    'lh4.googleusercontent.com',
    'lh5.googleusercontent.com',
    'lh6.googleusercontent.com',
    'maps.gstatic.com',
    'maps.googleapis.com',
    # Wikidata/Wikimedia
    'upload.wikimedia.org',
    'commons.wikimedia.org',
    # Institution domains (add as needed)
    # Generic patterns handled below
}


def is_allowed_image_url(url: str) -> bool:
    """Check if URL is from an allowed domain for proxying.

    Acts as the SSRF/abuse guard for the image proxy: only exact allowlisted
    hosts, *.nl / *.org / *.museum TLDs, and googleusercontent.com hosts pass.
    Returns False on any parse error.
    """
    try:
        parsed = urlparse(url)
        # hostname (unlike netloc) strips the port and any userinfo, so
        # "https://user@evil.com" is judged by the real host only.
        domain = (parsed.hostname or '').lower()
        # Check exact matches
        if domain in ALLOWED_IMAGE_DOMAINS:
            return True
        # Allow any .nl domain (Dutch institutions)
        if domain.endswith('.nl'):
            return True
        # Allow any .org domain (many heritage institutions)
        if domain.endswith('.org'):
            return True
        # Allow any .museum domain
        if domain.endswith('.museum'):
            return True
        # Google user content subdomains. Match exact host or a dotted
        # suffix — a plain substring test ('googleusercontent.com' in domain)
        # would also accept hosts like 'evilgoogleusercontent.com'.
        if domain == 'googleusercontent.com' or domain.endswith('.googleusercontent.com'):
            return True
        return False
    except Exception:
        return False
@app.get("/image-proxy")
async def proxy_image(url: str = Query(..., description="Image URL to proxy")):
    """
    Proxy external images to avoid hotlinking issues.

    Many external servers block direct embedding (hotlinking) of their images.
    This endpoint fetches the image server-side and returns it with proper headers.

    Features:
    - Validates URL is from allowed domains (security)
    - Caches images in memory for 1 hour (performance)
    - Sets proper Content-Type headers
    - Avoids CORS issues

    Usage: /image-proxy?url=https://example.com/logo.png

    Raises:
        HTTPException 400: malformed URL, or upstream content is not an image
        HTTPException 403: domain not in the proxy allowlist
        HTTPException 502: upstream returned non-200 or a request error occurred
        HTTPException 504: upstream fetch timed out
    """
    # Security: validate URL (scheme check first, then domain allowlist)
    if not url or not url.startswith(('http://', 'https://')):
        raise HTTPException(status_code=400, detail="Invalid URL")

    if not is_allowed_image_url(url):
        raise HTTPException(status_code=403, detail="Domain not allowed for proxying")

    # Check cache. MD5 is only a cache key here, not security-sensitive.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    if url_hash in _image_cache:
        content, content_type, timestamp = _image_cache[url_hash]
        # Serve only entries younger than the TTL; expired entries are left
        # in place and simply overwritten by the refetch below.
        if datetime.now().timestamp() - timestamp < IMAGE_CACHE_TTL:
            return Response(
                content=content,
                media_type=content_type,
                headers={
                    "Cache-Control": "public, max-age=3600",
                    "X-Proxy-Cache": "HIT",
                }
            )

    # Fetch image
    try:
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
            response = await client.get(
                url,
                headers={
                    # Spoof headers to avoid hotlink detection: pretend to be
                    # a regular browser, including a same-origin Referer.
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9,nl;q=0.8",
                    "Referer": urlparse(url).scheme + "://" + urlparse(url).netloc + "/",
                }
            )

            # Raised HTTPExceptions are not caught by the httpx handlers
            # below, so they propagate to the client as-is.
            if response.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Failed to fetch image: {response.status_code}")

            content = response.content
            content_type = response.headers.get("content-type", "image/png")

            # Validate it's actually an image (by upstream Content-Type)
            if not content_type.startswith("image/"):
                raise HTTPException(status_code=400, detail="URL does not point to an image")

            # Cache the result
            _image_cache[url_hash] = (content, content_type, datetime.now().timestamp())

            # Limit cache size (simple LRU-like cleanup): when the cache
            # exceeds 1000 entries, drop the oldest half by timestamp.
            if len(_image_cache) > 1000:
                # Remove oldest entries
                sorted_entries = sorted(_image_cache.items(), key=lambda x: x[1][2])
                for key, _ in sorted_entries[:500]:
                    del _image_cache[key]

            return Response(
                content=content,
                media_type=content_type,
                headers={
                    "Cache-Control": "public, max-age=3600",
                    "X-Proxy-Cache": "MISS",
                }
            )
    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Timeout fetching image")
    except httpx.RequestError as e:
        raise HTTPException(status_code=502, detail=f"Error fetching image: {str(e)}")
# ============================================================================
# Main
# ============================================================================

View file

@ -40,6 +40,171 @@ class Settings(BaseModel):
settings = Settings()
# ============================================================================
# Heritage Classification
# ============================================================================
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
# NOTE(review): this duplicates the classification tables in the API module
# (which notes its copy came from main.py) — keep the two in sync.
# Matching is case-insensitive substring matching; entries with a trailing
# space (e.g. 'KB ', 'IT ', 'data ') appear intended to approximate a word
# start and avoid matches inside longer words.
HERITAGE_KEYWORDS = {
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}

# Role words that disqualify a position outright (word-boundary matched by
# the classifier).
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Organizations that are explicitly NOT heritage institutions
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other (self-employment markers, not employers)
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]

# Heritage organization keywords - organizations that ARE heritage institutions.
# Used as the employer-context gate for the 'D' (Digital) type.
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]
def detect_heritage_type(role: Optional[str], company: Optional[str]) -> tuple:
    """
    Classify one position as heritage-relevant (and with which type).

    The check runs in stages, first hit wins: a blocklist of known
    non-heritage employers, then non-heritage role words, then typed
    heritage keywords (most specific types first). The 'D' (Digital) type
    is only awarded when the employer itself looks like a heritage
    organization, so generic IT staff at banks or police do not qualify.
    A final pass matches generic cultural vocabulary without a type.

    Args:
        role: Job title/role text
        company: Company/organization name

    Returns:
        Tuple of (heritage_relevant: bool, heritage_type: Optional[str])
    """
    import re

    # Case-insensitive matching over the combined role + employer text.
    combined = ' '.join(((role or ''), (company or ''))).lower()
    if not combined.strip():
        return (False, None)

    def _blocked(terms):
        # Word-boundary match so e.g. "sharing" does not trip the 'ing ' rule.
        return any(
            re.search(r'\b' + re.escape(term.lower().strip()) + r'\b', combined)
            for term in terms
        )

    # Stages 1 & 2: explicit non-heritage employers, then non-heritage roles.
    if _blocked(NON_HERITAGE_ORGANIZATIONS) or _blocked(NON_HERITAGE_KEYWORDS):
        return (False, None)

    # Typed heritage keywords; more specific types first, 'D' handled below.
    for letter in ('A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'):
        if any(kw.lower() in combined for kw in HERITAGE_KEYWORDS.get(letter, [])):
            return (True, letter)

    # 'D' (Digital) requires a heritage employer as context.
    employer_is_heritage = any(
        kw.lower() in combined for kw in HERITAGE_ORGANIZATION_KEYWORDS
    )
    if employer_is_heritage and any(
        kw.lower() in combined for kw in HERITAGE_KEYWORDS.get('D', [])
    ):
        return (True, 'D')

    # Generic cultural vocabulary, relevant but without a specific type.
    for keyword in ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
                    'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection']:
        if keyword in combined:
            return (True, None)

    return (False, None)
# ============================================================================
# Pydantic Models
# ============================================================================
@ -854,22 +1019,44 @@ async def get_profile(
if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile:
experience = inner_profile.get('experience', [])
if experience:
# Map field names: title→role, company→organization, duration→dates
# Map field names: title→role, company→organization, date_range→dates
# Also classify each position as heritage-relevant or not
career_history = []
for job in experience:
role = job.get('title')
company = job.get('company')
heritage_relevant, heritage_type = detect_heritage_type(role, company)
career_item = {
'role': job.get('title'),
'organization': job.get('company'),
'dates': job.get('duration'),
'role': role,
'organization': company,
'dates': job.get('date_range') or job.get('duration'), # date_range has year info
'location': job.get('location'),
'description': job.get('description'),
'company_size': job.get('company_details'),
'current': job.get('current', False),
'heritage_relevant': heritage_relevant,
'heritage_type': heritage_type,
}
career_history.append(career_item)
inner_profile['career_history'] = career_history
profile_data['profile_data'] = inner_profile
# Also add heritage classification to existing career_history entries that lack it
if inner_profile and 'career_history' in inner_profile:
career_history = inner_profile.get('career_history', [])
needs_update = False
for job in career_history:
if job.get('heritage_relevant') is None:
needs_update = True
role = job.get('role') or job.get('title')
company = job.get('organization') or job.get('company')
heritage_relevant, heritage_type = detect_heritage_type(role, company)
job['heritage_relevant'] = heritage_relevant
job['heritage_type'] = heritage_type
if needs_update:
inner_profile['career_history'] = career_history
profile_data['profile_data'] = inner_profile
return ProfileResponse(
profile_data=profile_data,
linkedin_slug=result['linkedin_slug'],
@ -892,24 +1079,44 @@ async def get_profile(
file_profile_data = data.get('profile_data', {})
# Transform experience → career_history for frontend compatibility
inner_profile = file_profile_data.get('profile_data', {})
# Handle both nested (profile_data.profile_data) and flat (profile_data) structures
nested_profile = file_profile_data.get('profile_data', {})
inner_profile = nested_profile if nested_profile else file_profile_data
if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile:
experience = inner_profile.get('experience', [])
if experience:
# Map field names: title→role, company→organization, date_range→dates
# Also classify each position as heritage-relevant or not
career_history = []
for job in experience:
role = job.get('title')
company = job.get('company')
heritage_relevant, heritage_type = detect_heritage_type(role, company)
career_item = {
'role': job.get('title'),
'organization': job.get('company'),
'dates': job.get('duration'),
'role': role,
'organization': company,
'dates': job.get('date_range') or job.get('duration'), # date_range has year info
'location': job.get('location'),
'description': job.get('description'),
'company_size': job.get('company_details'),
'current': job.get('current', False),
'heritage_relevant': heritage_relevant,
'heritage_type': heritage_type,
}
career_history.append(career_item)
inner_profile['career_history'] = career_history
file_profile_data['profile_data'] = inner_profile
# career_history is now in inner_profile which is either nested or file_profile_data directly
# Also add heritage classification to existing career_history entries that lack it
if inner_profile and 'career_history' in inner_profile:
career_history = inner_profile.get('career_history', [])
for job in career_history:
if job.get('heritage_relevant') is None:
role = job.get('role') or job.get('title')
company = job.get('organization') or job.get('company')
heritage_relevant, heritage_type = detect_heritage_type(role, company)
job['heritage_relevant'] = heritage_relevant
job['heritage_type'] = heritage_type
return ProfileResponse(
profile_data=file_profile_data,

View file

@ -2250,11 +2250,27 @@ async def stream_heritage_rag(
language: str = "nl",
router: Optional[HeritageQueryRouter] = None,
retriever: Optional[MultiHopHeritageRetriever] = None,
lm: Optional[Any] = None,
) -> AsyncIterator[str]:
"""Stream heritage RAG response with status updates.
Yields NDJSON messages with status updates and final results.
Args:
question: The user's question
language: Language code (default "nl")
router: Optional pre-configured HeritageQueryRouter
retriever: Optional pre-configured MultiHopHeritageRetriever
lm: Optional DSPy LM instance for async-safe context (DSPy 3.x requirement)
"""
from contextlib import nullcontext
# Create DSPy context manager for async-safe LM access
def get_context():
if lm is not None:
return dspy.context(lm=lm)
return nullcontext()
start_time = datetime.now(timezone.utc)
# Initialize modules if not provided
@ -2271,8 +2287,9 @@ async def stream_heritage_rag(
"timestamp": start_time.isoformat(),
}) + "\n"
# Route query
routing = router(question=question, language=language)
# Route query (wrapped with DSPy context for async-safe LM access)
with get_context():
routing = router(question=question, language=language)
yield json.dumps({
"type": "routing",
@ -2323,12 +2340,13 @@ async def stream_heritage_rag(
# Use dspy.streamify for token streaming (if available)
try:
# Create streamified version of synthesizer
streamified = dspy.streamify(retriever.synthesizer)
# Create streamified version of synthesizer (wrapped with DSPy context)
with get_context():
streamified = dspy.streamify(retriever.synthesizer)
listener = HeritageStreamListener()
# Stream tokens
# Stream tokens (streamified retains context from creation)
async for token in streamified(
question=question,
context="Sample context from retrieval",

View file

@ -1532,13 +1532,49 @@ async def dspy_query(request: DSPyQueryRequest) -> DSPyQueryResponse:
pipeline = HeritageRAGPipeline(retriever=qdrant_retriever)
# Execute query with conversation history
result = pipeline.forward(
embedding_model=request.embedding_model,
question=request.question,
language=request.language,
history=history,
include_viz=request.include_visualization,
)
# Retry logic for transient API errors (e.g., Anthropic "Overloaded" errors)
max_retries = 3
last_error: Exception | None = None
result = None
for attempt in range(max_retries):
try:
result = pipeline.forward(
embedding_model=request.embedding_model,
question=request.question,
language=request.language,
history=history,
include_viz=request.include_visualization,
)
break # Success, exit retry loop
except Exception as e:
last_error = e
error_str = str(e).lower()
# Check for retryable errors (API overload, rate limits, temporary failures)
is_retryable = any(keyword in error_str for keyword in [
"overloaded", "rate_limit", "rate limit", "too many requests",
"529", "503", "502", "504", # HTTP status codes
"temporarily unavailable", "service unavailable",
"connection reset", "connection refused", "timeout"
])
if is_retryable and attempt < max_retries - 1:
wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s
logger.warning(
f"Transient API error (attempt {attempt + 1}/{max_retries}): {e}. "
f"Retrying in {wait_time}s..."
)
time.sleep(wait_time)
continue
else:
# Non-retryable error or max retries reached
raise
# If we get here without a result (all retries exhausted), raise the last error
if result is None:
if last_error:
raise last_error
raise HTTPException(status_code=500, detail="Pipeline execution failed with no result")
elapsed_ms = (time.time() - start_time) * 1000