feat(api): Add heritage person classification and RAG retry logic
- Add GLAMORCUBESFIXPHDNT heritage type detection for person profiles - Two-stage classification: blocklist non-heritage orgs, then match keywords - Special handling for Digital (D) type: requires heritage org context - Add career_history heritage_relevant and heritage_type fields - Add exponential backoff retry for Anthropic API overload errors - Fix DSPy 3.x async context with dspy.context() wrapper
This commit is contained in:
parent
22709cc13e
commit
68c5aa2724
4 changed files with 707 additions and 27 deletions
|
|
@ -35,9 +35,12 @@ from decimal import Decimal
|
|||
from fastapi import FastAPI, HTTPException, Query, APIRouter
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.middleware.gzip import GZipMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
from pydantic import BaseModel, Field
|
||||
import asyncpg
|
||||
import httpx
|
||||
import hashlib
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
|
@ -155,6 +158,280 @@ class PersonDetail(BaseModel):
|
|||
source_file: Optional[str]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Heritage Classification (copied from main.py for experience item classification)
|
||||
# ============================================================================
|
||||
|
||||
import re

# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
HERITAGE_KEYWORDS = {
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}

NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Organizations that are explicitly NOT heritage institutions
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]

# Heritage organization keywords - organizations that ARE heritage institutions
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]

# Precompiled word-boundary patterns for the blocklists.  Compiling once at
# import time avoids rebuilding identical regexes on every classification call.
_NON_HERITAGE_ORG_PATTERNS = [
    re.compile(r'\b' + re.escape(org.lower().strip()) + r'\b')
    for org in NON_HERITAGE_ORGANIZATIONS
]
_NON_HERITAGE_KEYWORD_PATTERNS = [
    re.compile(r'\b' + re.escape(kw.lower().strip()) + r'\b')
    for kw in NON_HERITAGE_KEYWORDS
]
# Lowercased copies so per-call matching never re-lowercases the keyword lists.
_HERITAGE_KEYWORDS_LOWER = {
    htype: [kw.lower() for kw in kws] for htype, kws in HERITAGE_KEYWORDS.items()
}
_HERITAGE_ORG_KEYWORDS_LOWER = [kw.lower() for kw in HERITAGE_ORGANIZATION_KEYWORDS]


def detect_heritage_type(role: Optional[str], company: Optional[str]) -> tuple:
    """
    Detect if a position is heritage-relevant and what type.

    Classification stages (in order):
      1. Organization blocklist: explicitly non-heritage employers reject
         immediately (word-boundary match, so "sharing" does not hit "ing").
      2. Role blocklist: non-heritage role keywords reject immediately.
      3. Typed heritage keywords, most specific types first; 'D' (Digital)
         is deliberately excluded from this loop.
      4. 'D' requires BOTH a tech keyword AND a heritage organization.
         This prevents generic IT workers at banks/police from being
         classified as heritage.
      5. Generic heritage terms yield heritage_relevant=True with no type.

    Args:
        role: Job title/role text (may be None)
        company: Company/organization name (may be None)

    Returns:
        Tuple of (heritage_relevant: bool, heritage_type: Optional[str])
    """
    # Combine role and company for full context
    combined = f"{role or ''} {company or ''}".lower()
    if not combined.strip():
        return (False, None)

    # Stage 1: non-heritage organizations (blocklist)
    for pattern in _NON_HERITAGE_ORG_PATTERNS:
        if pattern.search(combined):
            return (False, None)

    # Stage 2: non-heritage role indicators
    for pattern in _NON_HERITAGE_KEYWORD_PATTERNS:
        if pattern.search(combined):
            return (False, None)

    # Stage 3 precondition: is this a heritage organization?  Needed for 'D'.
    is_heritage_org = any(kw in combined for kw in _HERITAGE_ORG_KEYWORDS_LOWER)

    # Stage 3: typed heritage keywords (order matters - more specific first).
    # 'D' is checked last, separately, with heritage-org validation.
    for heritage_type in ('A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'):
        for keyword in _HERITAGE_KEYWORDS_LOWER.get(heritage_type, []):
            if keyword in combined:
                return (True, heritage_type)

    # Stage 4: 'D' (Digital) - ONLY if at a heritage organization
    if is_heritage_org:
        for keyword in _HERITAGE_KEYWORDS_LOWER.get('D', []):
            if keyword in combined:
                return (True, 'D')

    # Stage 5: generic heritage terms (without specific type)
    generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
               'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection']
    for keyword in generic:
        if keyword in combined:
            return (True, None)

    return (False, None)
|
||||
|
||||
|
||||
def enrich_experience_with_heritage(experience: List) -> List[Dict]:
    """
    Annotate every experience item with heritage_relevant / heritage_type.

    Accepts both dict items and JSON-string items (asyncpg returns jsonb
    array elements as strings that still need parsing).

    Args:
        experience: List of experience items (dicts or JSON strings)

    Returns:
        New list in which each surviving item carries heritage_relevant
        and heritage_type in addition to its original fields
    """
    if not experience:
        return []

    annotated = []
    for item in experience:
        # asyncpg can hand back jsonb array elements as raw JSON text.
        if isinstance(item, str):
            try:
                item = json.loads(item)
            except json.JSONDecodeError:
                continue

        # Anything that is still not a mapping cannot be classified; drop it.
        if not isinstance(item, dict):
            continue

        # Pull the fields used for classification (either naming convention).
        job_role = item.get('title') or item.get('role') or ''
        job_org = item.get('company') or item.get('organization') or ''
        relevant, heritage_kind = detect_heritage_type(job_role, job_org)

        # Copy rather than mutate, then attach the classification.
        annotated.append({
            **item,
            'heritage_relevant': relevant,
            'heritage_type': heritage_kind,
        })

    return annotated
|
||||
|
||||
|
||||
def parse_jsonb_list(data) -> List:
    """
    Normalize a jsonb list column from PostgreSQL into a Python list.

    asyncpg delivers jsonb in several shapes:
    - a proper Python list with dict elements
    - a JSON string that encodes a list
    - a list whose elements are JSON strings
    - a list whose elements are Python repr strings (single quotes)

    All of these are handled here.

    Args:
        data: A list, a JSON string representing a list, or None

    Returns:
        A list whose elements are proper Python objects
        (empty list if None or invalid)
    """
    import ast

    if data is None:
        return []

    # A whole-value string must decode to JSON; anything else is invalid.
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError:
            return []

    if not isinstance(data, list):
        return []

    def _coerce(element):
        # Non-strings pass through untouched.
        if not isinstance(element, str):
            return element
        # Try JSON first (double quotes).
        try:
            return json.loads(element)
        except json.JSONDecodeError:
            pass
        # Then a Python literal (single quotes) - handles malformed data.
        try:
            return ast.literal_eval(element)
        except (ValueError, SyntaxError):
            # Plain strings (e.g. skill names) are kept as-is.
            return element

    return [_coerce(element) for element in data]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Global State
|
||||
# ============================================================================
|
||||
|
|
@ -2000,11 +2277,11 @@ async def get_person(staff_id: str):
|
|||
linkedin_url=row['linkedin_url'],
|
||||
profile_image_url=row['profile_image_url'],
|
||||
heritage_relevant=row['heritage_relevant'] if row['heritage_relevant'] is not None else True,
|
||||
heritage_types=row['heritage_types'] if row['heritage_types'] else [],
|
||||
experience=row['experience'] if row['experience'] else [],
|
||||
education=row['education'] if row['education'] else [],
|
||||
skills=row['skills'] if row['skills'] else [],
|
||||
languages=row['languages'] if row['languages'] else [],
|
||||
heritage_types=parse_jsonb_list(row['heritage_types']),
|
||||
experience=enrich_experience_with_heritage(parse_jsonb_list(row['experience'])),
|
||||
education=parse_jsonb_list(row['education']),
|
||||
skills=parse_jsonb_list(row['skills']),
|
||||
languages=parse_jsonb_list(row['languages']),
|
||||
about=row['about'],
|
||||
connections=row['connections'],
|
||||
extraction_date=row['extraction_date'].isoformat() if row['extraction_date'] else None,
|
||||
|
|
@ -2013,6 +2290,148 @@ async def get_person(staff_id: str):
|
|||
)
|
||||
|
||||
|
||||
# ============================================================================
# Image Proxy (Avoid Hotlinking Issues)
# ============================================================================

# In-memory cache for proxied images (simple TTL-based).
# Maps md5(url) -> (content bytes, content_type string, unix timestamp).
_image_cache: Dict[str, tuple] = {}
IMAGE_CACHE_TTL = 3600  # seconds (1 hour)

# Allowed image domains for security
ALLOWED_IMAGE_DOMAINS = {
    # Google Maps
    'lh3.googleusercontent.com',
    'lh4.googleusercontent.com',
    'lh5.googleusercontent.com',
    'lh6.googleusercontent.com',
    'maps.gstatic.com',
    'maps.googleapis.com',
    # Wikidata/Wikimedia
    'upload.wikimedia.org',
    'commons.wikimedia.org',
    # Institution domains (add as needed)
    # Generic patterns handled below
}


def is_allowed_image_url(url: str) -> bool:
    """
    Check if URL is from an allowed domain for proxying.

    Allowed: exact hosts in ALLOWED_IMAGE_DOMAINS, any *.nl / *.org /
    *.museum host, and googleusercontent.com or a true subdomain of it.
    Returns False for anything unparseable.
    """
    try:
        parsed = urlparse(url)
        # Use .hostname (not .netloc) so ":port" and "user@" parts do not
        # defeat the match; it is already lowercased but we normalize anyway.
        domain = (parsed.hostname or '').lower()

        # Check exact matches
        if domain in ALLOWED_IMAGE_DOMAINS:
            return True

        # Allow any .nl domain (Dutch institutions)
        if domain.endswith('.nl'):
            return True

        # Allow any .org domain (many heritage institutions)
        if domain.endswith('.org'):
            return True

        # Allow any .museum domain
        if domain.endswith('.museum'):
            return True

        # Google user content: require an exact host or a true subdomain.
        # A substring test would wrongly accept hosts like
        # "notgoogleusercontent.com" or "googleusercontent.com.evil.net".
        if domain == 'googleusercontent.com' or domain.endswith('.googleusercontent.com'):
            return True

        return False
    except Exception:
        # Malformed URLs are simply not allowed.
        return False
|
||||
|
||||
|
||||
@app.get("/image-proxy")
async def proxy_image(url: str = Query(..., description="Image URL to proxy")):
    """
    Proxy external images to avoid hotlinking issues.

    Many external servers block direct embedding (hotlinking) of their images.
    This endpoint fetches the image server-side and returns it with proper headers.

    Features:
    - Validates URL is from allowed domains (security)
    - Caches images in memory for 1 hour (performance)
    - Sets proper Content-Type headers
    - Avoids CORS issues

    Raises:
        HTTPException 400: URL malformed or response is not an image
        HTTPException 403: URL's domain is not on the allowlist
        HTTPException 502: upstream fetch failed (non-200 or request error)
        HTTPException 504: upstream fetch timed out

    Usage: /image-proxy?url=https://example.com/logo.png
    """
    # Security: validate URL scheme before anything else
    if not url or not url.startswith(('http://', 'https://')):
        raise HTTPException(status_code=400, detail="Invalid URL")

    # Domain allowlist check (see is_allowed_image_url)
    if not is_allowed_image_url(url):
        raise HTTPException(status_code=403, detail="Domain not allowed for proxying")

    # Check cache.  md5 is fine here: the hash is only a cache key, not a
    # security primitive.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    if url_hash in _image_cache:
        content, content_type, timestamp = _image_cache[url_hash]
        # NOTE(review): timestamps are naive local-time epoch values from
        # datetime.now(); consistent within one process, so TTL math is safe.
        if datetime.now().timestamp() - timestamp < IMAGE_CACHE_TTL:
            return Response(
                content=content,
                media_type=content_type,
                headers={
                    "Cache-Control": "public, max-age=3600",
                    "X-Proxy-Cache": "HIT",
                }
            )

    # Fetch image from the upstream server
    try:
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
            response = await client.get(
                url,
                headers={
                    # Spoof headers to avoid hotlink detection: pretend to be
                    # a regular browser navigating from the image's own site.
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9,nl;q=0.8",
                    "Referer": urlparse(url).scheme + "://" + urlparse(url).netloc + "/",
                }
            )

        if response.status_code != 200:
            raise HTTPException(status_code=502, detail=f"Failed to fetch image: {response.status_code}")

        content = response.content
        # Fall back to image/png if upstream omits the header
        content_type = response.headers.get("content-type", "image/png")

        # Validate it's actually an image (reject HTML error pages etc.)
        if not content_type.startswith("image/"):
            raise HTTPException(status_code=400, detail="URL does not point to an image")

        # Cache the result
        _image_cache[url_hash] = (content, content_type, datetime.now().timestamp())

        # Limit cache size (simple LRU-like cleanup): once over 1000 entries,
        # drop the oldest half by stored timestamp.
        if len(_image_cache) > 1000:
            # Remove oldest entries
            sorted_entries = sorted(_image_cache.items(), key=lambda x: x[1][2])
            for key, _ in sorted_entries[:500]:
                del _image_cache[key]

        return Response(
            content=content,
            media_type=content_type,
            headers={
                "Cache-Control": "public, max-age=3600",
                "X-Proxy-Cache": "MISS",
            }
        )

    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Timeout fetching image")
    except httpx.RequestError as e:
        raise HTTPException(status_code=502, detail=f"Error fetching image: {str(e)}")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
|
|
|||
|
|
@ -40,6 +40,171 @@ class Settings(BaseModel):
|
|||
settings = Settings()
|
||||
|
||||
|
||||
# ============================================================================
# Heritage Classification
# ============================================================================

import re

# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
HERITAGE_KEYWORDS = {
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}

NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Organizations that are explicitly NOT heritage institutions
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]

# Heritage organization keywords - organizations that ARE heritage institutions
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]

# Precompiled word-boundary patterns for the blocklists.  Compiling once at
# import time avoids rebuilding identical regexes on every classification call.
_NON_HERITAGE_ORG_PATTERNS = [
    re.compile(r'\b' + re.escape(org.lower().strip()) + r'\b')
    for org in NON_HERITAGE_ORGANIZATIONS
]
_NON_HERITAGE_KEYWORD_PATTERNS = [
    re.compile(r'\b' + re.escape(kw.lower().strip()) + r'\b')
    for kw in NON_HERITAGE_KEYWORDS
]
# Lowercased copies so per-call matching never re-lowercases the keyword lists.
_HERITAGE_KEYWORDS_LOWER = {
    htype: [kw.lower() for kw in kws] for htype, kws in HERITAGE_KEYWORDS.items()
}
_HERITAGE_ORG_KEYWORDS_LOWER = [kw.lower() for kw in HERITAGE_ORGANIZATION_KEYWORDS]


def detect_heritage_type(role: Optional[str], company: Optional[str]) -> tuple:
    """
    Detect if a position is heritage-relevant and what type.

    Classification stages (in order):
      1. Organization blocklist: explicitly non-heritage employers reject
         immediately (word-boundary match, so "sharing" does not hit "ing").
      2. Role blocklist: non-heritage role keywords reject immediately.
      3. Typed heritage keywords, most specific types first; 'D' (Digital)
         is deliberately excluded from this loop.
      4. 'D' requires BOTH a tech keyword AND a heritage organization.
         This prevents generic IT workers at banks/police from being
         classified as heritage.
      5. Generic heritage terms yield heritage_relevant=True with no type.

    Args:
        role: Job title/role text (may be None)
        company: Company/organization name (may be None)

    Returns:
        Tuple of (heritage_relevant: bool, heritage_type: Optional[str])
    """
    # Combine role and company for full context
    combined = f"{role or ''} {company or ''}".lower()
    if not combined.strip():
        return (False, None)

    # Stage 1: non-heritage organizations (blocklist)
    for pattern in _NON_HERITAGE_ORG_PATTERNS:
        if pattern.search(combined):
            return (False, None)

    # Stage 2: non-heritage role indicators
    for pattern in _NON_HERITAGE_KEYWORD_PATTERNS:
        if pattern.search(combined):
            return (False, None)

    # Stage 3 precondition: is this a heritage organization?  Needed for 'D'.
    is_heritage_org = any(kw in combined for kw in _HERITAGE_ORG_KEYWORDS_LOWER)

    # Stage 3: typed heritage keywords (order matters - more specific first).
    # 'D' is checked last, separately, with heritage-org validation.
    for heritage_type in ('A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'):
        for keyword in _HERITAGE_KEYWORDS_LOWER.get(heritage_type, []):
            if keyword in combined:
                return (True, heritage_type)

    # Stage 4: 'D' (Digital) - ONLY if at a heritage organization
    if is_heritage_org:
        for keyword in _HERITAGE_KEYWORDS_LOWER.get('D', []):
            if keyword in combined:
                return (True, 'D')

    # Stage 5: generic heritage terms (without specific type)
    generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
               'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection']
    for keyword in generic:
        if keyword in combined:
            return (True, None)

    return (False, None)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Pydantic Models
|
||||
# ============================================================================
|
||||
|
|
@ -854,22 +1019,44 @@ async def get_profile(
|
|||
if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile:
|
||||
experience = inner_profile.get('experience', [])
|
||||
if experience:
|
||||
# Map field names: title→role, company→organization, duration→dates
|
||||
# Map field names: title→role, company→organization, date_range→dates
|
||||
# Also classify each position as heritage-relevant or not
|
||||
career_history = []
|
||||
for job in experience:
|
||||
role = job.get('title')
|
||||
company = job.get('company')
|
||||
heritage_relevant, heritage_type = detect_heritage_type(role, company)
|
||||
career_item = {
|
||||
'role': job.get('title'),
|
||||
'organization': job.get('company'),
|
||||
'dates': job.get('duration'),
|
||||
'role': role,
|
||||
'organization': company,
|
||||
'dates': job.get('date_range') or job.get('duration'), # date_range has year info
|
||||
'location': job.get('location'),
|
||||
'description': job.get('description'),
|
||||
'company_size': job.get('company_details'),
|
||||
'current': job.get('current', False),
|
||||
'heritage_relevant': heritage_relevant,
|
||||
'heritage_type': heritage_type,
|
||||
}
|
||||
career_history.append(career_item)
|
||||
inner_profile['career_history'] = career_history
|
||||
profile_data['profile_data'] = inner_profile
|
||||
|
||||
# Also add heritage classification to existing career_history entries that lack it
|
||||
if inner_profile and 'career_history' in inner_profile:
|
||||
career_history = inner_profile.get('career_history', [])
|
||||
needs_update = False
|
||||
for job in career_history:
|
||||
if job.get('heritage_relevant') is None:
|
||||
needs_update = True
|
||||
role = job.get('role') or job.get('title')
|
||||
company = job.get('organization') or job.get('company')
|
||||
heritage_relevant, heritage_type = detect_heritage_type(role, company)
|
||||
job['heritage_relevant'] = heritage_relevant
|
||||
job['heritage_type'] = heritage_type
|
||||
if needs_update:
|
||||
inner_profile['career_history'] = career_history
|
||||
profile_data['profile_data'] = inner_profile
|
||||
|
||||
return ProfileResponse(
|
||||
profile_data=profile_data,
|
||||
linkedin_slug=result['linkedin_slug'],
|
||||
|
|
@ -892,24 +1079,44 @@ async def get_profile(
|
|||
file_profile_data = data.get('profile_data', {})
|
||||
|
||||
# Transform experience → career_history for frontend compatibility
|
||||
inner_profile = file_profile_data.get('profile_data', {})
|
||||
# Handle both nested (profile_data.profile_data) and flat (profile_data) structures
|
||||
nested_profile = file_profile_data.get('profile_data', {})
|
||||
inner_profile = nested_profile if nested_profile else file_profile_data
|
||||
if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile:
|
||||
experience = inner_profile.get('experience', [])
|
||||
if experience:
|
||||
# Map field names: title→role, company→organization, date_range→dates
|
||||
# Also classify each position as heritage-relevant or not
|
||||
career_history = []
|
||||
for job in experience:
|
||||
role = job.get('title')
|
||||
company = job.get('company')
|
||||
heritage_relevant, heritage_type = detect_heritage_type(role, company)
|
||||
career_item = {
|
||||
'role': job.get('title'),
|
||||
'organization': job.get('company'),
|
||||
'dates': job.get('duration'),
|
||||
'role': role,
|
||||
'organization': company,
|
||||
'dates': job.get('date_range') or job.get('duration'), # date_range has year info
|
||||
'location': job.get('location'),
|
||||
'description': job.get('description'),
|
||||
'company_size': job.get('company_details'),
|
||||
'current': job.get('current', False),
|
||||
'heritage_relevant': heritage_relevant,
|
||||
'heritage_type': heritage_type,
|
||||
}
|
||||
career_history.append(career_item)
|
||||
inner_profile['career_history'] = career_history
|
||||
file_profile_data['profile_data'] = inner_profile
|
||||
# career_history is now in inner_profile which is either nested or file_profile_data directly
|
||||
|
||||
# Also add heritage classification to existing career_history entries that lack it
|
||||
if inner_profile and 'career_history' in inner_profile:
|
||||
career_history = inner_profile.get('career_history', [])
|
||||
for job in career_history:
|
||||
if job.get('heritage_relevant') is None:
|
||||
role = job.get('role') or job.get('title')
|
||||
company = job.get('organization') or job.get('company')
|
||||
heritage_relevant, heritage_type = detect_heritage_type(role, company)
|
||||
job['heritage_relevant'] = heritage_relevant
|
||||
job['heritage_type'] = heritage_type
|
||||
|
||||
return ProfileResponse(
|
||||
profile_data=file_profile_data,
|
||||
|
|
|
|||
|
|
@ -2250,11 +2250,27 @@ async def stream_heritage_rag(
|
|||
language: str = "nl",
|
||||
router: Optional[HeritageQueryRouter] = None,
|
||||
retriever: Optional[MultiHopHeritageRetriever] = None,
|
||||
lm: Optional[Any] = None,
|
||||
) -> AsyncIterator[str]:
|
||||
"""Stream heritage RAG response with status updates.
|
||||
|
||||
Yields NDJSON messages with status updates and final results.
|
||||
|
||||
Args:
|
||||
question: The user's question
|
||||
language: Language code (default "nl")
|
||||
router: Optional pre-configured HeritageQueryRouter
|
||||
retriever: Optional pre-configured MultiHopHeritageRetriever
|
||||
lm: Optional DSPy LM instance for async-safe context (DSPy 3.x requirement)
|
||||
"""
|
||||
from contextlib import nullcontext
|
||||
|
||||
# Create DSPy context manager for async-safe LM access
|
||||
def get_context():
|
||||
if lm is not None:
|
||||
return dspy.context(lm=lm)
|
||||
return nullcontext()
|
||||
|
||||
start_time = datetime.now(timezone.utc)
|
||||
|
||||
# Initialize modules if not provided
|
||||
|
|
@ -2271,8 +2287,9 @@ async def stream_heritage_rag(
|
|||
"timestamp": start_time.isoformat(),
|
||||
}) + "\n"
|
||||
|
||||
# Route query
|
||||
routing = router(question=question, language=language)
|
||||
# Route query (wrapped with DSPy context for async-safe LM access)
|
||||
with get_context():
|
||||
routing = router(question=question, language=language)
|
||||
|
||||
yield json.dumps({
|
||||
"type": "routing",
|
||||
|
|
@ -2323,12 +2340,13 @@ async def stream_heritage_rag(
|
|||
|
||||
# Use dspy.streamify for token streaming (if available)
|
||||
try:
|
||||
# Create streamified version of synthesizer
|
||||
streamified = dspy.streamify(retriever.synthesizer)
|
||||
# Create streamified version of synthesizer (wrapped with DSPy context)
|
||||
with get_context():
|
||||
streamified = dspy.streamify(retriever.synthesizer)
|
||||
|
||||
listener = HeritageStreamListener()
|
||||
|
||||
# Stream tokens
|
||||
# Stream tokens (streamified retains context from creation)
|
||||
async for token in streamified(
|
||||
question=question,
|
||||
context="Sample context from retrieval",
|
||||
|
|
|
|||
|
|
@ -1532,13 +1532,49 @@ async def dspy_query(request: DSPyQueryRequest) -> DSPyQueryResponse:
|
|||
pipeline = HeritageRAGPipeline(retriever=qdrant_retriever)
|
||||
|
||||
# Execute query with conversation history
|
||||
result = pipeline.forward(
|
||||
embedding_model=request.embedding_model,
|
||||
question=request.question,
|
||||
language=request.language,
|
||||
history=history,
|
||||
include_viz=request.include_visualization,
|
||||
)
|
||||
# Retry logic for transient API errors (e.g., Anthropic "Overloaded" errors)
|
||||
max_retries = 3
|
||||
last_error: Exception | None = None
|
||||
result = None
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
result = pipeline.forward(
|
||||
embedding_model=request.embedding_model,
|
||||
question=request.question,
|
||||
language=request.language,
|
||||
history=history,
|
||||
include_viz=request.include_visualization,
|
||||
)
|
||||
break # Success, exit retry loop
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
error_str = str(e).lower()
|
||||
# Check for retryable errors (API overload, rate limits, temporary failures)
|
||||
is_retryable = any(keyword in error_str for keyword in [
|
||||
"overloaded", "rate_limit", "rate limit", "too many requests",
|
||||
"529", "503", "502", "504", # HTTP status codes
|
||||
"temporarily unavailable", "service unavailable",
|
||||
"connection reset", "connection refused", "timeout"
|
||||
])
|
||||
|
||||
if is_retryable and attempt < max_retries - 1:
|
||||
wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s
|
||||
logger.warning(
|
||||
f"Transient API error (attempt {attempt + 1}/{max_retries}): {e}. "
|
||||
f"Retrying in {wait_time}s..."
|
||||
)
|
||||
time.sleep(wait_time)
|
||||
continue
|
||||
else:
|
||||
# Non-retryable error or max retries reached
|
||||
raise
|
||||
|
||||
# If we get here without a result (all retries exhausted), raise the last error
|
||||
if result is None:
|
||||
if last_error:
|
||||
raise last_error
|
||||
raise HTTPException(status_code=500, detail="Pipeline execution failed with no result")
|
||||
|
||||
elapsed_ms = (time.time() - start_time) * 1000
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue