# ============================================================================
# Heritage Classification (copied from main.py for experience item classification)
# ============================================================================

import re

# Heritage type detection keywords for the GLAMORCUBESFIXPHDNT taxonomy.
# Matching is case-insensitive substring matching against "role company";
# entries carrying a trailing space (e.g. 'KB ', 'IT ', 'data ') rely on that
# space so they do not match inside longer words.
HERITAGE_KEYWORDS = {
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}

# Role indicators that mark a position as clearly outside the heritage sector.
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Organizations that are explicitly NOT heritage institutions
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]

# Heritage organization keywords - organizations that ARE heritage institutions
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]


def _compile_blocklist(terms: List[str]) -> re.Pattern:
    """Compile one alternation regex that word-boundary-matches any of *terms*.

    Terms are lowercased and stripped first (trailing spaces in entries like
    'ing ' exist only to hint intent; the \\b anchors do the real work and
    avoid false positives such as 'sharing' matching 'ing').
    """
    escaped = [re.escape(t.lower().strip()) for t in terms]
    return re.compile(r'\b(?:' + '|'.join(escaped) + r')\b')


# Precompiled once at import time. The previous implementation rebuilt and ran
# one regex per term on every call; a single alternation yields the identical
# accept/reject outcome (any term matching => blocked) without per-call work.
_NON_HERITAGE_ORG_RE = _compile_blocklist(NON_HERITAGE_ORGANIZATIONS)
_NON_HERITAGE_ROLE_RE = _compile_blocklist(NON_HERITAGE_KEYWORDS)

# Generic heritage terms: mark a position as relevant without a specific type.
_GENERIC_HERITAGE_TERMS = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film',
                           'cinema', 'media', 'arts', 'kunst', 'creative', 'preservation',
                           'conservation', 'collection']


def detect_heritage_type(role: Optional[str], company: Optional[str]) -> tuple:
    """
    Detect whether a position is heritage-relevant and, if so, which type.

    Classification stages:
      1. Blocklist of explicitly non-heritage organizations (word-boundary match).
      2. Blocklist of non-heritage role indicators (word-boundary match).
      3. Typed heritage keywords, most specific types first. 'D' (Digital) is
         only awarded when the organization itself looks like a heritage
         institution, so generic IT workers at banks/police are not classified
         as heritage.
      4. Generic heritage terms -> relevant but untyped.

    Args:
        role: Job title/role text.
        company: Company/organization name.

    Returns:
        Tuple of (heritage_relevant: bool, heritage_type: Optional[str]).
    """
    combined = f"{role or ''} {company or ''}".lower()
    if not combined.strip():
        return (False, None)

    # Stages 1+2: organization blocklist first, then role blocklist (same
    # order as checking them separately; either match rejects the position).
    if _NON_HERITAGE_ORG_RE.search(combined) or _NON_HERITAGE_ROLE_RE.search(combined):
        return (False, None)

    # Is the employer itself a heritage institution? Needed for the 'D' rule.
    is_heritage_org = any(k.lower() in combined for k in HERITAGE_ORGANIZATION_KEYWORDS)

    # Stage 3: typed keywords, most specific types first.
    # 'D' is deliberately excluded here and handled below.
    for heritage_type in ('A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'):
        if any(k.lower() in combined for k in HERITAGE_KEYWORDS.get(heritage_type, [])):
            return (True, heritage_type)

    # 'D' (Digital) requires BOTH a tech keyword AND a heritage organization.
    if is_heritage_org and any(k.lower() in combined for k in HERITAGE_KEYWORDS.get('D', [])):
        return (True, 'D')

    # Stage 4: generic heritage terms without a specific type.
    if any(k in combined for k in _GENERIC_HERITAGE_TERMS):
        return (True, None)

    return (False, None)


def enrich_experience_with_heritage(experience: List) -> List[Dict]:
    """
    Add heritage_relevant / heritage_type fields to every experience item.

    Accepts both dict items and JSON-string items (asyncpg can return jsonb
    array elements as strings); unparseable or non-dict items are dropped.

    Args:
        experience: List of experience items (dicts or JSON strings).

    Returns:
        New list of dicts, each with heritage_relevant and heritage_type set.
    """
    if not experience:
        return []

    enriched = []
    for exp in experience:
        # asyncpg sometimes hands back jsonb elements as raw JSON strings.
        if isinstance(exp, str):
            try:
                exp = json.loads(exp)
            except json.JSONDecodeError:
                continue
        if not isinstance(exp, dict):
            continue

        role = exp.get('title') or exp.get('role') or ''
        company = exp.get('company') or exp.get('organization') or ''
        heritage_relevant, heritage_type = detect_heritage_type(role, company)

        # Copy rather than mutate the caller's dict.
        enriched.append({**exp,
                         'heritage_relevant': heritage_relevant,
                         'heritage_type': heritage_type})
    return enriched


def parse_jsonb_list(data) -> List:
    """
    Parse a jsonb list field from PostgreSQL into plain Python objects.

    asyncpg returns jsonb columns in several shapes:
      - a proper Python list with dict elements,
      - a JSON string encoding the whole list,
      - a list whose elements are JSON strings,
      - a list whose elements are Python repr strings (single quotes).

    Args:
        data: A list, a JSON string representing a list, or None.

    Returns:
        Parsed list; [] if data is None, unparseable, or not a list.
    """
    import ast

    if data is None:
        return []

    # Whole-value JSON string -> parse it first.
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError:
            return []

    if not isinstance(data, list):
        return []

    result = []
    for item in data:
        if not isinstance(item, str):
            result.append(item)
            continue
        # Element-level JSON string (double quotes).
        try:
            result.append(json.loads(item))
            continue
        except json.JSONDecodeError:
            pass
        # Python literal repr (single quotes) — tolerates malformed data.
        try:
            result.append(ast.literal_eval(item))
            continue
        except (ValueError, SyntaxError):
            pass
        # Plain string (e.g. a skill name) — keep as-is.
        result.append(item)
    return result
# ============================================================================
# Image Proxy (Avoid Hotlinking Issues)
# ============================================================================

# In-memory cache for proxied images (simple TTL-based).
# NOTE(review): plain dict, no lock — concurrent requests may race; acceptable
# for a best-effort cache (worst case is a duplicate upstream fetch).
_image_cache: Dict[str, tuple] = {}  # md5(url) -> (content, content_type, timestamp)
IMAGE_CACHE_TTL = 3600  # seconds (1 hour)

# Allowed image domains for security
ALLOWED_IMAGE_DOMAINS = {
    # Google Maps
    'lh3.googleusercontent.com',
    'lh4.googleusercontent.com',
    'lh5.googleusercontent.com',
    'lh6.googleusercontent.com',
    'maps.gstatic.com',
    'maps.googleapis.com',
    # Wikidata/Wikimedia
    'upload.wikimedia.org',
    'commons.wikimedia.org',
    # Institution domains (add as needed)
    # Generic patterns handled below
}


def is_allowed_image_url(url: str) -> bool:
    """Check whether *url* points at a domain we are willing to proxy.

    Policy: exact allowlist entries, any *.nl / *.org / *.museum domain
    (Dutch and heritage institutions), and Google user-content hosts.

    NOTE(review): allowing every .nl/.org host is a broad server-side fetch
    (SSRF) surface — confirm this is the intended policy.
    """
    try:
        domain = urlparse(url).netloc.lower()

        # Exact allowlist matches.
        if domain in ALLOWED_IMAGE_DOMAINS:
            return True

        # Heritage-friendly TLDs (.nl Dutch institutions, .org, .museum).
        if domain.endswith(('.nl', '.org', '.museum')):
            return True

        # Google user-content hosts. Bug fix: the previous substring test
        # ('googleusercontent.com' in domain) also accepted spoofed hosts such
        # as 'evilgoogleusercontent.com' or 'googleusercontent.com.evil.net';
        # require an exact host or a true dot-separated subdomain instead.
        if domain == 'googleusercontent.com' or domain.endswith('.googleusercontent.com'):
            return True

        return False
    except Exception:
        # Malformed URLs are simply not allowed.
        return False


@app.get("/image-proxy")
async def proxy_image(url: str = Query(..., description="Image URL to proxy")):
    """
    Proxy external images to avoid hotlinking issues.

    Many external servers block direct embedding (hotlinking) of their images.
    This endpoint fetches the image server-side and returns it with proper
    headers.

    Features:
    - Validates URL is from allowed domains (security)
    - Caches images in memory for 1 hour (performance)
    - Sets proper Content-Type headers
    - Avoids CORS issues

    Usage: /image-proxy?url=https://example.com/logo.png
    """
    # Security: only http(s) URLs on allowlisted domains.
    if not url or not url.startswith(('http://', 'https://')):
        raise HTTPException(status_code=400, detail="Invalid URL")

    if not is_allowed_image_url(url):
        raise HTTPException(status_code=403, detail="Domain not allowed for proxying")

    # Cache lookup. md5 is only a cache key here, not a security boundary.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    if url_hash in _image_cache:
        content, content_type, timestamp = _image_cache[url_hash]
        if datetime.now().timestamp() - timestamp < IMAGE_CACHE_TTL:
            return Response(
                content=content,
                media_type=content_type,
                headers={
                    "Cache-Control": "public, max-age=3600",
                    "X-Proxy-Cache": "HIT",
                }
            )

    # Fetch the image upstream.
    try:
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
            response = await client.get(
                url,
                headers={
                    # Spoof browser headers to avoid hotlink detection
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9,nl;q=0.8",
                    "Referer": urlparse(url).scheme + "://" + urlparse(url).netloc + "/",
                }
            )

        if response.status_code != 200:
            raise HTTPException(status_code=502, detail=f"Failed to fetch image: {response.status_code}")

        content = response.content
        content_type = response.headers.get("content-type", "image/png")

        # Validate it's actually an image before serving it on.
        if not content_type.startswith("image/"):
            raise HTTPException(status_code=400, detail="URL does not point to an image")

        # Cache the result.
        _image_cache[url_hash] = (content, content_type, datetime.now().timestamp())

        # Bound the cache (simple LRU-like cleanup): once over 1000 entries,
        # drop the 500 oldest by stored timestamp.
        if len(_image_cache) > 1000:
            sorted_entries = sorted(_image_cache.items(), key=lambda x: x[1][2])
            for key, _ in sorted_entries[:500]:
                del _image_cache[key]

        return Response(
            content=content,
            media_type=content_type,
            headers={
                "Cache-Control": "public, max-age=3600",
                "X-Proxy-Cache": "MISS",
            }
        )

    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Timeout fetching image")
    except httpx.RequestError as e:
        raise HTTPException(status_code=502, detail=f"Error fetching image: {str(e)}")
# ============================================================================
# Heritage Classification
# ============================================================================

# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
HERITAGE_KEYWORDS = {
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space'],
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library'],
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid',
          'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief',
          'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist'],
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh',
          'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder',
          'collectiespecialist', 'collectie'],
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW'],
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO',
          'documentatie', 'documentation', 'kenniscentrum', 'historicus'],
    'C': ['corporate archive', 'bedrijfsarchief', 'company history'],
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy',
          'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA',
          'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor',
          'associate professor', 'hoogleraar', 'educatie', 'educator'],
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging'],
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer',
          'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist'],
}

NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]

# Organizations that are explicitly NOT heritage institutions
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]

# Heritage organization keywords - organizations that ARE heritage institutions
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]


def detect_heritage_type(role: Optional[str], company: Optional[str]) -> tuple:
    """
    Decide whether a position is heritage-relevant and assign a heritage type.

    The check runs in order: organization blocklist, then role blocklist
    (word-boundary matches, so e.g. "sharing" never trips 'ing '); then typed
    heritage keywords, most specific types first. The 'D' (Digital) type
    needs BOTH a tech keyword AND an employer that looks like a heritage
    institution, keeping generic IT staff at banks/police out. Finally,
    generic heritage terms mark the position relevant without a type.

    Args:
        role: Job title/role text.
        company: Company/organization name.

    Returns:
        Tuple of (heritage_relevant: bool, heritage_type: Optional[str]).
    """
    import re

    combined = f"{role or ''} {company or ''}".lower()
    if not combined.strip():
        return (False, None)

    def blocked_by(terms):
        # Word-boundary match on any lowercased, stripped term.
        return any(
            re.search(r'\b' + re.escape(term.lower().strip()) + r'\b', combined)
            for term in terms
        )

    # Organization blocklist first, then role blocklist.
    if blocked_by(NON_HERITAGE_ORGANIZATIONS) or blocked_by(NON_HERITAGE_KEYWORDS):
        return (False, None)

    # Does the employer itself look like a heritage institution?
    is_heritage_org = any(k.lower() in combined for k in HERITAGE_ORGANIZATION_KEYWORDS)

    # Typed keywords, most specific types first; 'D' is handled separately.
    for heritage_type in ('A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'):
        if any(k.lower() in combined for k in HERITAGE_KEYWORDS.get(heritage_type, [])):
            return (True, heritage_type)

    # 'D' (Digital) only at a heritage organization.
    if is_heritage_org and any(k.lower() in combined for k in HERITAGE_KEYWORDS.get('D', [])):
        return (True, 'D')

    # Generic heritage terms (without specific type).
    generic_terms = ('heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
                     'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation',
                     'collection')
    if any(term in combined for term in generic_terms):
        return (True, None)

    return (False, None)
+ for job in career_history: + if job.get('heritage_relevant') is None: + needs_update = True + role = job.get('role') or job.get('title') + company = job.get('organization') or job.get('company') + heritage_relevant, heritage_type = detect_heritage_type(role, company) + job['heritage_relevant'] = heritage_relevant + job['heritage_type'] = heritage_type + if needs_update: + inner_profile['career_history'] = career_history + profile_data['profile_data'] = inner_profile + return ProfileResponse( profile_data=profile_data, linkedin_slug=result['linkedin_slug'], @@ -892,24 +1079,44 @@ async def get_profile( file_profile_data = data.get('profile_data', {}) # Transform experience → career_history for frontend compatibility - inner_profile = file_profile_data.get('profile_data', {}) + # Handle both nested (profile_data.profile_data) and flat (profile_data) structures + nested_profile = file_profile_data.get('profile_data', {}) + inner_profile = nested_profile if nested_profile else file_profile_data if inner_profile and 'experience' in inner_profile and 'career_history' not in inner_profile: experience = inner_profile.get('experience', []) if experience: + # Map field names: title→role, company→organization, date_range→dates + # Also classify each position as heritage-relevant or not career_history = [] for job in experience: + role = job.get('title') + company = job.get('company') + heritage_relevant, heritage_type = detect_heritage_type(role, company) career_item = { - 'role': job.get('title'), - 'organization': job.get('company'), - 'dates': job.get('duration'), + 'role': role, + 'organization': company, + 'dates': job.get('date_range') or job.get('duration'), # date_range has year info 'location': job.get('location'), 'description': job.get('description'), 'company_size': job.get('company_details'), 'current': job.get('current', False), + 'heritage_relevant': heritage_relevant, + 'heritage_type': heritage_type, } career_history.append(career_item) 
inner_profile['career_history'] = career_history - file_profile_data['profile_data'] = inner_profile + # career_history is now in inner_profile which is either nested or file_profile_data directly + + # Also add heritage classification to existing career_history entries that lack it + if inner_profile and 'career_history' in inner_profile: + career_history = inner_profile.get('career_history', []) + for job in career_history: + if job.get('heritage_relevant') is None: + role = job.get('role') or job.get('title') + company = job.get('organization') or job.get('company') + heritage_relevant, heritage_type = detect_heritage_type(role, company) + job['heritage_relevant'] = heritage_relevant + job['heritage_type'] = heritage_type return ProfileResponse( profile_data=file_profile_data, diff --git a/backend/rag/dspy_heritage_rag.py b/backend/rag/dspy_heritage_rag.py index 2f14d501ca..6da476f783 100644 --- a/backend/rag/dspy_heritage_rag.py +++ b/backend/rag/dspy_heritage_rag.py @@ -2250,11 +2250,27 @@ async def stream_heritage_rag( language: str = "nl", router: Optional[HeritageQueryRouter] = None, retriever: Optional[MultiHopHeritageRetriever] = None, + lm: Optional[Any] = None, ) -> AsyncIterator[str]: """Stream heritage RAG response with status updates. Yields NDJSON messages with status updates and final results. 
+ + Args: + question: The user's question + language: Language code (default "nl") + router: Optional pre-configured HeritageQueryRouter + retriever: Optional pre-configured MultiHopHeritageRetriever + lm: Optional DSPy LM instance for async-safe context (DSPy 3.x requirement) """ + from contextlib import nullcontext + + # Create DSPy context manager for async-safe LM access + def get_context(): + if lm is not None: + return dspy.context(lm=lm) + return nullcontext() + start_time = datetime.now(timezone.utc) # Initialize modules if not provided @@ -2271,8 +2287,9 @@ async def stream_heritage_rag( "timestamp": start_time.isoformat(), }) + "\n" - # Route query - routing = router(question=question, language=language) + # Route query (wrapped with DSPy context for async-safe LM access) + with get_context(): + routing = router(question=question, language=language) yield json.dumps({ "type": "routing", @@ -2323,12 +2340,13 @@ async def stream_heritage_rag( # Use dspy.streamify for token streaming (if available) try: - # Create streamified version of synthesizer - streamified = dspy.streamify(retriever.synthesizer) + # Create streamified version of synthesizer (wrapped with DSPy context) + with get_context(): + streamified = dspy.streamify(retriever.synthesizer) listener = HeritageStreamListener() - # Stream tokens + # Stream tokens (streamified retains context from creation) async for token in streamified( question=question, context="Sample context from retrieval", diff --git a/backend/rag/main.py b/backend/rag/main.py index e9321b0811..0d2060c1d9 100644 --- a/backend/rag/main.py +++ b/backend/rag/main.py @@ -1532,13 +1532,49 @@ async def dspy_query(request: DSPyQueryRequest) -> DSPyQueryResponse: pipeline = HeritageRAGPipeline(retriever=qdrant_retriever) # Execute query with conversation history - result = pipeline.forward( - embedding_model=request.embedding_model, - question=request.question, - language=request.language, - history=history, - 
include_viz=request.include_visualization, - ) + # Retry logic for transient API errors (e.g., Anthropic "Overloaded" errors) + max_retries = 3 + last_error: Exception | None = None + result = None + + for attempt in range(max_retries): + try: + result = pipeline.forward( + embedding_model=request.embedding_model, + question=request.question, + language=request.language, + history=history, + include_viz=request.include_visualization, + ) + break # Success, exit retry loop + except Exception as e: + last_error = e + error_str = str(e).lower() + # Check for retryable errors (API overload, rate limits, temporary failures) + is_retryable = any(keyword in error_str for keyword in [ + "overloaded", "rate_limit", "rate limit", "too many requests", + "529", "503", "502", "504", # HTTP status codes + "temporarily unavailable", "service unavailable", + "connection reset", "connection refused", "timeout" + ]) + + if is_retryable and attempt < max_retries - 1: + wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s + logger.warning( + f"Transient API error (attempt {attempt + 1}/{max_retries}): {e}. " + f"Retrying in {wait_time}s..." + ) + time.sleep(wait_time) + continue + else: + # Non-retryable error or max retries reached + raise + + # If we get here without a result (all retries exhausted), raise the last error + if result is None: + if last_error: + raise last_error + raise HTTPException(status_code=500, detail="Pipeline execution failed with no result") elapsed_ms = (time.time() - start_time) * 1000