#!/usr/bin/env python3 """ Index Heritage Persons in Qdrant This script reads person entity JSON files and indexes them in Qdrant for semantic search and RAG-enhanced queries about heritage sector professionals. Usage: python scripts/index_persons_qdrant.py [--data-dir DATA_DIR] [--host HOST] [--port PORT] Examples: # Index all persons from default data directory python scripts/index_persons_qdrant.py # Index from specific directory python scripts/index_persons_qdrant.py --data-dir data/custodian/person/entity/ # Connect to remote Qdrant python scripts/index_persons_qdrant.py --host 91.98.224.44 --port 6333 """ import argparse import json import logging import os import sys from pathlib import Path from typing import Any # Add project root to path PROJECT_ROOT = Path(__file__).parent.parent sys.path.insert(0, str(PROJECT_ROOT / "src")) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S" ) logger = logging.getLogger(__name__) def load_json_file(filepath: Path) -> dict[str, Any] | None: """Load a JSON file and return its contents.""" try: with open(filepath, "r", encoding="utf-8") as f: return json.load(f) except Exception as e: logger.warning(f"Failed to load {filepath}: {e}") return None def extract_person_text(data: dict[str, Any]) -> str: """Extract searchable text from person entity data.""" parts = [] profile = data.get("profile_data", {}) # Full name (primary identifier) name = profile.get("full_name", "") if name: parts.append(f"Name: {name}") # Headline / current position headline = profile.get("headline", "") if headline: parts.append(f"Role: {headline}") # Location location = profile.get("location", "") if location: parts.append(f"Location: {location}") # About / summary about = profile.get("about", "") if about: parts.append(about[:500]) # Truncate long about sections # Current position details current_pos = profile.get("current_position", {}) if current_pos and isinstance(current_pos, dict): company = current_pos.get("company", "") title = current_pos.get("title", "") if company: parts.append(f"Currently at: {company}") if title and title != headline: parts.append(f"Current title: {title}") # Career history - focus on heritage-relevant positions heritage_exp = profile.get("heritage_relevant_experience", []) if heritage_exp: for exp in heritage_exp[:5]: # Limit to 5 most relevant if isinstance(exp, dict): title = exp.get("title", "") company = exp.get("company", "") note = exp.get("relevance_note", "") if title and company: parts.append(f"Experience: {title} at {company}") if note: parts.append(note) # If no heritage-specific experience, use general career history if not heritage_exp: career = profile.get("career_history", []) for job in career[:5]: if isinstance(job, dict): title = job.get("title", "") company = job.get("company", "") if title and company: parts.append(f"Experience: {title} at {company}") # Custodian affiliations affiliations = data.get("custodian_affiliations", []) if affiliations: aff_names = [] for aff in affiliations[:5]: if isinstance(aff, dict): name = aff.get("custodian_name", "") role = aff.get("role", "") current = aff.get("current", False) if name: status = "(current)" if current else "(past)" aff_names.append(f"{name} {status}") if aff_names: parts.append(f"Affiliations: {', '.join(aff_names)}") # Education education = profile.get("education") or [] for edu in education[:3]: if isinstance(edu, dict): degree = edu.get("degree", "") institution = edu.get("institution", "") field = edu.get("field", "") if institution: edu_str = institution if degree: edu_str = f"{degree} from {institution}" if field: edu_str += f" ({field})" parts.append(f"Education: {edu_str}") # Skills skills = profile.get("skills", []) if skills: parts.append(f"Skills: {', '.join(skills[:15])}") # Languages languages = profile.get("languages", []) if languages: lang_strs = [] for lang in languages: if isinstance(lang, str): lang_strs.append(lang) elif isinstance(lang, dict): lang_strs.append(lang.get("name", lang.get("language", str(lang)))) if lang_strs: parts.append(f"Languages: {', '.join(lang_strs)}") # Network analysis notes network = data.get("network_analysis", {}) if network: notes = network.get("notes", "") if notes: parts.append(notes[:300]) # Heritage sector assessment assessment = data.get("heritage_sector_assessment", {}) if assessment: classification = assessment.get("sector_classification", "") leadership = assessment.get("leadership_level", "") if classification: parts.append(f"Sector: {classification}") if leadership: parts.append(f"Leadership level: {leadership}") return "\n".join(parts) def calculate_richness_score(data: dict[str, Any]) -> float: """Calculate a metadata richness score (0.0 - 1.0) for a person profile. This score is used to boost search results for profiles with more complete data. Profiles with rich metadata (about section, career history, skills, education) should rank higher than sparse profiles with only a name and headline. Scoring components (max 1.0): - Has full name: 0.05 - Has headline: 0.05 - Has location: 0.05 - Has about/summary (>100 chars): 0.15 - Has about/summary (>300 chars): 0.05 bonus - Has career history (1-3 jobs): 0.10 - Has career history (4+ jobs): 0.10 bonus - Has skills (1-5): 0.10 - Has skills (6+): 0.05 bonus - Has education (1+): 0.10 - Has languages: 0.05 - Has heritage-relevant experience: 0.10 - Has LinkedIn URL: 0.05 """ score = 0.0 profile = data.get("profile_data", {}) # Basic info (0.15 max) if profile.get("full_name"): score += 0.05 if profile.get("headline"): score += 0.05 if profile.get("location"): score += 0.05 # About section (0.20 max) - most important for context about = profile.get("about", "") or "" if len(about) > 100: score += 0.15 if len(about) > 300: score += 0.05 # Career history (0.20 max) career = profile.get("career_history", []) or [] if len(career) >= 1: score += 0.10 if len(career) >= 4: score += 0.10 # Skills (0.15 max) skills = profile.get("skills", []) or [] if len(skills) >= 1: score += 0.10 if len(skills) >= 6: score += 0.05 # Education (0.10 max) education = profile.get("education", []) or [] if len(education) >= 1: score += 0.10 # Languages (0.05 max) languages = profile.get("languages", []) or [] if len(languages) >= 1: score += 0.05 # Heritage-relevant experience (0.10 max) - important for domain relevance heritage_exp = profile.get("heritage_relevant_experience", []) or [] if len(heritage_exp) >= 1: score += 0.10 # LinkedIn URL (0.05 max) - indicates verifiable profile if data.get("linkedin_profile_url") or data.get("extraction_metadata", {}).get("linkedin_url"): score += 0.05 return min(score, 1.0) # Cap at 1.0 def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]: """Extract metadata for filtering from person data.""" metadata: dict[str, Any] = { "filename": filepath.name, "type": "person", } profile = data.get("profile_data", {}) extraction = data.get("extraction_metadata", {}) # Full name - check multiple possible field names name = ( profile.get("name", "") or profile.get("full_name", "") or data.get("name", "") ) if name: metadata["name"] = name # LinkedIn URL - check multiple locations linkedin_url = ( data.get("linkedin_profile_url", "") or profile.get("linkedin_url", "") or extraction.get("linkedin_url", "") ) if linkedin_url: metadata["linkedin_url"] = linkedin_url # Extract slug from URL if "/in/" in linkedin_url: slug = linkedin_url.split("/in/")[-1].rstrip("/") metadata["linkedin_slug"] = slug # Current position/headline headline = profile.get("headline", "") if headline: metadata["headline"] = headline # Location location = profile.get("location", "") if location: metadata["location"] = location # Try to extract city/country if ", " in location: parts = [p.strip() for p in location.split(",")] metadata["city"] = parts[0] if len(parts) >= 3: metadata["country"] = parts[-1] # Current company - from current_position or experience current_pos = profile.get("current_position", {}) if current_pos and isinstance(current_pos, dict): company = current_pos.get("company", "") if company: metadata["current_company"] = company # If no current_position, try to find current job in experience if "current_company" not in metadata: experience = profile.get("experience", []) for exp in experience: if isinstance(exp, dict) and exp.get("current"): company = exp.get("company", "") if company: metadata["current_company"] = company break # Heritage relevance - check heritage_relevance section heritage_rel = data.get("heritage_relevance", {}) if heritage_rel: is_relevant = heritage_rel.get("is_heritage_relevant", False) metadata["heritage_relevant"] = is_relevant # Primary heritage type primary_type = heritage_rel.get("primary_heritage_type", "") if primary_type: metadata["heritage_type"] = primary_type # Heritage institution types worked at (from experience) heritage_exp = profile.get("heritage_relevant_experience", []) heritage_types = set() for exp in heritage_exp: if isinstance(exp, dict): h_type = exp.get("heritage_type", "") if h_type: heritage_types.add(h_type) if heritage_types: metadata["heritage_types"] = list(heritage_types) # If heritage_relevant not set but has heritage experience, mark as relevant if "heritage_relevant" not in metadata: metadata["heritage_relevant"] = True # Auto-detect heritage relevance from headline/company if not explicitly set if "heritage_relevant" not in metadata: heritage_keywords = [ "museum", "archief", "archive", "bibliotheek", "library", "erfgoed", "heritage", "collectie", "collection", "curator", "archivist", "conservator", "nationaal archief", "rijksmuseum", "rijksarchief", "digitaal erfgoed", "digital heritage", "cultureel erfgoed", "cultural heritage" ] text_to_check = f"{headline} {metadata.get('current_company', '')}".lower() if any(kw in text_to_check for kw in heritage_keywords): metadata["heritage_relevant"] = True # Default heritage_relevant to False if still not set if "heritage_relevant" not in metadata: metadata["heritage_relevant"] = False # Current affiliations affiliations = data.get("custodian_affiliations", []) current_affiliations = [] for aff in affiliations: if isinstance(aff, dict) and aff.get("current"): custodian_name = aff.get("custodian_name", "") if custodian_name: current_affiliations.append(custodian_name) if current_affiliations: metadata["current_affiliations"] = current_affiliations # Extraction metadata if extraction: extraction_date = extraction.get("extraction_date", "") if extraction_date: metadata["extraction_date"] = extraction_date # Calculate richness score for search ranking metadata["richness_score"] = calculate_richness_score(data) return metadata def find_person_files(data_dir: Path) -> list[Path]: """Find all person JSON files in the data directory.""" files = [] # Look for JSON files patterns = [ "*.json", ] for pattern in patterns: files.extend(data_dir.glob(pattern)) # Filter out non-person files excluded_patterns = [ "_schema", "_config", "_template", "test_", "example_", ".DS_Store", "_connections_", # Connection files, not person profiles "_staff_", # Staff list aggregates, not individual profiles ] filtered = [] for f in files: if not any(excl in f.name for excl in excluded_patterns): filtered.append(f) return sorted(filtered) class PersonRetriever: """Qdrant retriever specifically for person entities.""" def __init__( self, host: str = "localhost", port: int = 6333, collection_name: str = "heritage_persons", embedding_model: str = "text-embedding-3-small", embedding_dim: int = 1536, api_key: str | None = None, url: str | None = None, https: bool = False, prefix: str | None = None, ): from qdrant_client import QdrantClient from qdrant_client.http.models import Distance, VectorParams self.collection_name = collection_name self.embedding_model = embedding_model self.embedding_dim = embedding_dim self.api_key = api_key or os.getenv("OPENAI_API_KEY") # Initialize Qdrant client if url: self.client = QdrantClient(url=url, prefer_grpc=False, timeout=60) elif https or port == 443: self.client = QdrantClient( host=host, port=port, https=True, prefix=prefix, prefer_grpc=False, timeout=60 ) else: self.client = QdrantClient(host=host, port=port, timeout=60) self._openai_client = None @property def openai_client(self): """Lazy-load OpenAI client.""" if self._openai_client is None: import openai self._openai_client = openai.OpenAI(api_key=self.api_key) return self._openai_client def _get_embeddings_batch(self, texts: list[str]) -> list[list[float]]: """Get embedding vectors for multiple texts.""" if not texts: return [] response = self.openai_client.embeddings.create( input=texts, model=self.embedding_model ) return [item.embedding for item in sorted(response.data, key=lambda x: x.index)] def ensure_collection(self) -> None: """Ensure the collection exists, create if not.""" from qdrant_client.http.models import Distance, VectorParams collections = self.client.get_collections().collections collection_names = [c.name for c in collections] if self.collection_name not in collection_names: logger.info(f"Creating collection: {self.collection_name}") self.client.create_collection( collection_name=self.collection_name, vectors_config=VectorParams( size=self.embedding_dim, distance=Distance.COSINE ) ) def delete_collection(self) -> None: """Delete the collection if it exists.""" try: self.client.delete_collection(self.collection_name) logger.info(f"Deleted collection: {self.collection_name}") except Exception as e: logger.warning(f"Could not delete collection: {e}") def add_documents( self, documents: list[dict[str, Any]], batch_size: int = 50 ) -> int: """Add documents to the collection.""" import hashlib from qdrant_client.http import models self.ensure_collection() valid_docs = [d for d in documents if d.get("text")] total_indexed = 0 for i in range(0, len(valid_docs), batch_size): batch = valid_docs[i:i + batch_size] texts = [d["text"] for d in batch] logger.info(f"Embedding batch {i//batch_size + 1}/{(len(valid_docs) + batch_size - 1)//batch_size} ({len(batch)} docs)") embeddings = self._get_embeddings_batch(texts) points = [] for j, (doc, embedding) in enumerate(zip(batch, embeddings)): # Generate deterministic ID from text doc_id = hashlib.md5(doc["text"].encode()).hexdigest() points.append(models.PointStruct( id=doc_id, vector=embedding, payload={ "text": doc["text"], **doc.get("metadata", {}) } )) self.client.upsert( collection_name=self.collection_name, points=points ) total_indexed += len(points) logger.info(f"Indexed {total_indexed}/{len(valid_docs)} documents") return total_indexed def get_collection_info(self) -> dict[str, Any]: """Get collection information.""" try: info = self.client.get_collection(self.collection_name) return { "status": info.status, "vectors_count": getattr(info, "vectors_count", None) or getattr(info, "points_count", 0), "points_count": getattr(info, "points_count", 0), } except Exception as e: return {"error": str(e)} def main(): parser = argparse.ArgumentParser( description="Index heritage persons in Qdrant for semantic search" ) parser.add_argument( "--data-dir", type=Path, default=PROJECT_ROOT / "data" / "custodian" / "person" / "entity", help="Directory containing person JSON files" ) parser.add_argument( "--host", default=os.getenv("QDRANT_HOST", "localhost"), help="Qdrant server hostname" ) parser.add_argument( "--port", type=int, default=int(os.getenv("QDRANT_PORT", "6333")), help="Qdrant REST API port" ) parser.add_argument( "--url", default=os.getenv("QDRANT_URL", ""), help="Full Qdrant URL. Overrides host/port." ) parser.add_argument( "--collection", default="heritage_persons", help="Qdrant collection name" ) parser.add_argument( "--batch-size", type=int, default=50, help="Number of documents to index per batch" ) parser.add_argument( "--recreate", action="store_true", help="Delete and recreate the collection" ) parser.add_argument( "--dry-run", action="store_true", help="Parse files but don't index" ) parser.add_argument( "--https", action="store_true", help="Use HTTPS for connection" ) parser.add_argument( "--prefix", default=None, help="URL path prefix (e.g., 'qdrant' for /qdrant/*)" ) args = parser.parse_args() # Check data directory exists if not args.data_dir.exists(): logger.error(f"Data directory not found: {args.data_dir}") sys.exit(1) # Find person files logger.info(f"Scanning for person files in {args.data_dir}") files = find_person_files(args.data_dir) logger.info(f"Found {len(files)} person files") if not files: logger.warning("No person files found") sys.exit(0) # Prepare documents documents = [] for filepath in files: data = load_json_file(filepath) if not data: continue text = extract_person_text(data) if not text or len(text) < 20: logger.debug(f"Skipping {filepath.name}: insufficient text") continue metadata = extract_metadata(data, filepath) documents.append({ "text": text, "metadata": metadata, }) logger.info(f"Prepared {len(documents)} documents for indexing") if args.dry_run: logger.info("Dry run - not indexing") for doc in documents[:5]: logger.info(f" - {doc['metadata'].get('name', 'Unknown')}: {len(doc['text'])} chars") logger.info(f" Metadata: {list(doc['metadata'].keys())}") sys.exit(0) # Check for OpenAI API key if not os.getenv("OPENAI_API_KEY"): logger.error("OPENAI_API_KEY environment variable is required for embeddings") sys.exit(1) # Create retriever if args.url: logger.info(f"Connecting to Qdrant at {args.url}") retriever = PersonRetriever(url=args.url, collection_name=args.collection) elif args.https or args.prefix: prefix_str = f"/{args.prefix}" if args.prefix else "" logger.info(f"Connecting to Qdrant at https://{args.host}:{args.port}{prefix_str}") retriever = PersonRetriever( host=args.host, port=args.port, collection_name=args.collection, https=args.https, prefix=args.prefix, ) else: logger.info(f"Connecting to Qdrant at {args.host}:{args.port}") retriever = PersonRetriever( host=args.host, port=args.port, collection_name=args.collection, ) # Optionally recreate collection if args.recreate: logger.warning(f"Deleting collection: {args.collection}") retriever.delete_collection() # Index documents logger.info(f"Indexing {len(documents)} documents...") indexed = retriever.add_documents(documents, batch_size=args.batch_size) # Report results info = retriever.get_collection_info() logger.info("Indexing complete!") logger.info(f" Documents indexed: {indexed}") logger.info(f" Collection status: {info.get('status', 'unknown')}") logger.info(f" Total vectors: {info.get('vectors_count', 0)}") if __name__ == "__main__": main()