# glam/scripts/index_persons_qdrant.py
# Snapshot metadata: 2025-12-26 21:41:18 +01:00 — 766 lines, 26 KiB, Python
#!/usr/bin/env python3
"""
Index Heritage Persons in Qdrant
This script reads person entity JSON files and indexes them in Qdrant
for semantic search and RAG-enhanced queries about heritage sector professionals.
Usage:
python scripts/index_persons_qdrant.py [--data-dir DATA_DIR] [--host HOST] [--port PORT]
Examples:
# Index all persons from default data directory
python scripts/index_persons_qdrant.py
# Index from specific directory
python scripts/index_persons_qdrant.py --data-dir data/custodian/person/entity/
# Connect to remote Qdrant
python scripts/index_persons_qdrant.py --host 91.98.224.44 --port 6333
"""
import argparse
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Any
# Add project root to path so modules under src/ are importable when run as a script
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
# Timestamped INFO-level console logging for progress reporting
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)
def load_json_file(filepath: Path) -> dict[str, Any] | None:
"""Load a JSON file and return its contents."""
try:
with open(filepath, "r", encoding="utf-8") as f:
return json.load(f)
except Exception as e:
logger.warning(f"Failed to load {filepath}: {e}")
return None
def extract_person_text(data: dict[str, Any]) -> str:
    """Build the searchable text blob for a person entity.

    Concatenates the most identifying fields (name, role, location, summary,
    experience, affiliations, education, skills, languages, assessments) into
    a newline-joined string that is embedded for semantic search. Long
    free-text sections are truncated to keep the embedding focused.

    Args:
        data: Parsed person entity JSON document.

    Returns:
        Newline-joined summary text; empty string when nothing usable exists.
    """
    parts: list[str] = []
    # Guard with `or {}` / `or []` so explicit JSON nulls don't crash lookups.
    profile = data.get("profile_data") or {}
    person = data.get("person") or {}
    source_staff = data.get("source_staff_info") or {}
    extraction = data.get("extraction_metadata") or {}
    # Full name - check ALL possible locations in order of priority
    name = (
        profile.get("full_name") or
        profile.get("name") or
        person.get("full_name") or
        person.get("name") or
        source_staff.get("name") or
        source_staff.get("person_name") or
        extraction.get("person_name") or
        data.get("name") or
        ""
    )
    if name:
        parts.append(f"Name: {name}")
    # Headline / current position
    headline = profile.get("headline", "")
    if headline:
        parts.append(f"Role: {headline}")
    # Location
    location = profile.get("location", "")
    if location:
        parts.append(f"Location: {location}")
    # About / summary
    about = profile.get("about", "")
    if about:
        parts.append(about[:500])  # Truncate long about sections
    # Current position details
    current_pos = profile.get("current_position", {})
    if current_pos and isinstance(current_pos, dict):
        company = current_pos.get("company", "")
        title = current_pos.get("title", "")
        if company:
            parts.append(f"Currently at: {company}")
        if title and title != headline:
            parts.append(f"Current title: {title}")
    # Career history - focus on heritage-relevant positions
    heritage_exp = profile.get("heritage_relevant_experience") or []
    for exp in heritage_exp[:5]:  # Limit to 5 most relevant
        if isinstance(exp, dict):
            title = exp.get("title", "")
            company = exp.get("company", "")
            note = exp.get("relevance_note", "")
            if title and company:
                parts.append(f"Experience: {title} at {company}")
            if note:
                parts.append(note)
    # If no heritage-specific experience, use general career history
    if not heritage_exp:
        career = profile.get("career_history") or []
        for job in career[:5]:
            if isinstance(job, dict):
                title = job.get("title", "")
                company = job.get("company", "")
                if title and company:
                    parts.append(f"Experience: {title} at {company}")
    # Custodian affiliations - accept both the current key ("affiliations")
    # and the legacy key ("custodian_affiliations"), same as extract_metadata
    affiliations = data.get("affiliations") or data.get("custodian_affiliations") or []
    aff_names = []
    for aff in affiliations[:5]:
        if isinstance(aff, dict):
            aff_name = aff.get("custodian_name", "")
            current = aff.get("current", False)
            if aff_name:
                status = "(current)" if current else "(past)"
                aff_names.append(f"{aff_name} {status}")
    if aff_names:
        parts.append(f"Affiliations: {', '.join(aff_names)}")
    # Education
    education = profile.get("education") or []
    for edu in education[:3]:
        if isinstance(edu, dict):
            degree = edu.get("degree", "")
            institution = edu.get("institution", "")
            field = edu.get("field", "")
            if institution:
                edu_str = institution
                if degree:
                    edu_str = f"{degree} from {institution}"
                if field:
                    edu_str += f" ({field})"
                parts.append(f"Education: {edu_str}")
    # Skills
    skills = profile.get("skills") or []
    if skills:
        parts.append(f"Skills: {', '.join(skills[:15])}")
    # Languages - entries may be plain strings or dicts with name/language keys
    languages = profile.get("languages") or []
    lang_strs = []
    for lang in languages:
        if isinstance(lang, str):
            lang_strs.append(lang)
        elif isinstance(lang, dict):
            lang_strs.append(lang.get("name", lang.get("language", str(lang))))
    if lang_strs:
        parts.append(f"Languages: {', '.join(lang_strs)}")
    # Network analysis notes
    network = data.get("network_analysis") or {}
    notes = network.get("notes", "")
    if notes:
        parts.append(notes[:300])
    # Heritage sector assessment
    assessment = data.get("heritage_sector_assessment") or {}
    classification = assessment.get("sector_classification", "")
    leadership = assessment.get("leadership_level", "")
    if classification:
        parts.append(f"Sector: {classification}")
    if leadership:
        parts.append(f"Leadership level: {leadership}")
    return "\n".join(parts)
def calculate_richness_score(data: dict[str, Any]) -> float:
    """Calculate a metadata richness score (0.0 - 1.0) for a person profile.

    Used to boost search results for profiles with more complete data:
    profiles with an about section, career history, skills and education
    should rank above sparse profiles that carry only a name and headline.

    Weighted components (capped at 1.0):
    full name / headline / location 0.05 each; about >100 chars 0.15 plus
    0.05 bonus >300 chars; career history 0.10 plus 0.10 bonus at 4+ jobs;
    skills 0.10 plus 0.05 bonus at 6+; education 0.10; languages 0.05;
    heritage-relevant experience 0.10; LinkedIn URL 0.05.
    """
    profile = data.get("profile_data", {})
    about_len = len(profile.get("about", "") or "")
    n_career = len(profile.get("career_history", []) or [])
    n_skills = len(profile.get("skills", []) or [])
    n_edu = len(profile.get("education", []) or [])
    n_langs = len(profile.get("languages", []) or [])
    n_heritage = len(profile.get("heritage_relevant_experience", []) or [])
    has_linkedin = bool(
        data.get("linkedin_profile_url")
        or data.get("extraction_metadata", {}).get("linkedin_url")
    )
    # (condition, weight) pairs, summed in declaration order.
    components = (
        (bool(profile.get("full_name")), 0.05),
        (bool(profile.get("headline")), 0.05),
        (bool(profile.get("location")), 0.05),
        (about_len > 100, 0.15),   # about section: most important for context
        (about_len > 300, 0.05),
        (n_career >= 1, 0.10),
        (n_career >= 4, 0.10),
        (n_skills >= 1, 0.10),
        (n_skills >= 6, 0.05),
        (n_edu >= 1, 0.10),
        (n_langs >= 1, 0.05),
        (n_heritage >= 1, 0.10),   # important for domain relevance
        (has_linkedin, 0.05),      # indicates verifiable profile
    )
    score = 0.0
    for hit, weight in components:
        if hit:
            score += weight
    return min(score, 1.0)  # Cap at 1.0
def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]:
    """Extract metadata for filtering from person data.

    Args:
        data: Parsed person entity JSON document.
        filepath: Source file path (only the filename is stored).

    Returns:
        Flat payload dict used for Qdrant filtering and ranking
        (name, linkedin info, location, company, heritage flags,
        custodian affiliations, richness_score).
    """
    metadata: dict[str, Any] = {
        "filename": filepath.name,
        "type": "person",
    }
    # Guard with `or {}` so explicit JSON nulls don't crash lookups.
    profile = data.get("profile_data") or {}
    person = data.get("person") or {}
    source_staff = data.get("source_staff_info") or {}
    extraction = data.get("extraction_metadata") or {}
    # Full name - check ALL possible field names (same as extract_person_text)
    name = (
        profile.get("full_name") or
        profile.get("name") or
        person.get("full_name") or
        person.get("name") or
        source_staff.get("name") or
        source_staff.get("person_name") or
        extraction.get("person_name") or
        data.get("name") or
        ""
    )
    if name:
        metadata["name"] = name
    # LinkedIn URL - check multiple locations
    linkedin_url = (
        data.get("linkedin_profile_url", "") or
        profile.get("linkedin_url", "") or
        extraction.get("linkedin_url", "")
    )
    if linkedin_url:
        metadata["linkedin_url"] = linkedin_url
        # Extract slug from URL
        if "/in/" in linkedin_url:
            metadata["linkedin_slug"] = linkedin_url.split("/in/")[-1].rstrip("/")
    # Current position/headline
    headline = profile.get("headline", "")
    if headline:
        metadata["headline"] = headline
    # Location
    location = profile.get("location", "")
    if location:
        metadata["location"] = location
        # Try to extract city/country.
        # NOTE(review): country is only derived for 3+ comma-separated parts
        # ("City, Province, Country"); a 2-part "City, Country" yields city
        # only - confirm this is intended.
        if ", " in location:
            parts = [p.strip() for p in location.split(",")]
            metadata["city"] = parts[0]
            if len(parts) >= 3:
                metadata["country"] = parts[-1]
    # Current company - from current_position or experience
    current_pos = profile.get("current_position") or {}
    if isinstance(current_pos, dict):
        company = current_pos.get("company", "")
        if company:
            metadata["current_company"] = company
    # If no current_position, try to find current job in experience.
    # NOTE(review): this reads "experience" while other helpers read
    # "career_history" - verify which key the profiles actually carry.
    if "current_company" not in metadata:
        for exp in profile.get("experience") or []:
            if isinstance(exp, dict) and exp.get("current"):
                company = exp.get("company", "")
                if company:
                    metadata["current_company"] = company
                    break
    # Heritage relevance - check heritage_relevance section
    heritage_rel = data.get("heritage_relevance") or {}
    if heritage_rel:
        metadata["heritage_relevant"] = heritage_rel.get("is_heritage_relevant", False)
        # Primary heritage type
        primary_type = heritage_rel.get("primary_heritage_type", "")
        if primary_type:
            metadata["heritage_type"] = primary_type
    # Heritage institution types worked at (from experience)
    heritage_exp = profile.get("heritage_relevant_experience") or []
    heritage_types = set()
    for exp in heritage_exp:
        if isinstance(exp, dict):
            h_type = exp.get("heritage_type", "")
            if h_type:
                heritage_types.add(h_type)
    if heritage_types:
        metadata["heritage_types"] = list(heritage_types)
        # If heritage_relevant not set but has heritage experience, mark as relevant
        if "heritage_relevant" not in metadata:
            metadata["heritage_relevant"] = True
    # Auto-detect heritage relevance from headline/company if not explicitly set
    if "heritage_relevant" not in metadata:
        heritage_keywords = [
            "museum", "archief", "archive", "bibliotheek", "library",
            "erfgoed", "heritage", "collectie", "collection", "curator",
            "archivist", "conservator", "nationaal archief", "rijksmuseum",
            "rijksarchief", "digitaal erfgoed", "digital heritage",
            "cultureel erfgoed", "cultural heritage"
        ]
        text_to_check = f"{headline} {metadata.get('current_company', '')}".lower()
        if any(kw in text_to_check for kw in heritage_keywords):
            metadata["heritage_relevant"] = True
    # Default heritage_relevant to False if still not set
    metadata.setdefault("heritage_relevant", False)
    # Current affiliations and custodian information for filtering
    # Try both "affiliations" (correct key) and "custodian_affiliations" (legacy key)
    affiliations = data.get("affiliations") or data.get("custodian_affiliations") or []
    current_affiliations = []
    custodian_slugs = []
    custodian_names = []
    for aff in affiliations:
        if isinstance(aff, dict):
            custodian_name = aff.get("custodian_name", "")
            custodian_slug = aff.get("custodian_slug", "")
            if custodian_name:
                custodian_names.append(custodian_name)
                if aff.get("current", False):
                    current_affiliations.append(custodian_name)
            if custodian_slug:
                custodian_slugs.append(custodian_slug)
    # Also check source_staff_info for custodian data
    staff_custodian = source_staff.get("custodian", "")
    staff_slug = source_staff.get("custodian_slug", "")
    if staff_custodian and staff_custodian not in custodian_names:
        custodian_names.append(staff_custodian)
    if staff_slug and staff_slug not in custodian_slugs:
        custodian_slugs.append(staff_slug)
    # Store all custodian information for filtering
    if current_affiliations:
        metadata["current_affiliations"] = current_affiliations
    if custodian_names:
        metadata["custodian_names"] = custodian_names
        # Also store primary custodian (first one, usually current)
        metadata["custodian_name"] = custodian_names[0]
    if custodian_slugs:
        metadata["custodian_slugs"] = custodian_slugs
        # Also store primary custodian slug for simple filtering
        metadata["custodian_slug"] = custodian_slugs[0]
    # Generate custodian_slug from custodian_name if not available
    # This allows text-based filtering when slug is missing
    if "custodian_slug" not in metadata and custodian_names:
        # Create a simple slug from the name (lowercase, hyphenated)
        slug = re.sub(r'[^a-z0-9\s-]', '', custodian_names[0].lower())
        slug = re.sub(r'[\s_]+', '-', slug)
        slug = re.sub(r'-+', '-', slug).strip('-')
        if slug:
            metadata["custodian_slug"] = slug
    # Extraction metadata
    extraction_date = extraction.get("extraction_date", "")
    if extraction_date:
        metadata["extraction_date"] = extraction_date
    # Calculate richness score for search ranking
    metadata["richness_score"] = calculate_richness_score(data)
    return metadata
def find_person_files(data_dir: Path) -> list[Path]:
    """Return sorted person JSON files in *data_dir*, skipping non-person files."""
    # Filename fragments that identify non-person files to exclude.
    skip_tokens = (
        "_schema",
        "_config",
        "_template",
        "test_",
        "example_",
        ".DS_Store",
        "_connections_",  # Connection files, not person profiles
        "_staff_",  # Staff list aggregates, not individual profiles
    )
    return sorted(
        path
        for path in data_dir.glob("*.json")
        if not any(token in path.name for token in skip_tokens)
    )
class PersonRetriever:
    """Qdrant retriever specifically for person entities.

    Uses MiniLM (384-dim) embeddings by default for consistency with
    the hybrid_retriever.py query-time embedding model. The embedding
    model is loaded lazily on first use; the Qdrant client connects in
    ``__init__``.
    """

    def __init__(
        self,
        host: str = "localhost",
        port: int = 6333,
        collection_name: str = "heritage_persons",
        embedding_model: str = "all-MiniLM-L6-v2",  # MiniLM for local embeddings
        embedding_dim: int = 384,  # MiniLM output dimension
        url: str | None = None,
        https: bool = False,
        prefix: str | None = None,
    ):
        """Create the Qdrant client.

        Args:
            host: Qdrant hostname (ignored when ``url`` is given).
            port: REST port; 443 implies HTTPS.
            collection_name: Target collection name.
            embedding_model: SentenceTransformer model name (lazy-loaded).
            embedding_dim: Vector size produced by the embedding model.
            url: Full Qdrant URL; overrides host/port when set.
            https: Force HTTPS for host/port connections.
            prefix: URL path prefix (e.g. "qdrant" for /qdrant/*).
        """
        from qdrant_client import QdrantClient
        self.collection_name = collection_name
        self.embedding_model = embedding_model
        self.embedding_dim = embedding_dim
        # MiniLM model runs locally, no API key needed
        # Initialize Qdrant client
        if url:
            self.client = QdrantClient(url=url, prefer_grpc=False, timeout=60)
        elif https or port == 443:
            self.client = QdrantClient(
                host=host,
                port=port,
                https=True,
                prefix=prefix,
                prefer_grpc=False,
                timeout=60
            )
        else:
            self.client = QdrantClient(host=host, port=port, timeout=60)
        self._sentence_model = None

    @property
    def sentence_model(self):
        """Lazy-load and cache the SentenceTransformer model."""
        if self._sentence_model is None:
            from sentence_transformers import SentenceTransformer
            logger.info(f"Loading embedding model: {self.embedding_model}")
            self._sentence_model = SentenceTransformer(self.embedding_model)
        return self._sentence_model

    def _get_embeddings_batch(self, texts: list[str]) -> list[list[float]]:
        """Get embedding vectors for multiple texts using MiniLM."""
        if not texts:
            return []
        embeddings = self.sentence_model.encode(texts, show_progress_bar=False)
        return embeddings.tolist()

    def ensure_collection(self) -> None:
        """Ensure the collection exists, create if not."""
        from qdrant_client.http.models import Distance, VectorParams
        collections = self.client.get_collections().collections
        collection_names = [c.name for c in collections]
        if self.collection_name not in collection_names:
            logger.info(f"Creating collection: {self.collection_name}")
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.embedding_dim,
                    distance=Distance.COSINE
                )
            )

    def delete_collection(self) -> None:
        """Delete the collection if it exists; log a warning on failure."""
        try:
            self.client.delete_collection(self.collection_name)
            logger.info(f"Deleted collection: {self.collection_name}")
        except Exception as e:
            logger.warning(f"Could not delete collection: {e}")

    def add_documents(
        self,
        documents: list[dict[str, Any]],
        batch_size: int = 50
    ) -> int:
        """Embed and upsert documents into the collection.

        Args:
            documents: Dicts with a "text" key and an optional "metadata"
                dict; entries without text are skipped.
            batch_size: Documents embedded/upserted per round trip.

        Returns:
            Number of points actually indexed.
        """
        import hashlib
        from qdrant_client.http import models
        self.ensure_collection()
        valid_docs = [d for d in documents if d.get("text")]
        total_indexed = 0
        for i in range(0, len(valid_docs), batch_size):
            batch = valid_docs[i:i + batch_size]
            texts = [d["text"] for d in batch]
            logger.info(f"Embedding batch {i//batch_size + 1}/{(len(valid_docs) + batch_size - 1)//batch_size} ({len(batch)} docs)")
            embeddings = self._get_embeddings_batch(texts)
            points = []
            for doc, embedding in zip(batch, embeddings):
                # Deterministic ID from the text so re-runs upsert in place.
                # NOTE(review): a 32-hex md5 digest is accepted as a UUID
                # string by Qdrant; keep this format unchanged or existing
                # points will be duplicated under new IDs.
                doc_id = hashlib.md5(doc["text"].encode()).hexdigest()
                points.append(models.PointStruct(
                    id=doc_id,
                    vector=embedding,
                    payload={
                        "text": doc["text"],
                        **doc.get("metadata", {})
                    }
                ))
            self.client.upsert(
                collection_name=self.collection_name,
                points=points
            )
            total_indexed += len(points)
            logger.info(f"Indexed {total_indexed}/{len(valid_docs)} documents")
        return total_indexed

    def get_collection_info(self) -> dict[str, Any]:
        """Get collection status/counts, or {"error": ...} on failure."""
        try:
            info = self.client.get_collection(self.collection_name)
            return {
                "status": info.status,
                "vectors_count": getattr(info, "vectors_count", None) or getattr(info, "points_count", 0),
                "points_count": getattr(info, "points_count", 0),
            }
        except Exception as e:
            return {"error": str(e)}
def main():
    """CLI entry point: scan person JSON files, embed them, and index in Qdrant."""
    parser = argparse.ArgumentParser(
        description="Index heritage persons in Qdrant for semantic search"
    )
    # (flag, add_argument kwargs) pairs, registered in order below.
    specs = [
        ("--data-dir", dict(
            type=Path,
            default=PROJECT_ROOT / "data" / "custodian" / "person" / "entity",
            help="Directory containing person JSON files",
        )),
        ("--host", dict(
            default=os.getenv("QDRANT_HOST", "localhost"),
            help="Qdrant server hostname",
        )),
        ("--port", dict(
            type=int,
            default=int(os.getenv("QDRANT_PORT", "6333")),
            help="Qdrant REST API port",
        )),
        ("--url", dict(
            default=os.getenv("QDRANT_URL", ""),
            help="Full Qdrant URL. Overrides host/port.",
        )),
        ("--collection", dict(
            default="heritage_persons",
            help="Qdrant collection name",
        )),
        ("--batch-size", dict(
            type=int,
            default=50,
            help="Number of documents to index per batch",
        )),
        ("--recreate", dict(
            action="store_true",
            help="Delete and recreate the collection",
        )),
        ("--dry-run", dict(
            action="store_true",
            help="Parse files but don't index",
        )),
        ("--https", dict(
            action="store_true",
            help="Use HTTPS for connection",
        )),
        ("--prefix", dict(
            default=None,
            help="URL path prefix (e.g., 'qdrant' for /qdrant/*)",
        )),
    ]
    for flag, kwargs in specs:
        parser.add_argument(flag, **kwargs)
    opts = parser.parse_args()

    # Check data directory exists
    if not opts.data_dir.exists():
        logger.error(f"Data directory not found: {opts.data_dir}")
        sys.exit(1)

    # Find person files
    logger.info(f"Scanning for person files in {opts.data_dir}")
    person_files = find_person_files(opts.data_dir)
    logger.info(f"Found {len(person_files)} person files")
    if not person_files:
        logger.warning("No person files found")
        sys.exit(0)

    # Prepare documents: load each file, extract text + metadata, skip sparse ones
    docs = []
    for path in person_files:
        record = load_json_file(path)
        if not record:
            continue
        text = extract_person_text(record)
        if not text or len(text) < 20:
            logger.debug(f"Skipping {path.name}: insufficient text")
            continue
        docs.append({
            "text": text,
            "metadata": extract_metadata(record, path),
        })
    logger.info(f"Prepared {len(docs)} documents for indexing")

    if opts.dry_run:
        logger.info("Dry run - not indexing")
        for doc in docs[:5]:
            logger.info(f" - {doc['metadata'].get('name', 'Unknown')}: {len(doc['text'])} chars")
            logger.info(f" Metadata: {list(doc['metadata'].keys())}")
        sys.exit(0)

    # Note: MiniLM model runs locally, no API key needed
    # Create retriever - explicit URL wins, then HTTPS/prefix, then plain host:port
    if opts.url:
        logger.info(f"Connecting to Qdrant at {opts.url}")
        retriever = PersonRetriever(url=opts.url, collection_name=opts.collection)
    elif opts.https or opts.prefix:
        prefix_str = f"/{opts.prefix}" if opts.prefix else ""
        logger.info(f"Connecting to Qdrant at https://{opts.host}:{opts.port}{prefix_str}")
        retriever = PersonRetriever(
            host=opts.host,
            port=opts.port,
            collection_name=opts.collection,
            https=opts.https,
            prefix=opts.prefix,
        )
    else:
        logger.info(f"Connecting to Qdrant at {opts.host}:{opts.port}")
        retriever = PersonRetriever(
            host=opts.host,
            port=opts.port,
            collection_name=opts.collection,
        )

    # Optionally recreate collection
    if opts.recreate:
        logger.warning(f"Deleting collection: {opts.collection}")
        retriever.delete_collection()

    # Index documents
    logger.info(f"Indexing {len(docs)} documents...")
    indexed = retriever.add_documents(docs, batch_size=opts.batch_size)

    # Report results
    info = retriever.get_collection_info()
    logger.info("Indexing complete!")
    logger.info(f" Documents indexed: {indexed}")
    logger.info(f" Collection status: {info.get('status', 'unknown')}")
    logger.info(f" Total vectors: {info.get('vectors_count', 0)}")


if __name__ == "__main__":
    main()