#!/usr/bin/env python3
"""
Qdrant Person Sync Module - Sync person/staff JSON files to Qdrant vector database.

This module syncs all person data (staff lists and entity profiles) to Qdrant
for semantic search of heritage institution personnel.

Usage:
    python -m scripts.sync.qdrant_person_sync [--dry-run] [--limit N] [--host HOST] [--port PORT]

Data Sources:
- data/custodian/person/bu/*.json - Staff lists by custodian organization
- data/custodian/person/entity/*.json - Individual person profile files
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
import hashlib
|
|
|
|
# Add project root to path
|
|
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
|
sys.path.insert(0, str(PROJECT_ROOT))
|
|
|
|
from scripts.sync import BaseSyncer, SyncResult, SyncStatus
|
|
|
|
# Configuration
# Qdrant connection settings; each can be overridden via environment variables.
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
# Full URL takes precedence over host/port when non-empty (see _get_client).
QDRANT_URL = os.getenv("QDRANT_URL", "")
# Target Qdrant collection for person vectors.
COLLECTION_NAME = "heritage_persons"
# Number of documents embedded and upserted per batch.
BATCH_SIZE = 100

# Person data directory (contains bu/ staff lists and entity/ profiles)
PERSON_DATA_DIR = PROJECT_ROOT / "data" / "custodian" / "person"
|
|
|
|
|
|
def generate_person_id(name: str, custodian_slug: str, staff_id: str = "") -> str:
    """Return a stable unique ID for a person record.

    An explicit staff_id wins; otherwise a deterministic 16-character
    hash is derived from the custodian slug and person name, so repeated
    runs yield the same identifier for the same person.
    """
    if staff_id:
        return staff_id
    # Non-cryptographic use: md5 is only a compact deterministic digest here.
    key = f"{custodian_slug}:{name}".lower()
    return hashlib.md5(key.encode()).hexdigest()[:16]
|
|
|
|
|
|
def extract_person_text_from_staff(person: dict, custodian_name: str, custodian_slug: str) -> str:
    """Build the searchable text blob for one staff-list record.

    Emits newline-joined "Label: value" lines for name, employer, role,
    and heritage sector. Empty fields are omitted.
    """
    # Single-letter heritage-type codes → human-readable sector names.
    type_names = {
        "A": "Archive",
        "M": "Museum",
        "L": "Library",
        "G": "Gallery",
        "R": "Research center",
        "E": "Education",
        "D": "Digital platform",
        "O": "Official institution",
    }

    lines: list[str] = []

    name = person.get("name", "")
    if name:
        lines.append(f"Name: {name}")

    if custodian_name:
        lines.append(f"Works at: {custodian_name}")

    headline = person.get("headline", "")
    if headline:
        lines.append(f"Role: {headline}")

    heritage_type = person.get("heritage_type", "")
    if heritage_type:
        # Unknown codes fall back to the raw code itself.
        lines.append(f"Sector: {type_names.get(heritage_type, heritage_type)}")

    return "\n".join(lines)
|
|
|
|
|
|
def extract_person_text_from_entity(profile: dict) -> str:
    """Extract searchable text from an entity profile.

    Pulls name, headline, location, a truncated about section, the top
    experience entries, heritage-relevant experience, and skills out of
    the profile's ``profile_data`` payload.

    Args:
        profile: Parsed entity-profile JSON (expects a ``profile_data`` dict).

    Returns:
        Newline-joined "Label: value" lines; empty string when nothing usable.
    """
    parts = []

    profile_data = profile.get("profile_data", {})

    # Name
    name = profile_data.get("name") or profile_data.get("full_name", "")
    if name:
        parts.append(f"Name: {name}")

    # Headline/role
    headline = profile_data.get("headline", "")
    if headline:
        parts.append(f"Role: {headline}")

    # Location
    location = profile_data.get("location", "")
    if location:
        parts.append(f"Location: {location}")

    # About/summary
    about = profile_data.get("about") or profile_data.get("summary", "")
    if about:
        # Cap very long about sections to keep embedding input focused.
        parts.append(f"About: {about[:500]}")

    # Experience - extract current/recent positions (top 3)
    experience = profile_data.get("experience", []) or profile_data.get("career_history", [])
    if experience:
        for exp in experience[:3]:
            if not isinstance(exp, dict):
                continue
            title = exp.get("title") or exp.get("role", "")
            company = exp.get("company") or exp.get("organization", "")
            if title and company:
                parts.append(f"Experience: {title} at {company}")
            elif title:
                parts.append(f"Experience: {title}")

    # Heritage-relevant experience
    heritage_exp = profile_data.get("heritage_relevant_experience", [])
    if heritage_exp:
        parts.append("Heritage sector experience:")
        for exp in heritage_exp[:3]:
            if isinstance(exp, dict):
                title = exp.get("title") or exp.get("role", "")
                company = exp.get("company") or exp.get("organization", "")
                if title and company:
                    parts.append(f"  - {title} at {company}")

    # Skills (top 10; entries may be plain strings or {"name"/"skill": ...} dicts)
    skills = profile_data.get("skills", [])
    if skills and isinstance(skills, list):
        skill_names = []
        for s in skills[:10]:
            if isinstance(s, str):
                skill_names.append(s)
            elif isinstance(s, dict):
                skill_names.append(s.get("name", s.get("skill", "")))
        # Drop empty entries so we never emit "Skills: a, , b" or a
        # "Skills:" line consisting only of commas.
        skill_names = [sn for sn in skill_names if sn]
        if skill_names:
            parts.append(f"Skills: {', '.join(skill_names)}")

    return "\n".join(parts)
|
|
|
|
|
|
def extract_person_metadata_from_staff(
    person: dict,
    custodian_name: str,
    custodian_slug: str,
    source_file: str,
) -> dict:
    """Build the filterable payload for a staff-list record.

    Source/custodian fields and ``heritage_relevant`` are always present;
    all other person fields are stored only when non-empty.
    """
    metadata = {
        "source_type": "staff_list",
        "source_file": source_file,
        "custodian_name": custodian_name,
        "custodian_slug": custodian_slug,
    }

    # Optional string fields, inserted in the same order the payload
    # has always used; empty values are skipped.
    for out_key, src_key in (("name", "name"), ("staff_id", "staff_id"), ("headline", "headline")):
        value = person.get(src_key, "")
        if value:
            metadata[out_key] = value

    # Always stored, defaulting to False, so consumers can filter on it.
    metadata["heritage_relevant"] = person.get("heritage_relevant", False)

    for out_key, src_key in (("heritage_type", "heritage_type"), ("connection_degree", "degree")):
        value = person.get(src_key, "")
        if value:
            metadata[out_key] = value

    return metadata
|
|
|
|
|
|
def extract_person_metadata_from_entity(profile: dict, source_file: str) -> dict:
    """Build the filterable payload for an entity profile.

    Source fields are always present; profile fields are stored only
    when non-empty. ``current_organization`` comes from the first
    experience entry when one exists.
    """
    profile_data = profile.get("profile_data", {})
    extraction_meta = profile.get("extraction_metadata", {})

    metadata = {
        "source_type": "entity_profile",
        "source_file": source_file,
    }

    # Candidate fields in stable payload order; empty values are skipped.
    candidates = (
        ("name", profile_data.get("name") or profile_data.get("full_name", "")),
        ("linkedin_url", profile_data.get("linkedin_url") or extraction_meta.get("linkedin_url", "")),
        ("staff_id", extraction_meta.get("staff_id", "")),
        ("headline", profile_data.get("headline", "")),
        ("location", profile_data.get("location", "")),
    )
    for key, value in candidates:
        if value:
            metadata[key] = value

    # Most-recent employer, when the profile carries an experience list.
    experience = profile_data.get("experience", []) or profile_data.get("career_history", [])
    if isinstance(experience, list) and experience:
        latest = experience[0]
        if isinstance(latest, dict):
            company = latest.get("company") or latest.get("organization", "")
            if company:
                metadata["current_organization"] = company

    return metadata
|
|
|
|
|
|
class QdrantPersonSyncer(BaseSyncer):
    """Sync person/staff JSON files to Qdrant vector database.

    Reads staff-list files (``bu/*_staff_*.json``) and entity-profile files
    (``entity/*.json``), turns each person into a text document plus a
    filterable payload, embeds the text (OpenAI when OPENAI_API_KEY is set,
    sentence-transformers otherwise) and upserts the points into Qdrant.
    """

    database_name = "qdrant_persons"

    def __init__(
        self,
        host: str = QDRANT_HOST,
        port: int = QDRANT_PORT,
        url: str = QDRANT_URL,
        collection: str = COLLECTION_NAME,
        batch_size: int = BATCH_SIZE,
        person_dir: Path = PERSON_DATA_DIR,
        **kwargs
    ):
        """Store connection/collection settings; no I/O happens here.

        Args:
            host: Qdrant hostname (used when ``url`` is empty).
            port: Qdrant REST port (used when ``url`` is empty).
            url: Full Qdrant URL; overrides host/port when non-empty.
            collection: Target collection name.
            batch_size: Documents embedded/upserted per batch.
            person_dir: Root directory holding ``bu/`` and ``entity/``.
            **kwargs: Passed through to BaseSyncer.
        """
        super().__init__(**kwargs)
        self.host = host
        self.port = port
        self.url = url
        self.collection = collection
        self.batch_size = batch_size
        self.person_dir = person_dir
        # Lazily-initialized resources; created on first use.
        self._retriever = None
        self._client = None
        self._embedding_model = None
        self._embedding_dim = None  # set alongside the embedding model
        self._use_openai = None

    def _get_client(self):
        """Lazy-load the Qdrant client (lightweight, for connection checks)."""
        if self._client is None:
            try:
                from qdrant_client import QdrantClient

                # A full URL takes precedence over host/port.
                if self.url:
                    self._client = QdrantClient(url=self.url, timeout=10, check_compatibility=False)
                else:
                    self._client = QdrantClient(
                        host=self.host,
                        port=self.port,
                        timeout=10,
                        check_compatibility=False,
                    )
            except ImportError as e:
                self.logger.error(f"Cannot import qdrant_client: {e}")
                raise
        return self._client

    def _get_embedding_model(self):
        """Get the embedding model (sentence-transformers or OpenAI wrapper).

        Side effect: sets ``_embedding_dim`` (384 or 1536) and
        ``_use_openai`` on first call.
        """
        if self._embedding_model is None:
            has_openai = bool(os.getenv("OPENAI_API_KEY"))

            if not has_openai:
                # Local model: 384-dim MiniLM embeddings.
                from sentence_transformers import SentenceTransformer
                self._embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
                self._embedding_dim = 384
                self._use_openai = False
                self.logger.info("Using sentence-transformers for embeddings")
            else:
                # OpenAI text-embedding-3-small: 1536-dim embeddings.
                import openai
                self._embedding_model = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
                self._embedding_dim = 1536
                self._use_openai = True
                self.logger.info("Using OpenAI for embeddings")

        return self._embedding_model

    def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts; returns one vector per input, in order."""
        model = self._get_embedding_model()

        if self._use_openai:
            response = model.embeddings.create(
                input=texts,
                model="text-embedding-3-small"
            )
            # Sort by item index so vectors align with the input order.
            return [item.embedding for item in sorted(response.data, key=lambda x: x.index)]
        else:
            # Sentence-transformers
            embeddings = model.encode(texts, show_progress_bar=False)
            return embeddings.tolist()

    def _ensure_collection(self, recreate: bool = False):
        """Ensure the collection exists with correct settings.

        Args:
            recreate: Drop and recreate the collection if it already exists.
        """
        from qdrant_client.http.models import Distance, VectorParams

        client = self._get_client()
        _ = self._get_embedding_model()  # Initialize to get embedding_dim
        assert self._embedding_dim is not None, "Embedding dimension not set"

        collections = client.get_collections().collections
        collection_names = [c.name for c in collections]

        if recreate and self.collection in collection_names:
            self.logger.info(f"Deleting collection: {self.collection}")
            client.delete_collection(self.collection)
            collection_names.remove(self.collection)

        if self.collection not in collection_names:
            self.logger.info(f"Creating collection: {self.collection} (dim={self._embedding_dim})")
            client.create_collection(
                collection_name=self.collection,
                vectors_config=VectorParams(
                    size=self._embedding_dim,
                    distance=Distance.COSINE
                )
            )

    def _index_documents(self, documents: list[dict]) -> int:
        """Index documents to Qdrant with embeddings.

        Each document is a dict with ``text`` (embedded) and ``metadata``
        (stored as the point payload). Point IDs are deterministic UUIDv5
        hashes of the text, so re-syncing the same document upserts in place.

        Returns:
            Number of points actually upserted.
        """
        from qdrant_client.http.models import PointStruct
        import uuid

        client = self._get_client()
        total_indexed = 0

        # Process in batches
        for i in range(0, len(documents), self.batch_size):
            # BUGFIX: filter text-less docs from the batch itself, not just
            # from the texts list — otherwise zip(batch, embeddings) pairs
            # documents with the wrong vectors whenever a doc has no text.
            batch = [d for d in documents[i:i + self.batch_size] if d.get("text")]

            if not batch:
                continue

            # Get embeddings (one per remaining document, in order)
            texts = [d["text"] for d in batch]
            embeddings = self._get_embeddings(texts)

            # Create points
            points = []
            for doc, embedding in zip(batch, embeddings):
                # Generate UUID v5 from text for deterministic ID
                doc_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, doc["text"]))
                points.append(PointStruct(
                    id=doc_id,
                    vector=embedding,
                    payload=doc.get("metadata", {})
                ))

            # Upsert
            client.upsert(
                collection_name=self.collection,
                points=points
            )
            total_indexed += len(points)

            # Progress log roughly every 1000 source documents.
            if i % 1000 == 0:
                self.logger.info(f" Indexed {total_indexed}/{len(documents)} documents...")

        return total_indexed

    def check_connection(self) -> bool:
        """Check if Qdrant is available."""
        try:
            # Any successful API round-trip proves the server is reachable.
            self._get_client().get_collections()
            return True
        except Exception as e:
            self.logger.error(f"Qdrant connection failed: {e}")
            return False

    def get_status(self) -> dict:
        """Get Qdrant status for person collection.

        Returns a dict with ``status``, vector/point counts, and whether
        the target collection exists; never raises (errors are reported
        via ``{"status": "unavailable", "error": ...}``).
        """
        try:
            client = self._get_client()
            collections = client.get_collections().collections
            collection_names = [c.name for c in collections]

            if self.collection in collection_names:
                info = client.get_collection(self.collection)
                return {
                    "status": info.status.value if info.status else "unknown",
                    "vectors_count": info.vectors_count or 0,
                    "points_count": info.points_count or 0,
                    "collection_exists": True,
                }
            else:
                return {
                    "status": "ready",
                    "vectors_count": 0,
                    "points_count": 0,
                    "collection_exists": False,
                    "available_collections": collection_names,
                }
        except Exception as e:
            return {"status": "unavailable", "error": str(e)}

    def _list_staff_files(self) -> list[Path]:
        """List all staff JSON files in bu/ directory."""
        bu_dir = self.person_dir / "bu"
        if not bu_dir.exists():
            self.logger.warning(f"Staff directory not found: {bu_dir}")
            return []
        return sorted(bu_dir.glob("*_staff_*.json"))

    def _list_entity_files(self) -> list[Path]:
        """List all entity profile JSON files."""
        entity_dir = self.person_dir / "entity"
        if not entity_dir.exists():
            self.logger.warning(f"Entity directory not found: {entity_dir}")
            return []
        return sorted(entity_dir.glob("*.json"))

    def _process_staff_file(self, filepath: Path) -> list[dict]:
        """Process a staff JSON file and return documents for indexing.

        Skips anonymous ("linkedin member") entries and records whose
        extracted text is too short to be useful. Unreadable files are
        logged and skipped rather than raising.
        """
        documents = []

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            self.logger.warning(f"Error reading {filepath.name}: {e}")
            return documents

        custodian_meta = data.get("custodian_metadata", {})
        custodian_name = custodian_meta.get("custodian_name") or custodian_meta.get("name", "")
        custodian_slug = custodian_meta.get("custodian_slug", "")

        # Derive slug from filename if not in metadata
        if not custodian_slug:
            # e.g., "nationaal-archief_staff_20251209T2354.json" -> "nationaal-archief"
            custodian_slug = filepath.stem.split("_staff_")[0]

        staff_list = data.get("staff", [])

        for person in staff_list:
            if not isinstance(person, dict):
                continue

            name = person.get("name", "")
            if not name or name.lower() == "linkedin member":
                continue

            text = extract_person_text_from_staff(person, custodian_name, custodian_slug)
            if not text or len(text) < 10:
                continue

            metadata = extract_person_metadata_from_staff(
                person, custodian_name, custodian_slug, filepath.name
            )

            documents.append({
                "text": text,
                "metadata": metadata,
            })

        return documents

    def _process_entity_file(self, filepath: Path) -> Optional[dict]:
        """Process an entity profile JSON file and return document for indexing.

        Returns None for unreadable files, nameless profiles, or profiles
        whose extracted text is too short to be useful.
        """
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            self.logger.warning(f"Error reading {filepath.name}: {e}")
            return None

        profile_data = data.get("profile_data", {})
        name = profile_data.get("name") or profile_data.get("full_name", "")

        if not name:
            return None

        text = extract_person_text_from_entity(data)
        if not text or len(text) < 10:
            return None

        metadata = extract_person_metadata_from_entity(data, filepath.name)

        return {
            "text": text,
            "metadata": metadata,
        }

    def sync(
        self,
        limit: Optional[int] = None,
        dry_run: bool = False,
        recreate: bool = True,
        include_staff: bool = True,
        include_entities: bool = True,
    ) -> SyncResult:
        """Sync all person JSON files to Qdrant.

        Args:
            limit: Maximum number of documents to prepare/index.
            dry_run: Parse and count documents without touching Qdrant.
            recreate: Drop and recreate the collection before indexing.
            include_staff: Index staff-list files from ``bu/``.
            include_entities: Index entity-profile files from ``entity/``.

        Returns:
            SyncResult with status, counts, and timing details.
        """
        result = SyncResult(
            database="qdrant_persons",
            status=SyncStatus.IN_PROGRESS,
            start_time=datetime.now(timezone.utc),
        )

        # Check for embedding capability (OpenAI or sentence-transformers)
        has_openai = bool(os.getenv("OPENAI_API_KEY"))
        try:
            from sentence_transformers import SentenceTransformer
            has_st = True
        except ImportError:
            has_st = False

        if not dry_run and not has_openai and not has_st:
            result.status = SyncStatus.FAILED
            result.error_message = (
                "No embedding capability available. Either set OPENAI_API_KEY or "
                "install sentence-transformers: pip install sentence-transformers"
            )
            result.end_time = datetime.now(timezone.utc)
            return result

        if not has_openai and has_st:
            self.logger.info("Using sentence-transformers for embeddings (384 dimensions)")
        elif has_openai:
            self.logger.info("Using OpenAI for embeddings (1536 dimensions)")

        documents = []

        # Process staff files
        if include_staff:
            staff_files = self._list_staff_files()
            self.logger.info(f"Found {len(staff_files)} staff files")

            for filepath in staff_files:
                docs = self._process_staff_file(filepath)
                documents.extend(docs)
                # NOTE(review): counts one per FILE for staff sources but one
                # per DOCUMENT for entities below — existing behavior, kept.
                result.records_processed += 1

                if limit and len(documents) >= limit:
                    break

        # Process entity files
        if include_entities:
            entity_files = self._list_entity_files()
            self.logger.info(f"Found {len(entity_files)} entity files")

            for filepath in entity_files:
                if limit and len(documents) >= limit:
                    break

                doc = self._process_entity_file(filepath)
                if doc:
                    documents.append(doc)
                    result.records_processed += 1

        # Apply limit (staff loop may overshoot by up to one file's worth)
        if limit:
            documents = documents[:limit]

        result.records_succeeded = len(documents)
        result.details["documents_prepared"] = len(documents)
        result.details["staff_count"] = sum(
            1 for d in documents if d["metadata"].get("source_type") == "staff_list"
        )
        result.details["entity_count"] = sum(
            1 for d in documents if d["metadata"].get("source_type") == "entity_profile"
        )

        self.logger.info(
            f"Prepared {len(documents)} documents "
            f"(staff: {result.details['staff_count']}, entities: {result.details['entity_count']})"
        )

        if dry_run:
            self.logger.info(f"[DRY RUN] Would index {len(documents)} documents to Qdrant")
            result.status = SyncStatus.SUCCESS
            result.details["dry_run"] = True
            result.end_time = datetime.now(timezone.utc)
            return result

        # Check connection
        if not self.check_connection():
            result.status = SyncStatus.FAILED
            result.error_message = f"Cannot connect to Qdrant at {self.url or f'{self.host}:{self.port}'}"
            result.end_time = datetime.now(timezone.utc)
            return result

        # Ensure collection exists (recreate if requested)
        try:
            self._ensure_collection(recreate=recreate)
        except Exception as e:
            result.status = SyncStatus.FAILED
            result.error_message = f"Failed to create collection: {e}"
            result.end_time = datetime.now(timezone.utc)
            return result

        # Index documents
        self.logger.info(f"Indexing {len(documents)} documents...")
        try:
            indexed = self._index_documents(documents)
            result.details["indexed_count"] = indexed

            # Get final count
            client = self._get_client()
            info = client.get_collection(self.collection)
            result.details["final_vectors_count"] = info.points_count or 0

            result.status = SyncStatus.SUCCESS
        except Exception as e:
            self.logger.error(f"Indexing failed: {e}")
            result.status = SyncStatus.FAILED
            result.error_message = str(e)

        result.end_time = datetime.now(timezone.utc)
        return result
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the person sync, print a summary."""
    parser = argparse.ArgumentParser(description="Sync person/staff JSON files to Qdrant")
    parser.add_argument("--dry-run", action="store_true", help="Parse files but don't index")
    parser.add_argument("--limit", type=int, help="Limit number of documents to process")
    parser.add_argument("--host", default=QDRANT_HOST, help="Qdrant server hostname")
    parser.add_argument("--port", type=int, default=QDRANT_PORT, help="Qdrant REST API port")
    parser.add_argument("--url", default=QDRANT_URL, help="Full Qdrant URL (overrides host/port)")
    parser.add_argument("--collection", default=COLLECTION_NAME, help="Qdrant collection name")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Documents per batch")
    parser.add_argument("--no-recreate", action="store_true", help="Don't recreate collection")
    parser.add_argument("--staff-only", action="store_true", help="Only index staff lists")
    parser.add_argument("--entities-only", action="store_true", help="Only index entity profiles")
    args = parser.parse_args()

    import logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    syncer = QdrantPersonSyncer(
        host=args.host,
        port=args.port,
        url=args.url,
        collection=args.collection,
        batch_size=args.batch_size,
    )

    banner = "=" * 60
    print(banner)
    print("Qdrant Person Sync")
    print(banner)

    # Probe the server up front so connection problems surface immediately
    # (skipped on --dry-run, which never touches Qdrant).
    if not args.dry_run:
        location = args.url or f"{args.host}:{args.port}"
        print(f"Checking connection to {location}...")
        try:
            status = syncer.get_status()
            print(f" Status: {status.get('status', 'unknown')}")
            if status.get('vectors_count'):
                print(f" Vectors: {status['vectors_count']:,}")
        except Exception as e:
            print(f" Connection check failed: {e}")

    result = syncer.sync(
        limit=args.limit,
        dry_run=args.dry_run,
        recreate=not args.no_recreate,
        include_staff=not args.entities_only,
        include_entities=not args.staff_only,
    )

    print("\n" + banner)
    print(f"Sync Result: {result.status.value.upper()}")
    print(f" Processed: {result.records_processed}")
    print(f" Documents: {result.details.get('documents_prepared', 0)}")
    print(f" - Staff records: {result.details.get('staff_count', 0)}")
    print(f" - Entity profiles: {result.details.get('entity_count', 0)}")
    print(f" Duration: {result.duration_seconds:.2f}s")
    if result.error_message:
        print(f" Error: {result.error_message}")
    print(banner)


if __name__ == "__main__":
    main()
|