glam/scripts/index_institutions_direct.py
2025-12-21 00:01:54 +01:00

449 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Index Heritage Institutions in Qdrant using Direct HTTP API
This script bypasses the qdrant-client library which has issues with reverse proxy URLs.
Uses requests library directly for reliable operation.
Usage:
python scripts/index_institutions_direct.py --data-dir /tmp/dutch_custodians
"""
import argparse
import logging
import os
import sys
import uuid
from pathlib import Path
from typing import Any
import requests
import yaml
from openai import OpenAI
PROJECT_ROOT = Path(__file__).parent.parent
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)
# Qdrant configuration
QDRANT_BASE_URL = "https://bronhouder.nl/qdrant"
COLLECTION_NAME = "heritage_custodians"
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIM = 1536
def load_yaml_file(filepath: Path) -> dict[str, Any] | None:
    """Parse one YAML document from disk.

    Returns the parsed contents, or None when the file cannot be read or
    parsed; failures are logged as warnings instead of raising so a single
    bad file does not abort a whole indexing run.
    """
    try:
        return yaml.safe_load(filepath.read_text(encoding="utf-8"))
    except Exception as exc:
        logger.warning(f"Failed to load {filepath}: {exc}")
        return None
def extract_institution_text(data: dict[str, Any]) -> str:
    """Build the text blob that gets embedded for semantic search.

    Joins name, alternative names, description, institution type and
    location into newline-separated lines, preferring curated fields and
    falling back to Wikidata enrichment.

    Returns an empty string when the record carries none of those fields.
    """
    parts = []
    original = data.get("original_entry", {})
    # Name: curated claim first, then emic name, then raw entry fields.
    name = (
        data.get("custodian_name", {}).get("claim_value") or
        data.get("custodian_name", {}).get("emic_name") or
        original.get("name") or
        data.get("name", "")
    )
    if name:
        parts.append(f"Name: {name}")
    # Alternative names from Wikidata labels (per-language variants).
    wikidata = data.get("wikidata_enrichment", {})
    labels = wikidata.get("wikidata_labels", {})
    if labels:
        alt_names = [v for v in labels.values() if v and v != name][:5]
        # FIX: dedupe with dict.fromkeys() instead of set() so the order is
        # insertion-stable. set() iteration order varies with string-hash
        # randomization, which made the embedded text (and therefore the
        # embeddings) nondeterministic across runs.
        alt_names = list(dict.fromkeys(alt_names))
        if alt_names:
            parts.append(f"Also known as: {', '.join(alt_names)}")
    # Description: flat English field first, then the per-language map.
    description = wikidata.get("wikidata_description_en", "")
    if not description:
        descriptions = wikidata.get("wikidata_descriptions", {})
        description = descriptions.get("en", "")
    if description:
        parts.append(description)
    # Institution type
    inst_type = original.get("institution_type") or data.get("institution_type", "")
    if inst_type:
        parts.append(f"Type: {inst_type}")
    # Location: first entry of the locations list, then the flat "location"
    # dict, then Wikidata's located_in label.
    locations = original.get("locations", []) or data.get("locations", [])
    location = data.get("location", {})

    def safe_str(val):
        # Location fields may be plain strings or {name/label} dicts.
        if val is None:
            return None
        if isinstance(val, str):
            return val
        if isinstance(val, dict):
            return val.get("name") or val.get("label") or str(val)
        return str(val)

    city = None
    region = None
    country = None
    if locations and isinstance(locations, list):
        loc = locations[0]
        if isinstance(loc, dict):
            city = safe_str(loc.get("city"))
            region = safe_str(loc.get("region"))
            country = safe_str(loc.get("country"))
    if not city:
        city = safe_str(location.get("city"))
    if not region:
        region = safe_str(location.get("region"))
    if not country:
        country = safe_str(location.get("country"))
    # Last resort: use the Wikidata "located in" label as the city.
    if not city and not region:
        wikidata_loc = wikidata.get("located_in", {})
        if wikidata_loc:
            city = safe_str(wikidata_loc.get("label"))
    location_parts = [p for p in [city, region, country] if p]
    if location_parts:
        parts.append(f"Location: {', '.join(location_parts)}")
    return "\n".join(parts)
def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]:
    """Assemble the Qdrant payload for one institution record.

    Pulls identifier, name, type, location, coordinates and Wikidata id out
    of the nested YAML structure, preferring curated fields over enrichment
    fallbacks. Only "ghcid" is guaranteed to be present in the result.
    """
    def _text(value):
        # Location fields may be plain strings or {name/label} dicts.
        if value is None:
            return None
        if isinstance(value, str):
            return value
        if isinstance(value, dict):
            return value.get("name") or value.get("label") or str(value)
        return str(value)

    original = data.get("original_entry", {})
    payload: dict[str, Any] = {}

    # Stable identifier: current GHCID, else the original entry's, else
    # fall back to the filename stem.
    payload["ghcid"] = (
        data.get("ghcid", {}).get("ghcid_current")
        or original.get("ghcid")
        or filepath.stem
    )

    display_name = (
        data.get("custodian_name", {}).get("claim_value")
        or data.get("custodian_name", {}).get("emic_name")
        or original.get("name")
        or data.get("name", "")
    )
    if display_name:
        payload["name"] = display_name

    kind = original.get("institution_type") or data.get("institution_type", "")
    if kind:
        payload["institution_type"] = kind

    # Primary location source: first entry of the locations list.
    location_list = original.get("locations", []) or data.get("locations", [])
    if location_list and isinstance(location_list, list):
        first = location_list[0]
        if isinstance(first, dict):
            if first.get("city"):
                payload["city"] = _text(first["city"])
            # Prefer the ISO 3166-2 region code (e.g. "NH", not
            # "Noord-Holland") so payload filtering stays uniform.
            if first.get("region_code"):
                payload["region"] = first["region_code"]
            elif first.get("region"):
                payload["region"] = _text(first["region"])
            if first.get("country"):
                payload["country"] = _text(first["country"])

    # Secondary source: the flat "location" dict, same precedence rules.
    fallback = data.get("location", {})
    if "city" not in payload and fallback.get("city"):
        payload["city"] = _text(fallback["city"])
    if "region" not in payload:
        if fallback.get("region_code"):
            payload["region"] = fallback["region_code"]
        elif fallback.get("region"):
            payload["region"] = _text(fallback["region"])
    if "country" not in payload and fallback.get("country"):
        payload["country"] = _text(fallback["country"])

    # Coordinates: explicit fields, then Google Maps, then Wikidata.
    lat = data.get("latitude")
    lon = data.get("longitude")
    if lat is None or lon is None:
        coords = data.get("google_maps_enrichment", {}).get("coordinates", {})
        if coords:
            lat = lat or coords.get("latitude")
            lon = lon or coords.get("longitude")
    if lat is None or lon is None:
        wd_coords = data.get("wikidata_enrichment", {}).get("wikidata_coordinates", {})
        if wd_coords:
            lat = lat or wd_coords.get("latitude")
            lon = lon or wd_coords.get("longitude")
    for key, value in (("latitude", lat), ("longitude", lon)):
        if value is not None:
            try:
                payload[key] = float(value)
            except (ValueError, TypeError):
                pass  # skip malformed coordinates rather than abort indexing

    qid = (
        original.get("wikidata_id")
        or data.get("wikidata_enrichment", {}).get("wikidata_entity_id", "")
    )
    if qid:
        payload["wikidata_id"] = qid
    return payload
def find_institution_files(data_dir: Path) -> list[Path]:
    """Collect institution YAML files from *data_dir* (non-recursive).

    Hidden files, non-YAML extensions, and schema/config/template/test/
    example files are skipped; results are returned sorted by path. A
    permission error on the directory is logged and yields an empty list.
    """
    excluded_patterns = ("_schema", "_config", "_template", "test_", "example_")
    matches: list[Path] = []
    try:
        for entry in os.listdir(data_dir):
            if entry.startswith('.'):
                continue
            if not entry.endswith(('.yaml', '.yml')):
                continue
            if any(pattern in entry.lower() for pattern in excluded_patterns):
                continue
            candidate = data_dir / entry
            if candidate.is_file():
                matches.append(candidate)
    except PermissionError:
        logger.warning(f"Permission denied accessing {data_dir}")
    return sorted(matches)
def create_collection():
    """Ensure the Qdrant collection exists, creating it when missing.

    Returns True when the collection is present (pre-existing or newly
    created), False when the create request was rejected.
    """
    collection_url = f"{QDRANT_BASE_URL}/collections/{COLLECTION_NAME}"
    # An existing collection answers 200 on GET; anything else -> create it.
    if requests.get(collection_url, timeout=30).status_code == 200:
        logger.info(f"Collection {COLLECTION_NAME} already exists")
        return True
    body = {"vectors": {"size": EMBEDDING_DIM, "distance": "Cosine"}}
    resp = requests.put(collection_url, json=body, timeout=30)
    if resp.status_code not in (200, 201):
        logger.error(f"Failed to create collection: {resp.status_code} - {resp.text}")
        return False
    logger.info(f"Created collection {COLLECTION_NAME}")
    return True
def get_embeddings(texts: list[str], client: OpenAI) -> list[list[float]]:
    """Embed *texts* with the configured OpenAI model.

    Returns one embedding vector per input text, in the same order.
    """
    result = client.embeddings.create(model=EMBEDDING_MODEL, input=texts)
    return [entry.embedding for entry in result.data]
def upsert_points(points: list[dict], timeout: int = 120):
    """PUT one batch of points into the Qdrant collection.

    Returns True on HTTP 200/201; otherwise logs the failure and returns
    False so the caller can decide whether to continue.
    """
    resp = requests.put(
        f"{QDRANT_BASE_URL}/collections/{COLLECTION_NAME}/points",
        json={"points": points},
        timeout=timeout,
    )
    if resp.status_code not in (200, 201):
        logger.error(f"Failed to upsert points: {resp.status_code} - {resp.text}")
        return False
    return True
def main() -> None:
    """CLI entry point: embed institution YAML files and index them in Qdrant.

    Pipeline: parse arguments -> validate env/paths -> load and filter the
    YAML records -> optional dry run -> (re)create the collection -> embed
    texts in batches via OpenAI -> upsert each batch -> report final stats.
    Exits 1 on missing API key, missing data dir, or collection-create
    failure; exits 0 on no files or dry run.
    """
    parser = argparse.ArgumentParser(
        description="Index heritage institutions in Qdrant using direct HTTP API"
    )
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=PROJECT_ROOT / "data" / "custodian",
        help="Directory containing institution YAML files"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=50,
        help="Number of documents to index per batch"
    )
    parser.add_argument(
        "--recreate",
        action="store_true",
        help="Delete and recreate the collection"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of files to process (for testing)"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse files but don't index"
    )
    args = parser.parse_args()
    # Fail fast on missing credentials before doing any work.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        logger.error("OPENAI_API_KEY environment variable is required")
        sys.exit(1)
    # Check data directory
    if not args.data_dir.exists():
        logger.error(f"Data directory not found: {args.data_dir}")
        sys.exit(1)
    # Find files
    logger.info(f"Scanning for institution files in {args.data_dir}")
    files = find_institution_files(args.data_dir)
    logger.info(f"Found {len(files)} institution files")
    # NOTE(review): truthiness test means `--limit 0` behaves like no limit.
    if args.limit:
        files = files[:args.limit]
        logger.info(f"Limited to {len(files)} files")
    if not files:
        logger.warning("No institution files found")
        sys.exit(0)
    # Prepare documents: skip unreadable/empty files and records whose
    # extracted text is shorter than 20 chars (too thin to embed usefully).
    documents = []
    for filepath in files:
        data = load_yaml_file(filepath)
        if not data:
            continue
        text = extract_institution_text(data)
        if not text or len(text) < 20:
            continue
        metadata = extract_metadata(data, filepath)
        documents.append({
            "text": text,
            "metadata": metadata,
        })
    logger.info(f"Prepared {len(documents)} documents for indexing")
    # Dry run: show a sample of up to 5 records, then exit without indexing.
    if args.dry_run:
        logger.info("Dry run - not indexing")
        for doc in documents[:5]:
            logger.info(f" - {doc['metadata'].get('name', 'Unknown')}: {len(doc['text'])} chars")
            logger.info(f" Region: {doc['metadata'].get('region', 'N/A')}")
        sys.exit(0)
    # Handle collection: optional delete, then ensure it exists.
    if args.recreate:
        logger.info(f"Deleting collection {COLLECTION_NAME}")
        resp = requests.delete(f"{QDRANT_BASE_URL}/collections/{COLLECTION_NAME}", timeout=30)
        logger.info(f"Delete result: {resp.status_code}")
    if not create_collection():
        sys.exit(1)
    # Initialize OpenAI client
    client = OpenAI(api_key=api_key)
    # Index in batches
    total_indexed = 0
    for i in range(0, len(documents), args.batch_size):
        batch = documents[i:i + args.batch_size]
        texts = [doc["text"] for doc in batch]
        logger.info(f"Processing batch {i // args.batch_size + 1}/{(len(documents) + args.batch_size - 1) // args.batch_size} ({len(batch)} docs)")
        # Get embeddings (one vector per document text, order-preserving)
        embeddings = get_embeddings(texts, client)
        # Prepare points: fresh random UUIDs as point ids (re-running the
        # script therefore adds new points rather than updating old ones),
        # with the extracted metadata as the payload.
        points = []
        for j, (doc, embedding) in enumerate(zip(batch, embeddings)):
            point_id = str(uuid.uuid4())
            points.append({
                "id": point_id,
                "vector": embedding,
                "payload": doc["metadata"]
            })
        # Upsert; a failed batch is logged but the remaining batches proceed.
        if upsert_points(points):
            total_indexed += len(points)
            logger.info(f"Indexed {total_indexed}/{len(documents)} documents")
        else:
            logger.error(f"Failed to index batch starting at {i}")
    # Final stats: best-effort read of the collection info.
    resp = requests.get(f"{QDRANT_BASE_URL}/collections/{COLLECTION_NAME}", timeout=30)
    if resp.status_code == 200:
        info = resp.json().get("result", {})
        # NOTE(review): newer Qdrant versions may report null for
        # "vectors_count" ("points_count" is the usual total) — confirm
        # against the deployed server version.
        vectors_count = info.get("vectors_count", 0)
        logger.info(f"Indexing complete! Collection has {vectors_count} vectors")
    logger.info(f"Total documents indexed: {total_indexed}")
# Script entry point: `python scripts/index_institutions_direct.py --data-dir ...`
if __name__ == "__main__":
    main()