Spaces:

traversaal-internal
/

traversaal_pro_embed_fastapi

Running

File size: 2,814 Bytes
# qdrant_utils.py contains utility functions to interact with Qdrant, a vector similarity search engine.

from qdrant_client import QdrantClient
from qdrant_client.http import models
import os
import logging
import uuid

# Configure the root logger at import time: INFO threshold, with each record
# rendered as "<timestamp> - <level> - <message>".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-wide Qdrant client shared by the helpers below. The endpoint URL and
# API key are read from the environment; os.getenv yields None for a variable
# that is not set, and that None is passed straight through to the client.
_qdrant_settings = {
    'url': os.getenv('QDRANT_URL'),
    'api_key': os.getenv('QDRANT_API_KEY'),
}
qdrant_client = QdrantClient(**_qdrant_settings)

def create_collection_if_not_exists(collection_name, vector_size):
    """Create a Qdrant collection unless one with the same name is already listed.

    The names of the collections currently reported by the module-level
    ``qdrant_client`` are compared against ``collection_name``. When there is
    no match, a collection is created whose vectors have ``vector_size``
    dimensions and use cosine distance.

    Args:
        collection_name: Name of the collection to look up or create.
        vector_size: Dimensionality passed to ``models.VectorParams``.

    Raises:
        Exception: Whatever the lookup or creation raises; the error is
            logged and then re-raised unchanged.
    """
    try:
        known_names = [
            entry.name for entry in qdrant_client.get_collections().collections
        ]

        # Guard clause: nothing to create when the name is already present.
        if collection_name in known_names:
            logging.info(f"Collection {collection_name} already exists")
            return

        vector_params = models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE,
        )
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=vector_params,
        )
        logging.info(f"Created new collection: {collection_name}")
    except Exception as err:
        logging.error(f"Error creating collection: {str(err)}")
        raise

def store_embeddings(chunks, embeddings, user_id, data_source_id, file_id, organization_id, s3_bucket_key, total_tokens):
    """Store chunks and their embedding vectors as points in the "embed" collection.

    Every chunk/embedding pair becomes one point with a freshly generated
    UUID4 string id, the embedding as its vector, and a payload holding the
    identifiers passed in plus the chunk's position (``chunk_index``) and
    content (``chunk_text``). The collection is created first if it is
    missing, sized from the length of the first embedding.

    Args:
        chunks: Iterable of chunks, one per embedding, in the same order.
        embeddings: Sized, indexable collection of vectors aligned with
            ``chunks``. A vector may expose ``tolist()`` (e.g. a NumPy array)
            or be any plain iterable such as a list or tuple.
        user_id: Copied unchanged into every point's payload.
        data_source_id: Copied unchanged into every point's payload.
        file_id: Copied unchanged into every point's payload.
        organization_id: Copied unchanged into every point's payload.
        s3_bucket_key: Copied unchanged into every point's payload.
        total_tokens: Copied unchanged into every point's payload.

    Returns:
        None. When both inputs are empty, nothing is sent to Qdrant.

    Raises:
        ValueError: If the number of chunks differs from the number of
            embeddings (previously the longer input was silently truncated).
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        collection_name = "embed"    # Name of the collection in Qdrant

        # Materialise chunks so the count can be checked even for a generator.
        chunk_list = list(chunks)
        # len() instead of truthiness: NumPy arrays reject bool() on size > 1.
        embedding_count = len(embeddings)

        # Validate before touching Qdrant so bad input never creates a collection.
        if len(chunk_list) != embedding_count:
            raise ValueError(
                f"Number of chunks ({len(chunk_list)}) does not match "
                f"number of embeddings ({embedding_count})"
            )
        if embedding_count == 0:
            # embeddings[0] below would raise IndexError; there is nothing to store.
            logging.warning("No embeddings provided; nothing to store")
            return

        vector_size = len(embeddings[0])

        # Ensure the collection exists
        create_collection_if_not_exists(collection_name, vector_size)

        # Prepare points for Qdrant
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunk_list, embeddings)):
            chunk_id = str(uuid.uuid4())  # Generate a unique ID for each chunk
            # Accept array-like objects exposing tolist() as well as plain sequences.
            if hasattr(embedding, "tolist"):
                vector = embedding.tolist()
            else:
                vector = list(embedding)
            points.append(
                models.PointStruct(
                    id=chunk_id,
                    vector=vector,
                    payload={
                        "user_id": user_id,
                        "data_source_id": data_source_id,
                        "file_id": file_id,
                        "organization_id": organization_id,
                        "chunk_index": i,
                        "chunk_text": chunk,
                        "s3_bucket_key": s3_bucket_key,
                        "total_tokens": total_tokens,
                    },
                )
            )

        # Store embeddings in Qdrant
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
        logging.info(f"Successfully stored {len(points)} embeddings")
    except Exception as e:
        logging.error(f"Error storing embeddings in Qdrant: {str(e)}")
        raise