Add handler.py, start_emulator.sh and test scripts

Browse files

Files changed (5) hide show

embed_single_query.sh +9 -0
embed_two_chunks.sh +9 -0
handler.py +65 -0
start_emulator.sh +4 -0
test_endpoint.py +67 -0

embed_single_query.sh ADDED Viewed

	@@ -0,0 +1,9 @@

+#!/bin/sh
+set -x
+curl \
+  --request POST \
+  --url http://localhost:4999 \
+  --header 'Content-Type: application/json' \
+  --data '{"inputs": "Please embed me"}' \
+  -w "\n"

embed_two_chunks.sh ADDED Viewed

	@@ -0,0 +1,9 @@

+#!/bin/sh
+set -x
+curl \
+  --request POST \
+  --url http://localhost:4999 \
+  --header 'Content-Type: application/json' \
+  --data '{"inputs": ["Please embed me", "And me too, please!"]}' \
+  -w "\n"

handler.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from typing import Any, Dict, List
+from colbert.infra import ColBERTConfig
+from colbert.modeling.checkpoint import Checkpoint
+import torch
+import logging
+logger = logging.getLogger(__name__)
+MODEL = "fdurant/colbert-xm-for-inference-api"
+class EndpointHandler():
+    def __init__(self, path=""):
+        self._config = ColBERTConfig(
+            # Defaults copied from https://github.com/datastax/ragstack-ai/blob/main/libs/colbert/ragstack_colbert/colbert_embedding_model.py
+            doc_maxlen=512, # Maximum number of tokens for document chunks. Should equal the chunk_size.
+            nbits=2, # The number bits that each dimension encodes to.
+            kmeans_niters=4, # Number of iterations for k-means clustering during quantization.
+            nranks=-1, # Number of ranks (processors) to use for distributed computing; -1 uses all available CPUs/GPUs.
+            checkpoint=MODEL,
+        )
+        self._checkpoint = Checkpoint(self._config.checkpoint, colbert_config=self._config, verbose=3)
+    def __call__(self, data: Any) -> List[Dict[str, Any]]:
+        inputs = data["inputs"]
+        texts = []
+        if isinstance(inputs, str):
+            texts = [inputs]
+        elif isinstance(inputs, list) and all(isinstance(text, str) for text in inputs):
+            texts = inputs
+        else:
+            raise ValueError("Invalid input data format")
+        with torch.inference_mode():
+            if len(texts) == 1:
+                # It's a query
+                logger.info(f"Query: {texts}")
+                embedding = self._checkpoint.queryFromText(
+                    queries=texts,
+                    full_length_search=False,  # Indicates whether to encode the query for a full-length search.
+                )
+                logger.info(f"Query embedding shape: {embedding.shape}")
+                return [
+                    {"input": inputs, "query_embedding": embedding.tolist()[0]}
+                ]
+            elif len(texts) > 1:
+                # It's a batch of chunks
+                logger.info(f"Batch of chunks: {texts}")
+                embeddings, token_counts = self._checkpoint.docFromText(
+                    docs=texts,
+                    bsize=self._config.bsize, # Batch size
+                    keep_dims=True, # Do NOT flatten the embeddings
+                    return_tokens=True, # Return the tokens as well
+                )
+                for text, embedding, token_count in zip(texts, embeddings, token_counts):
+                    logger.info(f"Chunk: {text}")
+                    logger.info(f"Chunk embedding shape: {embedding.shape}")
+                    logger.info(f"Chunk count: {token_count}")
+                return [
+                    {"input": _input, "chunk_embedding": embedding.tolist(), "token_count": token_count.tolist()}
+                    for _input, embedding, token_count in zip(texts, embeddings, token_counts)
+                ]
+            else:
+                raise ValueError("No data to process")

start_emulator.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+#!/bin/bash -e
+export SHELL=/bin/bash
+hf-endpoints-emulator "$@"

test_endpoint.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import pytest
+import requests
+URL = "http://localhost:4999/"
+HEADERS = {"Content-Type": "application/json"}
+def test_returns_200():
+    payload = {"inputs": "try me"}
+    response = requests.request("POST", URL, json=payload, headers=HEADERS)
+    assert response.status_code == 200
+def test_query_returns_expected_result():
+    query = "try me"
+    payload = {"inputs": query}
+    response = requests.request("POST", URL, json=payload, headers=HEADERS)
+    response_data = response.json()
+#    print(response_data)
+    # Check structure and input
+    assert isinstance(response_data, list)
+    assert len(response_data) == 1
+    assert isinstance(response_data[0], dict)
+    assert response_data[0].get("input") == query
+    # Check query embedding (actually a list of embeddings, one per token in the query)
+    query_embedding = response_data[0].get("query_embedding")
+    assert isinstance(query_embedding, list)
+    assert len(query_embedding) == 32
+    # Check first of the token embeddings
+    first_token_embedding = query_embedding[0]
+    assert isinstance(first_token_embedding, list)
+    assert len(first_token_embedding) == 128
+    assert all(isinstance(value, float) for value in first_token_embedding)
+def test_batch_returns_expected_result():
+    chunks = ["try me", "try me again and again and again"]
+    expected_token_counts = [11, 11]  # Including start and stop tokens, I presume. Not exactly clear!
+    payload = {"inputs": chunks}
+    response = requests.request("POST", URL, json=payload, headers=HEADERS)
+    response_data = response.json()
+    # Check structure
+    assert isinstance(response_data, list)
+    assert len(response_data) == len(chunks)
+    for i, response_chunk in enumerate(response_data):
+        # Check input
+        assert response_chunk.get("input") == chunks[i]
+        # Check chunk embedding (actually a list of embeddings, one per token in the chunk)
+        chunk_embedding = response_chunk.get("chunk_embedding")
+        token_count = response_chunk.get("token_count")
+        assert isinstance(chunk_embedding, list)
+        assert len(chunk_embedding) == len(token_count)
+        assert len(token_count) == expected_token_counts[i]
+        # Check first of the token embeddings
+        first_token_embedding = chunk_embedding[0]
+        assert len(first_token_embedding) == 128
+        assert all(isinstance(value, float) for value in first_token_embedding)