## Setup & Installation

In [2]:
%%writefile requirements.txt
bitsandbytes
git+https://github.com/huggingface/transformers.git
accelerate
sentencepiece

Writing requirements.txt


In [None]:
!pip install -r requirements.txt

## 3. Create Custom Handler for Inference Endpoints


In [1]:
%%writefile pipeline.py
from typing import Dict, List, Any
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class PreTrainedPipeline():
 def __init__(self, path=""):
 # load the optimized model
 self.model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map="auto", load_in_8bit=True)
 self.tokenizer = AutoTokenizer.from_pretrained(path)

 def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
 """
 Args:
 data (:obj:):
 includes the input data and the parameters for the inference.
 Return:
 A :obj:`list`:. The list contains the embeddings of the inference inputs
 """
 inputs = data.get("inputs", data)
 parameters = data.get("parameters", {})

 # tokenize the input
 input_ids = self.tokenizer(inputs,return_tensors="pt").input_ids.to(self.model.device)
 # run the model
 logits = self.model.generate(input_ids, **parameters)
 # Perform pooling
 # postprocess the prediction
 return {"generated_text": self.tokenizer.decode(logits[0].tolist())}

Overwriting pipeline.py


test custom pipeline

In [2]:
from pipeline import PreTrainedPipeline

# init handler
my_handler = PreTrainedPipeline(path=".")


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link
CUDA SETUP: CUDA runtime path found: /home/ubuntu/miniconda/envs/dev/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/ubuntu/miniconda/envs/dev/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


In [3]:

# prepare sample payload
request = {"inputs": "def hello_world():"}

# test the handler
my_handler(request)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 attn_weights = torch.where(causal_mask, attn_weights, mask_value)


{'generated_text': 'def hello_world():\n return "Hello World"\n\n@app.route(\'/'}

In [6]:
# prepare sample payload
request = {
 "inputs": "# load distilbert model and initialize text-classification pipeline\nmodel_id = 'distil",
 "parameters": {
 "top_k": 100,
 "max_length": 64,
 "early_stopping": True,
 "do_sample": True,
 "eos_token_id": 50256,
 },
}

# test the handler
print(my_handler(request))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'generated_text': "# load distilbert model and initialize text-classification pipeline\nmodel_id = 'distilbert-base-uncased'\nmodel_url = 'https://tfhub.dev/tensorflow/small_bert/1'\n\nmodel_dir = './distilBERT'"}


In [13]:
my_handler.tokenizer.convert_tokens_to_ids(my_handler.tokenizer.eos_token)


50256