philschmid (HF staff) committed
Commit 17e379b
1 parent: a745926

added pipeline

Files changed (4)
  1. README.md +59 -0
  2. create_handler.ipynb +251 -0
  3. pipeline.py +31 -0
  4. requirements.txt +4 -0
README.md ADDED
@@ -0,0 +1,59 @@
---
license: bsd-3-clause
tags:
- endpoints-template
pipeline_tag: text-generation
---
# Sharded fork of [Salesforce/codegen-6B-mono](https://huggingface.co/Salesforce/codegen-6B-mono) with a custom pipeline.py

This repository implements a custom `pipeline` task for `text-generation`, so the model can be served on 🤗 Inference Endpoints with bitsandbytes 8-bit quantization. The code for the customized pipeline is in [pipeline.py](https://huggingface.co/philschmid/codegen-6B-mono-sharded-bnb/blob/main/pipeline.py).

There is also a [notebook](https://huggingface.co/philschmid/codegen-6B-mono-sharded-bnb/blob/main/create_handler.ipynb) included.

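The weights here are a re-sharded copy of the original checkpoint, split into smaller files so they can be loaded on hosts with limited memory. A similar sharded copy can be produced with `save_pretrained` and its `max_shard_size` argument; the snippet below is only a minimal sketch (the output directory and shard size are illustrative, not how this exact repository was built):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# load the original (un-sharded) checkpoint in fp16 to keep CPU memory usage down
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-6B-mono", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-6B-mono")

# save it again with a maximum shard size, producing several smaller weight files
model.save_pretrained("codegen-6B-mono-sharded", max_shard_size="2GB")
tokenizer.save_pretrained("codegen-6B-mono-sharded")
```
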
### Expected request payload
```json
{
  "inputs": "# load distilbert model and initialize text-classification pipeline\nmodel_id = 'distil",
  "parameters": {
    "top_k": 100,
    "max_length": 64,
    "early_stopping": true,
    "do_sample": true,
    "eos_token_id": 50256
  }
}
```

Below is an example of how to run a request using Python and `requests`.

## Run Request
```python
import requests as r

ENDPOINT_URL = ""
HF_TOKEN = ""

parameters = {
    "top_k": 100,
    "max_length": 64,
    "early_stopping": True,
    "do_sample": True,
    "eos_token_id": 50256,
}


def predict(code_snippet: str = None):
    payload = {"inputs": code_snippet, "parameters": parameters}
    response = r.post(
        ENDPOINT_URL, headers={"Authorization": f"Bearer {HF_TOKEN}"}, json=payload
    )
    return response.json()


prediction = predict(
    code_snippet="# load distilbert model and initialize text-classification pipeline\nmodel_id = 'distil"
)
```
Expected output
```python
{'generated_text': "# load distilbert model and initialize text-classification pipeline\nmodel_id = 'distilbert-base-uncased'\nmodel_url = 'https://tfhub.dev/tensorflow/small_bert/1'\n\nmodel_dir = './distilBERT'"}
```
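
While the endpoint is still initializing, a request like the one above can come back with an HTTP error instead of a generation. A small, illustrative variation of the helper (not required by the endpoint) surfaces those errors explicitly:

```python
import requests as r

ENDPOINT_URL = ""  # e.g. https://<your-endpoint>.endpoints.huggingface.cloud
HF_TOKEN = ""


def predict_checked(code_snippet: str, parameters: dict = None):
    payload = {"inputs": code_snippet, "parameters": parameters or {}}
    response = r.post(
        ENDPOINT_URL, headers={"Authorization": f"Bearer {HF_TOKEN}"}, json=payload
    )
    # raise on 4xx/5xx responses (e.g. the endpoint is scaling up or the payload is malformed)
    response.raise_for_status()
    return response.json()
```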
create_handler.ipynb ADDED
@@ -0,0 +1,251 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup & Installation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Writing requirements.txt\n"
     ]
    }
   ],
   "source": [
    "%%writefile requirements.txt\n",
    "bitsandbytes\n",
    "git+https://github.com/huggingface/transformers.git\n",
    "accelerate\n",
    "sentencepiece"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Create Custom Handler for Inference Endpoints\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting pipeline.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile pipeline.py\n",
    "from typing import Dict, Any\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "import torch\n",
    "\n",
    "class PreTrainedPipeline():\n",
    "    def __init__(self, path=\"\"):\n",
    "        # load the optimized model\n",
    "        self.model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map=\"auto\", load_in_8bit=True)\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(path)\n",
    "\n",
    "    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:\n",
    "        \"\"\"\n",
    "        Args:\n",
    "            data (:obj:`dict`):\n",
    "                includes the input data and the parameters for the inference.\n",
    "        Return:\n",
    "            A :obj:`dict` with the generated text under the key `generated_text`.\n",
    "        \"\"\"\n",
    "        inputs = data.get(\"inputs\", data)\n",
    "        parameters = data.get(\"parameters\", {})\n",
    "\n",
    "        # tokenize the input\n",
    "        input_ids = self.tokenizer(inputs, return_tensors=\"pt\").input_ids.to(self.model.device)\n",
    "        # run the model\n",
    "        logits = self.model.generate(input_ids, **parameters)\n",
    "        # postprocess the prediction\n",
    "        return {\"generated_text\": self.tokenizer.decode(logits[0].tolist())}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "test custom pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "===================================BUG REPORT===================================\n",
      "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
      "For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
      "================================================================================\n",
      "CUDA SETUP: CUDA runtime path found: /home/ubuntu/miniconda/envs/dev/lib/libcudart.so\n",
      "CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n",
      "CUDA SETUP: Detected CUDA version 113\n",
      "CUDA SETUP: Loading binary /home/ubuntu/miniconda/envs/dev/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...\n"
     ]
    }
   ],
   "source": [
    "from pipeline import PreTrainedPipeline\n",
    "\n",
    "# init handler\n",
    "my_handler = PreTrainedPipeline(path=\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
      "/home/ubuntu/miniconda/envs/dev/lib/python3.9/site-packages/transformers/generation_utils.py:1228: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 20 (`self.config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
      "  warnings.warn(\n",
      "/home/ubuntu/miniconda/envs/dev/lib/python3.9/site-packages/transformers/models/codegen/modeling_codegen.py:167: UserWarning: where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead. (Triggered internally at ../aten/src/ATen/native/TensorCompare.cpp:333.)\n",
      "  attn_weights = torch.where(causal_mask, attn_weights, mask_value)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'generated_text': 'def hello_world():\\n return \"Hello World\"\\n\\n@app.route(\\'/'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# prepare sample payload\n",
    "request = {\"inputs\": \"def hello_world():\"}\n",
    "\n",
    "# test the handler\n",
    "my_handler(request)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'generated_text': \"# load distilbert model and initialize text-classification pipeline\\nmodel_id = 'distilbert-base-uncased'\\nmodel_url = 'https://tfhub.dev/tensorflow/small_bert/1'\\n\\nmodel_dir = './distilBERT'\"}\n"
     ]
    }
   ],
   "source": [
    "# prepare sample payload\n",
    "request = {\n",
    "    \"inputs\": \"# load distilbert model and initialize text-classification pipeline\\nmodel_id = 'distil\",\n",
    "    \"parameters\": {\n",
    "        \"top_k\": 100,\n",
    "        \"max_length\": 64,\n",
    "        \"early_stopping\": True,\n",
    "        \"do_sample\": True,\n",
    "        \"eos_token_id\": 50256,\n",
    "    },\n",
    "}\n",
    "\n",
    "# test the handler\n",
    "print(my_handler(request))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50256"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_handler.tokenizer.convert_tokens_to_ids(my_handler.tokenizer.eos_token)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.13 ('dev': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "f6dd96c16031089903d5a31ec148b80aeb0d39c32affb1a1080393235fbfa2fc"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
pipeline.py ADDED
@@ -0,0 +1,31 @@
from typing import Dict, Any
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


class PreTrainedPipeline:
    def __init__(self, path=""):
        # load the optimized model
        self.model = AutoModelForCausalLM.from_pretrained(
            path, torch_dtype=torch.float16, device_map="auto", load_in_8bit=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """
        Args:
            data (:obj:`dict`):
                includes the input data and the parameters for the inference.
        Return:
            A :obj:`dict` with the generated text under the key `generated_text`.
        """
        inputs = data.get("inputs", data)
        parameters = data.get("parameters", {})

        # tokenize the input
        input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids.to(self.model.device)
        # run the model
        logits = self.model.generate(input_ids, **parameters)
        # postprocess the prediction
        return {"generated_text": self.tokenizer.decode(logits[0].tolist())}
requirements.txt ADDED
@@ -0,0 +1,4 @@
bitsandbytes
git+https://github.com/huggingface/transformers.git
accelerate
sentencepiece