ncoop57 committed
Commit bab8078
1 Parent(s): b399543

Get minimum working openai server

Files changed (3):
  1. .gitignore +1 -0
  2. app.py +30 -4
  3. utils/codegen.py +25 -140
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__/
app.py CHANGED
@@ -1,6 +1,9 @@
 import logging
 import os
-
+import torch
+import json
+import torch
+import time
 import uvicorn
 from fastapi import FastAPI, Request, Response
 from fastapi.responses import JSONResponse
@@ -8,12 +11,18 @@ from sse_starlette.sse import EventSourceResponse
 
 from config.log_config import uvicorn_logger
 from models import OpenAIinput
-from utils.hf_model import HFModel
+from utils.codegen import CodeGenProxy
 from utils.errors import FauxPilotException
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
 logging.config.dictConfig(uvicorn_logger)
 
-model = HFModel("bigcode/santacoder")
+# token = os.environ.get("HUB_TOKEN", None)
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# tokenizer = AutoTokenizer.from_pretrained("bigcode/christmas-models", use_auth_token=token)
+# model = AutoModelForCausalLM.from_pretrained("bigcode/christmas-models", trust_remote_code=True, use_auth_token=token).to(device)
+# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
 
 codegen = CodeGenProxy(
     host=os.environ.get("TRITON_HOST", "triton"),
@@ -42,7 +51,24 @@ async def completions(data: OpenAIinput):
     data = data.dict()
     try:
         content = codegen(data=data)
-    except codegen.TokensExceedsMaximum as E:
+        # prompt = data.get("prompt")
+        # choices = [pipe(prompt, do_sample=True, top_p=0.95, max_new_tokens=50)[0]['generated_text']]
+        # completion = {
+        #     'id': None, # fill in
+        #     'model': 'codegen',
+        #     'object': 'text_completion',
+        #     'created': int(time.time()),
+        #     'choices': None, # fill in
+        #     'usage': {
+        #         'completion_tokens': int(sum([len(c.split()) for c in choices])),
+        #         'prompt_tokens': int(len(prompt.split())),
+        #         'total_tokens': int(sum([len(c.split()) for c in choices]) + len(prompt.split())),
+        #     }
+        # }
+        # completion['id'] = 10
+        # completion['choices'] = choices
+        # content = json.dumps(completion)
+    except Exception as E:
         raise FauxPilotException(
            message=str(E),
            type="invalid_request_error",
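
Note: with this change the completions route simply returns whatever codegen(data=data) produces, and any failure is reported as an invalid_request_error. A quick way to smoke-test the server is to POST an OpenAI-style payload at it. The sketch below is only that, a sketch: the route path, host, port, and model name are assumptions (adjust them to however the app is actually served), not something defined in this commit.

import requests  # hypothetical smoke test for the completions endpoint

payload = {
    "model": "py-model",            # assumed model name known to the backend
    "prompt": "def hello_world():",
    "max_tokens": 16,
    "temperature": 0.2,
}

# Assumed URL; FauxPilot-style deployments typically expose a /v1/.../completions route.
resp = requests.post("http://localhost:5000/v1/engines/codegen/completions", json=payload)
print(resp.status_code, resp.json())
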
utils/codegen.py CHANGED
@@ -2,19 +2,28 @@ import json
 import random
 import string
 import time
-
+import os
+import torch
 import numpy as np
 import tritonclient.grpc as client_util
 from tokenizers import Tokenizer
 from tritonclient.utils import np_to_triton_dtype, InferenceServerException
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
 
 np.finfo(np.dtype("float32"))
 np.finfo(np.dtype("float64"))
 
+token = os.environ.get("HUB_TOKEN", None)
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+tokenizer = AutoTokenizer.from_pretrained("bigcode/christmas-models", use_auth_token=token)
+model = AutoModelForCausalLM.from_pretrained("bigcode/christmas-models", trust_remote_code=True, use_auth_token=token).to(device)
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
 
 class CodeGenProxy:
     def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
-        self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
+        self.tokenizer = AutoTokenizer.from_pretrained("bigcode/christmas-models", use_auth_token=token)
         self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
         self.PAD_CHAR = 50256
 
@@ -48,7 +57,7 @@ class CodeGenProxy:
             item_offsets = []
 
             for word in word_dict_item:
-                ids = tokenizer.encode(word).ids
+                ids = tokenizer.encode(word)
 
                 if len(ids) == 0:
                     continue
@@ -73,144 +82,20 @@ class CodeGenProxy:
         return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
 
     def generate(self, data):
+        global pipe
         prompt = data['prompt']
         n = data.get('n', 1)
         model_name = data["model"]
-        # ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
-        # i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
-        np_type = np.int32 if model_name.startswith("py-") else np.uint32
-
-        input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
-        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
-        prompt_len = input_start_ids.shape[1]
-        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
-        max_tokens = data.get('max_tokens', 16)
-        prompt_tokens: int = input_len[0][0]
-        requested_tokens = max_tokens + prompt_tokens
-        if requested_tokens > self.MAX_MODEL_LEN:
-            print(1)
-            raise self.TokensExceedsMaximum(
-                f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
-                f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
-                f"Please reduce your prompt; or completion length."
-            )
-        output_len = np.ones_like(input_len).astype(np_type) * max_tokens
-        num_logprobs = data.get('logprobs', -1)
-        if num_logprobs is None:
-            num_logprobs = 1
-        want_logprobs = num_logprobs > 0
-
-        temperature = data.get('temperature', 0.2)
-        if temperature == 0.0:
-            temperature = 1.0
-            top_k = 1
-        else:
-            top_k = data.get('top_k', 0)
-
-        top_p = data.get('top_p', 1.0)
-        frequency_penalty = data.get('frequency_penalty', 1.0)
-        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
-        runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-        beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-        random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
-        temperature = temperature * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-        len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-        repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-        is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
-        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
-        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
-        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
-
-        stop_words = data.get('stop', [])
-        if stop_words is None:
-            stop_words = []
-        if stop_words:
-            stop_word_list = np.repeat(self.to_word_list_format([stop_words], self.tokenizer), input_start_ids.shape[0],
-                                       axis=0)
-        else:
-            stop_word_list = np.concatenate([np.zeros([input_start_ids.shape[0], 1, 1]).astype(
-                np.int32), (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)], axis=1)
-
-        # Not used
-        bad_words_list = np.concatenate([np.zeros([input_start_ids.shape[0], 1, 1]).astype(
-            np.int32), (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)], axis=1)
-
-        inputs = [
-            self.prepare_tensor("input_ids", input_start_ids),
-            self.prepare_tensor("input_lengths", input_len),
-            self.prepare_tensor("request_output_len", output_len),
-            self.prepare_tensor("runtime_top_k", runtime_top_k),
-            self.prepare_tensor("runtime_top_p", runtime_top_p),
-            self.prepare_tensor("beam_search_diversity_rate", beam_search_diversity_rate),
-            self.prepare_tensor("random_seed", random_seed),
-            self.prepare_tensor("temperature", temperature),
-            self.prepare_tensor("len_penalty", len_penalty),
-            self.prepare_tensor("repetition_penalty", repetition_penalty),
-            self.prepare_tensor("is_return_log_probs", is_return_log_probs),
-            self.prepare_tensor("beam_width", beam_width),
-            self.prepare_tensor("start_id", start_ids),
-            self.prepare_tensor("end_id", end_ids),
-            self.prepare_tensor("bad_words_list", bad_words_list),
-            self.prepare_tensor("stop_words_list", stop_word_list),
-        ]
-
-        result = self.client.infer(model_name, inputs)
-
-        output_data = result.as_numpy("output_ids")
-        if output_data is None:
-            raise RuntimeError("No output data")
-
-        # All of these squeeze(1)s are to remove the beam width dimension.
-        output_data = output_data.squeeze(1)
-        if want_logprobs:
-            lp_data = result.as_numpy("output_log_probs").squeeze(1)
-            # clp_data = result.as_numpy("cum_log_probs").squeeze(1)
-        else:
-            lp_data = [None] * output_data.shape[0]
-        sequence_lengths = result.as_numpy("sequence_length").squeeze(1)
-        gen_len = sequence_lengths - input_len.squeeze(1)
-
-        decoded = self.tokenizer.decode_batch([out[prompt_len:prompt_len + g] for g, out in zip(gen_len, output_data)])
-        trimmed = [self.trim_with_stopwords(d, stop_words) for d in decoded]
-
+
         choices = []
-        for i, (text, tokens, lps, g) in enumerate(zip(trimmed, output_data, lp_data, gen_len)):
-            reason = "length" if max_tokens == g else "stop"
-            if lps is not None:
-                tokens_str = [self.tokenizer.decode([t]) for t in tokens[prompt_len:prompt_len + g]]
-                offsets = [len(prompt)] + (np.cumsum([len(t) for t in tokens_str]) + len(prompt)).tolist()[:-1]
-
-                # Fake some log probs for top_logprobs
-                top_logprobs = []
-                for ii, t in enumerate(tokens_str):
-                    fakedict = {}
-                    top_token_lp = float(lps[ii])
-                    fakedict[t] = top_token_lp
-                    while len(fakedict) < num_logprobs:
-                        random_token = random.randint(0, self.tokenizer.get_vocab_size() - 1)
-                        random_token_str = self.tokenizer.decode([random_token])
-                        if random_token_str in fakedict:
-                            continue
-                        random_token_lp = top_token_lp - random.random()
-                        fakedict[random_token_str] = random_token_lp
-                    top_logprobs.append(fakedict)
-
-                lpdict = {
-                    'token_logprobs': lps.tolist(),
-                    'top_logprobs': top_logprobs,
-                    'tokens': tokens_str,
-                    'text_offset': offsets,
-                }
-            else:
-                lpdict = None
-
-            choice = {
-                'text': text,
-                'index': i,
-                'finish_reason': reason,
-                'logprobs': lpdict,
-            }
-            choices.append(choice)
+        text = pipe(prompt, do_sample=True, top_p=0.95, max_new_tokens=50)[0]['generated_text']
+        choice = {
+            'text': text,
+            'index': 0,
+            'finish_reason': "stop",
+            'logprobs': None,
+        }
+        choices.append(choice)
 
         completion = {
             'id': None, # fill in
@@ -219,9 +104,9 @@ class CodeGenProxy:
             'created': int(time.time()),
             'choices': None, # fill in
             'usage': {
-                'completion_tokens': int(gen_len.sum()),
-                'prompt_tokens': int(prompt_len),
-                'total_tokens': int(gen_len.sum() + prompt_len),
+                'completion_tokens': int(50),
+                'prompt_tokens': int(50),
+                'total_tokens': int(100),
             }
         }
         return completion, choices
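
Note: the new generate() path hard-codes the usage block (50/50/100 tokens) and still leaves 'id' and 'choices' as "fill in" placeholders. A possible follow-up, using the module-level tokenizer loaded above, is sketched below; this is a suggestion, not part of the commit, and build_completion is a hypothetical helper name.

import time
import uuid

def build_completion(prompt, choices, tokenizer):
    # Count tokens with the same tokenizer used for generation.
    # Note: the pipeline's generated_text includes the prompt, so completion_tokens
    # over-counts slightly unless the prompt prefix is stripped from each choice first.
    prompt_tokens = len(tokenizer.encode(prompt))
    completion_tokens = sum(len(tokenizer.encode(c['text'])) for c in choices)
    return {
        'id': f'cmpl-{uuid.uuid4().hex}',  # instead of the literal completion['id'] = 10
        'model': 'codegen',
        'object': 'text_completion',
        'created': int(time.time()),
        'choices': choices,
        'usage': {
            'completion_tokens': int(completion_tokens),
            'prompt_tokens': int(prompt_tokens),
            'total_tokens': int(completion_tokens + prompt_tokens),
        },
    }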