ncoop57 committed on
Commit
b97f6e6
1 Parent(s): c457faa

Initial Commit with starter container

Browse files
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim-buster

# The application loads its tokenizer from /python-docker/cgtok/tokenizer.json,
# so the sources must end up directly under this directory.
WORKDIR /python-docker

# Copy the dependency list on its own first so the pip layer is only rebuilt
# when requirements.txt changes. The file sits at the repository root in this
# commit (there is no copilot_proxy/ directory to copy from).
COPY requirements.txt requirements.txt

RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the application (app.py, models.py, config/, utils/, cgtok/) from the
# repository root into the working directory.
COPY . .

EXPOSE 5000

CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
  title: Santacoder Openai
3
- emoji: 📊
4
  colorFrom: gray
5
  colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Santacoder Openai
3
+ emoji: 🤖
4
  colorFrom: gray
5
  colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
+ This space uses the awesome FauxPilot Copilot server from this [repo](https://github.com/fauxpilot/fauxpilot/tree/main/copilot_proxy).
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ import uvicorn
5
+ from fastapi import FastAPI, Request, Response
6
+ from fastapi.responses import JSONResponse
7
+ from sse_starlette.sse import EventSourceResponse
8
+
9
+ from config.log_config import uvicorn_logger
10
+ from models import OpenAIinput
11
+ from utils.codegen import CodeGenProxy
12
+ from utils.errors import FauxPilotException
13
+
14
# `import logging` alone does not load the `config` submodule; import it
# explicitly instead of relying on another package having done so already.
import logging.config

logging.config.dictConfig(uvicorn_logger)

# Environment variables always arrive as strings, so they are converted here:
# the port becomes an int, and verbosity is switched off for the usual
# "false-like" spellings (a bare string such as "false" or "0" would otherwise
# be truthy). Any other non-empty value still enables verbose mode.
codegen = CodeGenProxy(
    host=os.environ.get("TRITON_HOST", "triton"),
    port=int(os.environ.get("TRITON_PORT", 8001)),
    verbose=os.environ.get("TRITON_VERBOSITY", "").strip().lower()
    not in ("", "0", "false", "no", "off"),
)

app = FastAPI(
    title="FauxPilot",
    # The two literals are concatenated, so the first one must end with a space.
    description="This is an attempt to build a locally hosted version of GitHub Copilot. It uses the SalesForce CodeGen "
                "models inside of NVIDIA's Triton Inference Server with the FasterTransformer backend.",
    docs_url="/",
    swagger_ui_parameters={"defaultModelsExpandDepth": -1}
)
29
+
30
@app.exception_handler(FauxPilotException)
async def fauxpilot_handler(request: Request, exc: FauxPilotException):
    """Turn a FauxPilotException into an HTTP 400 JSON response.

    The response body is whatever ``exc.json()`` returns; ``request`` is
    accepted because the handler signature requires it but is not used.
    """
    error_body = exc.json()
    return JSONResponse(content=error_body, status_code=400)
36
+
37
@app.post("/v1/engines/codegen/completions")
@app.post("/v1/completions")
async def completions(data: OpenAIinput):
    """Serve a completion request on both supported routes.

    The validated request is converted to a plain dict and handed to the
    module-level ``codegen`` proxy. Its result is returned as a server-sent
    event stream when the client asked for streaming, otherwise as a JSON
    response.

    Raises:
        FauxPilotException: when the proxy reports that the request exceeds
            the model's token limit (rendered as HTTP 400 by the registered
            exception handler).
    """
    data = data.dict()
    try:
        content = codegen(data=data)
    except codegen.TokensExceedsMaximum as E:
        # Chain the original error so the traceback keeps the root cause.
        raise FauxPilotException(
            message=str(E),
            type="invalid_request_error",
            param=None,
            code=None,
        ) from E

    # The proxy streams only when `stream` is truthy, so the same test must be
    # used here. Checking `is not None` would wrap the plain JSON string that
    # is produced for an explicit `"stream": false` in an event stream.
    if data.get("stream"):
        return EventSourceResponse(
            content=content,
            status_code=200,
            media_type="text/event-stream"
        )
    return Response(
        status_code=200,
        content=content,
        media_type="application/json"
    )
63
+
64
if __name__ == "__main__":
    # Executed directly as a script: start uvicorn on all interfaces, port 5000.
    server_options = {"host": "0.0.0.0", "port": 5000}
    uvicorn.run("app:app", **server_options)
cgtok/added_tokens.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\t\t": 50294,
3
+ "\t\t\t": 50293,
4
+ "\t\t\t\t": 50292,
5
+ "\t\t\t\t\t": 50291,
6
+ "\t\t\t\t\t\t": 50290,
7
+ "\t\t\t\t\t\t\t": 50289,
8
+ "\t\t\t\t\t\t\t\t": 50288,
9
+ "\t\t\t\t\t\t\t\t\t": 50287,
10
+ " ": 50286,
11
+ " ": 50285,
12
+ " ": 50284,
13
+ " ": 50283,
14
+ " ": 50282,
15
+ " ": 50281,
16
+ " ": 50280,
17
+ " ": 50279,
18
+ " ": 50278,
19
+ " ": 50277,
20
+ " ": 50276,
21
+ " ": 50275,
22
+ " ": 50274,
23
+ " ": 50273,
24
+ " ": 50272,
25
+ " ": 50271,
26
+ " ": 50270,
27
+ " ": 50269,
28
+ " ": 50268,
29
+ " ": 50267,
30
+ " ": 50266,
31
+ " ": 50265,
32
+ " ": 50264,
33
+ " ": 50263,
34
+ " ": 50262,
35
+ " ": 50261,
36
+ " ": 50260,
37
+ " ": 50259,
38
+ " ": 50258,
39
+ " ": 50257
40
+ }
cgtok/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
cgtok/openai_format/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
cgtok/openai_format/vocab.bpe ADDED
The diff for this file is too large to render. See raw diff
 
cgtok/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
5
+ }
cgtok/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
cgtok/tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "Salesforce/codegen-350M-mono",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "CodeGenTokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
cgtok/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config/__init__.py ADDED
File without changes
config/log_config.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Logging configuration (dictConfig schema, version 1) for the "uvicorn.access"
# logger. Its purpose is to add a timestamp (%(asctime)s) to each access line.
uvicorn_logger = dict(
    version=1,
    disable_existing_loggers=False,
    formatters={
        "access": {
            # "()" tells dictConfig which factory to call to build the formatter.
            "()": "uvicorn.logging.AccessFormatter",
            "fmt": '%(levelprefix)s %(asctime)s :: %(client_addr)s - "%(request_line)s" %(status_code)s',
            "use_colors": True,
        },
    },
    handlers={
        "access": {
            "formatter": "access",
            "class": "logging.StreamHandler",
            "stream": "ext://sys.stdout",
        },
    },
    loggers={
        # No explicit "level" is set for this logger; propagation is disabled.
        "uvicorn.access": {
            "handlers": ["access"],
            "propagate": False,
        },
    },
)
models.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
class OpenAIinput(BaseModel):
    """Request body accepted by the completion endpoints.

    Every field except ``model`` is optional. Fields without a meaningful
    default are spelled out as ``None`` so the "not supplied" value is visible
    at a glance.
    """

    model: str = "fastertransformer"
    prompt: Optional[str] = None
    suffix: Optional[str] = None
    max_tokens: Optional[int] = 16
    temperature: Optional[float] = 0.6
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
    stream: Optional[bool] = None
    logprobs: Optional[int] = None
    echo: Optional[bool] = None
    # Either a single stop string or a list of them.
    stop: Optional[Union[str, list]] = None
    presence_penalty: Optional[float] = 0
    frequency_penalty: Optional[float] = 1
    best_of: Optional[int] = 1
    logit_bias: Optional[dict] = None
    user: Optional[str] = None
23
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi==0.82.0
2
+ numpy==1.23.2
3
+ sse-starlette==1.1.6
4
+ tokenizers==0.12.1
5
+ tritonclient[all]==2.25.0
6
+ uvicorn==0.18.3
utils/__init__.py ADDED
File without changes
utils/codegen.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import string
4
+ import time
5
+
6
+ import numpy as np
7
+ import tritonclient.grpc as client_util
8
+ from tokenizers import Tokenizer
9
+ from tritonclient.utils import np_to_triton_dtype, InferenceServerException
10
+
11
# NOTE(review): the results of these two calls are discarded, so they presumably
# exist only to initialise NumPy's float-info data at import time - confirm
# before removing.
np.finfo(np.dtype("float32"))
np.finfo(np.dtype("float64"))


class CodeGenProxy:
    """Turn OpenAI-style completion requests into Triton inference calls.

    An instance tokenizes the prompt, builds the input tensors, sends them to
    the Triton server, decodes the returned token ids and formats the result
    as an OpenAI-like completion (either a JSON string or a stream of
    server-sent-event lines).
    """

    def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
        """Load the tokenizer and open a gRPC client to ``host:port``."""
        self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
        self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
        # Token id sent as both the start id and the end id of every request.
        self.PAD_CHAR = 50256

        # Max number of tokens the model can handle
        self.MAX_MODEL_LEN = 2048

    class TokensExceedsMaximum(Exception):
        """Raised when prompt tokens plus requested tokens exceed MAX_MODEL_LEN."""

    @staticmethod
    def _option(data, key, default):
        """Return ``data[key]``, or ``default`` when the key is missing or None.

        Request fields are optional, so a client may send an explicit null;
        that is treated the same as leaving the field out.
        """
        value = data.get(key)
        return default if value is None else value

    @staticmethod
    def prepare_tensor(name: str, tensor_input):
        """Wrap a NumPy array in a Triton ``InferInput`` named ``name``."""
        t = client_util.InferInput(
            name, tensor_input.shape, np_to_triton_dtype(tensor_input.dtype))
        t.set_data_from_numpy(tensor_input)
        return t

    @staticmethod
    def trim_with_stopwords(output: str, stopwords: 'str | list') -> str:
        """Strip one trailing stop word from ``output``.

        The longest matching stop word wins. ``stopwords`` may be a list, a
        single string, or empty/None (in which case ``output`` is returned
        unchanged). Empty stop words are ignored.
        """
        if not stopwords:
            return output
        if isinstance(stopwords, str):
            # A bare string would otherwise be iterated character by character.
            stopwords = [stopwords]
        for w in sorted(stopwords, key=len, reverse=True):
            # Every string "ends with" '', and output[:-0] is the empty string,
            # so an empty stop word must never be allowed to match.
            if w and output.endswith(w):
                return output[:-len(w)]
        return output

    @staticmethod
    def to_word_list_format(word_dict, tokenizer):
        """Encode lists of words into the array layout used for stop words.

        ``word_dict`` is a list of word lists. For each word list the token
        ids of all words are concatenated into one row and the cumulative
        end offsets of the words into a second row. Rows are right-padded to
        a common length (ids with 0, offsets with -1).

        Returns:
            An int32 array of shape ``(len(word_dict), 2, pad_to)``.
        """
        flat_ids = []
        offsets = []
        for word_dict_item in word_dict:
            item_flat_ids = []
            item_offsets = []

            for word in word_dict_item:
                ids = tokenizer.encode(word).ids

                # Words that encode to nothing contribute no entry.
                if len(ids) == 0:
                    continue

                item_flat_ids += ids
                item_offsets.append(len(ids))

                # Hack, can we do this better?
                # A double newline is additionally registered as two separate
                # tokens with id 198.
                if word == '\n\n':
                    item_flat_ids += [198, 198]
                    item_offsets.append(2)

            flat_ids.append(np.array(item_flat_ids))
            offsets.append(np.cumsum(np.array(item_offsets)))

        pad_to = max(1, max(len(ids) for ids in flat_ids))

        for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
            flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
            offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)

        return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))

    def generate(self, data):
        """Run one completion request against the Triton server.

        Args:
            data: request dict. ``prompt`` and ``model`` are required; ``n``,
                ``max_tokens``, ``logprobs``, ``temperature``, ``top_k``,
                ``top_p``, ``frequency_penalty`` and ``stop`` are optional.

        Returns:
            A ``(completion, choices)`` tuple: ``completion`` is the response
            envelope with ``id`` and ``choices`` still set to None, and
            ``choices`` is the list of per-sample result dicts.

        Raises:
            TokensExceedsMaximum: if prompt tokens plus ``max_tokens`` exceed
                ``MAX_MODEL_LEN``.
            RuntimeError: if the server response has no ``output_ids``.
        """
        prompt = data['prompt']
        n = self._option(data, 'n', 1)
        model_name = data["model"]
        # ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
        # i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
        np_type = np.int32 if model_name.startswith("py-") else np.uint32

        # One row of prompt token ids per requested sample.
        input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
        batch_size, prompt_len = input_start_ids.shape

        def column(dtype):
            # (batch_size, 1) array of ones; scaled below to build each
            # per-sample scalar input.
            return np.ones([batch_size, 1]).astype(dtype)

        input_len = prompt_len * column(np_type)
        max_tokens = self._option(data, 'max_tokens', 16)
        prompt_tokens = int(prompt_len)
        requested_tokens = max_tokens + prompt_tokens
        if requested_tokens > self.MAX_MODEL_LEN:
            raise self.TokensExceedsMaximum(
                f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
                f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
                f"Please reduce your prompt; or completion length."
            )
        output_len = np.ones_like(input_len).astype(np_type) * max_tokens
        # A missing key disables log probs (-1); an explicit None requests one.
        num_logprobs = data.get('logprobs', -1)
        if num_logprobs is None:
            num_logprobs = 1
        want_logprobs = num_logprobs > 0

        temperature = self._option(data, 'temperature', 0.2)
        if temperature == 0.0:
            # Temperature 0 is mapped to top-1 sampling at temperature 1.
            temperature = 1.0
            top_k = 1
        else:
            top_k = self._option(data, 'top_k', 0)

        top_p = self._option(data, 'top_p', 1.0)
        frequency_penalty = self._option(data, 'frequency_penalty', 1.0)
        runtime_top_k = top_k * column(np_type)
        runtime_top_p = top_p * column(np.float32)
        beam_search_diversity_rate = 0.0 * column(np.float32)
        random_seed = np.random.randint(0, 2 ** 31 - 1, (batch_size, 1), dtype=np.int32)
        temperature = temperature * column(np.float32)
        len_penalty = 1.0 * column(np.float32)
        repetition_penalty = frequency_penalty * column(np.float32)
        is_return_log_probs = want_logprobs * column(np.bool_)
        beam_width = (1 * np.ones([batch_size, 1])).astype(np_type)
        start_ids = self.PAD_CHAR * column(np_type)
        end_ids = self.PAD_CHAR * column(np_type)

        stop_words = data.get('stop', [])
        if stop_words is None:
            stop_words = []
        elif isinstance(stop_words, str):
            # The request schema allows a single stop string; treat it as a
            # one-element list so it is not split into characters.
            stop_words = [stop_words]
        if stop_words:
            stop_word_list = np.repeat(self.to_word_list_format([stop_words], self.tokenizer), batch_size,
                                       axis=0)
        else:
            # Placeholder "no words" entry: a single id 0 with offset -1.
            stop_word_list = np.concatenate([np.zeros([batch_size, 1, 1]).astype(
                np.int32), (-1 * np.ones([batch_size, 1, 1])).astype(np.int32)], axis=1)

        # Not used
        bad_words_list = np.concatenate([np.zeros([batch_size, 1, 1]).astype(
            np.int32), (-1 * np.ones([batch_size, 1, 1])).astype(np.int32)], axis=1)

        inputs = [
            self.prepare_tensor("input_ids", input_start_ids),
            self.prepare_tensor("input_lengths", input_len),
            self.prepare_tensor("request_output_len", output_len),
            self.prepare_tensor("runtime_top_k", runtime_top_k),
            self.prepare_tensor("runtime_top_p", runtime_top_p),
            self.prepare_tensor("beam_search_diversity_rate", beam_search_diversity_rate),
            self.prepare_tensor("random_seed", random_seed),
            self.prepare_tensor("temperature", temperature),
            self.prepare_tensor("len_penalty", len_penalty),
            self.prepare_tensor("repetition_penalty", repetition_penalty),
            self.prepare_tensor("is_return_log_probs", is_return_log_probs),
            self.prepare_tensor("beam_width", beam_width),
            self.prepare_tensor("start_id", start_ids),
            self.prepare_tensor("end_id", end_ids),
            self.prepare_tensor("bad_words_list", bad_words_list),
            self.prepare_tensor("stop_words_list", stop_word_list),
        ]

        result = self.client.infer(model_name, inputs)

        output_data = result.as_numpy("output_ids")
        if output_data is None:
            raise RuntimeError("No output data")

        # All of these squeeze(1)s are to remove the beam width dimension.
        output_data = output_data.squeeze(1)
        if want_logprobs:
            lp_data = result.as_numpy("output_log_probs").squeeze(1)
            # clp_data = result.as_numpy("cum_log_probs").squeeze(1)
        else:
            lp_data = [None] * output_data.shape[0]
        sequence_lengths = result.as_numpy("sequence_length").squeeze(1)
        gen_len = sequence_lengths - input_len.squeeze(1)

        # Decode only the generated part of each row (the prompt is skipped).
        decoded = self.tokenizer.decode_batch([out[prompt_len:prompt_len + g] for g, out in zip(gen_len, output_data)])
        trimmed = [self.trim_with_stopwords(d, stop_words) for d in decoded]

        choices = []
        for i, (text, tokens, lps, g) in enumerate(zip(trimmed, output_data, lp_data, gen_len)):
            reason = "length" if max_tokens == g else "stop"
            if lps is not None:
                tokens_str = [self.tokenizer.decode([t]) for t in tokens[prompt_len:prompt_len + g]]
                offsets = [len(prompt)] + (np.cumsum([len(t) for t in tokens_str]) + len(prompt)).tolist()[:-1]

                # Fake some log probs for top_logprobs
                top_logprobs = []
                for ii, t in enumerate(tokens_str):
                    fakedict = {}
                    top_token_lp = float(lps[ii])
                    fakedict[t] = top_token_lp
                    while len(fakedict) < num_logprobs:
                        random_token = random.randint(0, self.tokenizer.get_vocab_size() - 1)
                        random_token_str = self.tokenizer.decode([random_token])
                        if random_token_str in fakedict:
                            continue
                        random_token_lp = top_token_lp - random.random()
                        fakedict[random_token_str] = random_token_lp
                    top_logprobs.append(fakedict)

                lpdict = {
                    'token_logprobs': lps.tolist(),
                    'top_logprobs': top_logprobs,
                    'tokens': tokens_str,
                    'text_offset': offsets,
                }
            else:
                lpdict = None

            choice = {
                'text': text,
                'index': i,
                'finish_reason': reason,
                'logprobs': lpdict,
            }
            choices.append(choice)

        completion = {
            'id': None,  # fill in
            'model': 'codegen',
            'object': 'text_completion',
            'created': int(time.time()),
            'choices': None,  # fill in
            'usage': {
                'completion_tokens': int(gen_len.sum()),
                'prompt_tokens': int(prompt_len),
                'total_tokens': int(gen_len.sum() + prompt_len),
            }
        }
        return completion, choices

    @staticmethod
    def random_completion_id():
        """Return 'cmpl-' followed by 29 random ASCII letters and digits."""
        return 'cmpl-' + ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(29))

    def streamed_response(self, completion, choices):
        """Yield one server-sent-event line per choice, then a [DONE] marker.

        ``completion`` is updated in place for every choice: it gets a fresh
        random id and a one-element ``choices`` list before being serialized.
        """
        for c in choices:
            completion['id'] = self.random_completion_id()
            completion['choices'] = [c]
            yield f'data: {json.dumps(completion)}\n\n'
        yield 'data: [DONE]\n\n'

    def non_streamed_response(self, completion, choices) -> str:
        """Fill in ``id`` and ``choices`` on ``completion`` and return it as JSON."""
        completion['id'] = self.random_completion_id()
        completion['choices'] = choices
        return json.dumps(completion)

    def __call__(self, data: dict):
        """Generate a completion for ``data`` and format the response.

        An ``InferenceServerException`` from the server is printed and
        answered with an empty completion instead of being propagated; other
        exceptions (including ``TokensExceedsMaximum``) pass through. The
        elapsed time is printed.

        Returns:
            A generator of event lines when ``data['stream']`` is truthy,
            otherwise a JSON string.
        """
        st = time.time()
        try:
            completion, choices = self.generate(data)
        except InferenceServerException as E:
            print(E)
            completion = {}
            choices = []
        ed = time.time()
        print(f"Returned completion in {(ed - st) * 1000} ms")
        if data.get('stream', False):
            return self.streamed_response(completion, choices)
        else:
            return self.non_streamed_response(completion, choices)
utils/errors.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+
3
class FauxPilotException(Exception):
    """Application error carrying the fields of an OpenAI-style error object."""

    # Attribute names exported by json(), in output order.
    _ERROR_FIELDS = ('message', 'type', 'param', 'code')

    def __init__(self, message: str, type: Optional[str] = None, param: Optional[str] = None, code: Optional[int] = None):
        """Store the error details; ``message`` also becomes the exception text."""
        super().__init__(message)
        details = (message, type, param, code)
        for field_name, field_value in zip(self._ERROR_FIELDS, details):
            setattr(self, field_name, field_value)

    def json(self):
        """Return the error as ``{'error': {message, type, param, code}}``."""
        return {'error': {field_name: getattr(self, field_name) for field_name in self._ERROR_FIELDS}}