jadehardouin committed on
Commit
7769b47
·
1 Parent(s): 80b9501

Update models.py

Browse files
Files changed (1) hide show
  1. models.py +75 -141
models.py CHANGED
@@ -14,9 +14,6 @@ class BaseTCOModel(ABC):
14
  def __init__(self):
15
  super(BaseTCOModel, self).__setattr__("_components", [])
16
  self.use_case = None
17
- self.num_users = None
18
- self.input_tokens = None
19
- self.output_tokens = None
20
 
21
  def get_components(self) -> list[Component]:
22
  return self._components
@@ -61,7 +58,7 @@ class OpenAIModel(BaseTCOModel):
61
  self.set_formula(r"""$CR = \frac{CIT\_1K \times IT + COT\_1K \times OT}{1000}$ <br>
62
  with: <br>
63
  CR = Cost per Request <br>
64
- CIT_1K = Cost per 1000 Input Tokens (from OpenAI's pricing web page) <br>
65
  COT_1K = Cost per 1000 Output Tokens <br>
66
  IT = Input Tokens <br>
67
  OT = Output Tokens
@@ -79,45 +76,59 @@ class OpenAIModel(BaseTCOModel):
79
  self.latency = "5s"
80
  return gr.Dropdown.update(choices=["4K", "16K"], value="4K")
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  self.model = gr.Dropdown(["GPT-4", "GPT-3.5 Turbo"], value="GPT-4",
83
  label="OpenAI models",
84
  interactive=True, visible=False)
85
  self.context_length = gr.Dropdown(["8K", "32K"], value="8K", interactive=True,
86
  label="Context size",
87
  visible=False, info="Number of tokens the model considers when processing text")
88
- self.model.change(on_model_change, inputs=self.model, outputs=self.context_length)
89
-
90
- def compute_cost_per_token(self, model, context_length):
91
- """Cost per token = """
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- if model == "GPT-4" and context_length == "8K":
94
- cost_per_1k_input_tokens = 0.03
95
- cost_per_1k_output_tokens = 0.06
96
- elif model == "GPT-4" and context_length == "32K":
97
- cost_per_1k_input_tokens = 0.06
98
- cost_per_1k_output_tokens = 0.12
99
- elif model == "GPT-3.5" and context_length == "4K":
100
- cost_per_1k_input_tokens = 0.0015
101
- cost_per_1k_output_tokens = 0.002
102
- else:
103
- cost_per_1k_input_tokens = 0.003
104
- cost_per_1k_output_tokens = 0.004
105
- cost_per_input_token = (cost_per_1k_input_tokens / 1000)
106
- cost_per_output_token = (cost_per_1k_output_tokens / 1000)
107
 
108
- return cost_per_input_token, cost_per_output_token
109
 
110
  class OpenSourceLlama2Model(BaseTCOModel):
111
 
112
  def __init__(self):
113
- self.set_name("(Open source) Llama 2")
114
- self.set_formula(r"""$CT = \frac{VM\_CH \times 100}{3600 \times U} \times (\frac{IT}{ITS} + \frac{OT}{OTS})$<br>
115
  with: <br>
116
- CT = Cost per Token <br>
117
- VM_CH = VM Cost per Hour <br>
118
- ITS = Input Tokens per Second <br>
119
- OTS = Output Tokens per Second <br>
120
- U = Used <br>
121
  IT = Input Tokens <br>
122
  OT = Output Tokens
123
  """)
@@ -125,118 +136,37 @@ class OpenSourceLlama2Model(BaseTCOModel):
125
  super().__init__()
126
 
127
  def render(self):
128
- vm_choices = ["1x Nvidia A100 (Azure NC24ads A100 v4)",
129
- "2x Nvidia A100 (Azure NC24ads A100 v4)",
130
- "2x Nvidia A100 (Azure ND96amsr A100 v4)"]
131
-
132
- def on_model_change(model):
133
- if model == "Llama 2 7B":
134
- return [gr.Dropdown.update(choices=vm_choices),
135
- gr.Markdown.update(value="To see the benchmark results use for the Llama2-7B model, [click here](https://example.com/script)"),
136
- gr.Number.update(value=3.6730),
137
- gr.Number.update(value=694.38),
138
- gr.Number.update(value=694.38),
139
- ]
140
- else:
141
- not_supported_vm = ["1x Nvidia A100 (Azure NC24ads A100 v4)", "2x Nvidia A100 (Azure NC24ads A100 v4)"]
142
- choices = [x for x in vm_choices if x not in not_supported_vm]
143
- return [gr.Dropdown.update(choices=choices, value="2x Nvidia A100 (Azure ND96amsr A100 v4)"),
144
- gr.Markdown.update(value="To see the benchmark results used for the Llama2-70B model, [click here](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper)"),
145
- gr.Number.update(value=2*37.186),
146
- gr.Number.update(value=2860),
147
- gr.Number.update(value=18.545),
148
- ]
149
-
150
- def on_vm_change(model, vm):
151
- # TO DO: load info from CSV
152
- if model == "Llama 2 7B" and vm == "1x Nvidia A100 (Azure NC24ads A100 v4)":
153
- return [gr.Number.update(value=4.777), gr.Number.update(value=694.38), gr.Number.update(value=694.38)]
154
- elif model == "Llama 2 7B" and vm == "2x Nvidia A100 (Azure NC24ads A100 v4)":
155
- return [gr.Number.update(value=2*4.777), gr.Number.update(value=1388.76), gr.Number.update(value=1388.76)]
156
- elif model == "Llama 2 7B" and vm == "2x Nvidia A100 (Azure ND96amsr A100 v4)":
157
- return [gr.Number.update(value=2*37.186), gr.Number.update(value=2777.52), gr.Number.update(value=2777.52)]
158
- elif model == "Llama 2 70B" and vm == "2x Nvidia A100 (Azure ND96amsr A100 v4)":
159
- return [gr.Number.update(value=2*37.186), gr.Number.update(value=2860), gr.Number.update(value=18.449)]
160
 
161
- self.model = gr.Dropdown(["Llama 2 7B", "Llama 2 70B"], value="Llama 2 70B", label="OpenSource models", visible=False)
162
- self.vm = gr.Dropdown(choices=["2x Nvidia A100 (Azure ND96amsr A100 v4)"],
163
- value="2x Nvidia A100 (Azure ND96amsr A100 v4)",
164
  visible=False,
165
  label="Instance of VM with GPU",
166
- info="Your options for this choice depend on the model you previously chose"
167
  )
168
- self.vm_cost_per_hour = gr.Number(2*37.186, label="VM instance cost per hour",
169
  interactive=False, visible=False)
170
- self.input_tokens_per_second = gr.Number(2860, visible=False,
171
- label="Number of output tokens per second for this specific model and VM instance",
172
  interactive=False
173
  )
174
- self.output_tokens_per_second = gr.Number(18.449, visible=False,
175
- label="Number of output tokens per second for this specific model and VM instance",
176
  interactive=False
177
  )
178
- self.info = gr.Markdown("To see the script used to benchmark the Llama2-70B model, [click here](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper)", interactive=False, visible=False)
179
 
180
- self.model.change(on_model_change, inputs=self.model, outputs=[self.vm, self.info, self.vm_cost_per_hour, self.input_tokens_per_second, self.output_tokens_per_second])
181
- self.vm.change(on_vm_change, inputs=[self.model, self.vm], outputs=[self.vm_cost_per_hour, self.input_tokens_per_second, self.output_tokens_per_second])
182
- self.used = gr.Slider(minimum=0.01, value=30., step=0.01, label="% used",
183
- info="Percentage of time the GPU is used",
184
- interactive=True,
185
- visible=False)
186
-
187
- def compute_cost_per_token(self, vm_cost_per_hour, input_tokens_per_second, output_tokens_per_second, used):
188
- cost_per_input_token = vm_cost_per_hour * 100 / (3600 * used * input_tokens_per_second)
189
- cost_per_output_token = vm_cost_per_hour * 100 / (3600 * used * output_tokens_per_second)
190
- return cost_per_input_token, cost_per_output_token
191
-
192
- class OpenSourceDIY(BaseTCOModel):
193
-
194
- def __init__(self):
195
- self.set_name("(Open source) DIY")
196
- self.set_formula(r"""$CT = \frac{VM\_CH \times 100}{3600 \times U} \times (\frac{IT}{ITS} + \frac{OT}{OTS})$<br>
197
- with: <br>
198
- CT = Cost per Token <br>
199
- VM_CH = VM Cost per Hour <br>
200
- ITS = Input Tokens per Second <br>
201
- OTS = Output Tokens per Second <br>
202
- U = Used <br>
203
- IT = Input Tokens <br>
204
- OT = Output Tokens
205
- """)
206
- self.set_latency("The latency can't be estimated in the DIY scenario for the model isn't defined")
207
- super().__init__()
208
-
209
- def render(self):
210
- self.info = gr.Markdown("Compute the cost/token based on our formula below, using your own parameters", visible=False)
211
- self.display_formula = gr.Markdown(r"""$CT = \frac{VM\_CH \times 100}{3600 \times U} \times (\frac{IT}{ITS} + \frac{OT}{OTS})$<br>
212
- with: <br>
213
- CT = Cost per Token <br>
214
- VM_CH = VM Cost per Hour <br>
215
- ITS = Input Tokens per Second <br>
216
- OTS = Output Tokens per Second <br>
217
- U = Used <br>
218
- IT = Input Tokens <br>
219
- OT = Output Tokens
220
- """, visible=False)
221
- self.vm_cost_per_hour = gr.Number(3.5, label="VM instance cost per hour",
222
- interactive=True, visible=False)
223
- self.input_tokens_per_second = gr.Number(300, visible=False,
224
- label="Number of input tokens per second processed for this specific model and VM instance",
225
- interactive=True
226
- )
227
- self.output_tokens_per_second = gr.Number(300, visible=False,
228
- label="Number of output tokens per second processed for this specific model and VM instance",
229
- interactive=True
230
- )
231
- self.used = gr.Slider(minimum=0.01, value=50., step=0.01, label="% used",
232
- info="Percentage of time the GPU is used",
233
- interactive=True,
234
- visible=False)
235
 
236
- def compute_cost_per_token(self, vm_cost_per_hour, input_tokens_per_second, output_tokens_per_second, used):
237
- cost_per_input_token = vm_cost_per_hour * 100 / (3600 * used * input_tokens_per_second)
238
- cost_per_output_token = vm_cost_per_hour * 100 / (3600 * used * output_tokens_per_second)
239
- return cost_per_input_token, cost_per_output_token
240
 
241
  class CohereModel(BaseTCOModel):
242
 
@@ -262,8 +192,13 @@ class CohereModel(BaseTCOModel):
262
  self.model: gr.Dropdown.update(choices=["Default", "Custom"])
263
  else:
264
  self.model: gr.Dropdown.update(choices=["Default", "Custom"])
 
 
 
 
 
265
 
266
- def compute_cost_per_token(self, model):
267
  """Cost per token = """
268
  use_case = self.use_case
269
 
@@ -279,7 +214,7 @@ class CohereModel(BaseTCOModel):
279
  cost_per_input_token = cost_per_1M_tokens / 1000000
280
  cost_per_output_token = cost_per_1M_tokens / 1000000
281
 
282
- return cost_per_input_token, cost_per_output_token
283
 
284
  class ModelPage:
285
 
@@ -306,7 +241,7 @@ class ModelPage:
306
  output += model.get_components_for_cost_computing()
307
  return output
308
 
309
- def make_model_visible(self, name:str, use_case: gr.Dropdown, num_users: gr.Number, input_tokens: gr.Slider, output_tokens: gr.Slider):
310
  # First decide which indexes
311
  output = []
312
  for model in self.models:
@@ -314,26 +249,25 @@ class ModelPage:
314
  output+= [gr.update(visible=True)] * len(model.get_components())
315
  # Set use_case and num_users values in the model
316
  model.use_case = use_case
317
- model.num_users = num_users
318
- model.input_tokens = input_tokens
319
- model.output_tokens = output_tokens
320
  else:
321
  output+= [gr.update(visible=False)] * len(model.get_components())
322
  return output
323
 
324
  def compute_cost_per_token(self, *args):
325
  begin=0
326
- current_model = args[-1]
 
 
327
  for model in self.models:
328
  model_n_args = len(model.get_components_for_cost_computing())
329
  if current_model == model.get_name():
330
 
331
  model_args = args[begin:begin+model_n_args]
332
- cost_per_input_token, cost_per_output_token = model.compute_cost_per_token(*model_args)
333
- model_tco = cost_per_input_token * model.input_tokens + cost_per_output_token * model.output_tokens
334
  formula = model.get_formula()
335
  latency = model.get_latency()
336
 
337
- return f"Model {current_model} has a cost/request of: ${model_tco}", model_tco, formula, f"The average latency of this model is {latency}"
338
 
339
  begin = begin+model_n_args
 
14
  def __init__(self):
15
  super(BaseTCOModel, self).__setattr__("_components", [])
16
  self.use_case = None
 
 
 
17
 
18
  def get_components(self) -> list[Component]:
19
  return self._components
 
58
  self.set_formula(r"""$CR = \frac{CIT\_1K \times IT + COT\_1K \times OT}{1000}$ <br>
59
  with: <br>
60
  CR = Cost per Request <br>
61
+ CIT_1K = Cost per 1000 Input Tokens <br>
62
  COT_1K = Cost per 1000 Output Tokens <br>
63
  IT = Input Tokens <br>
64
  OT = Output Tokens
 
76
  self.latency = "5s"
77
  return gr.Dropdown.update(choices=["4K", "16K"], value="4K")
78
 
79
def define_cost_per_token(model, context_length):
    """Return the price per 1K input and per 1K output tokens for an OpenAI model.

    Args:
        model: Model label as selected in the "OpenAI models" dropdown.
        context_length: Context-size label ("8K", "32K", "4K", ...).

    Returns:
        A ``(cost_per_1k_input_tokens, cost_per_1k_output_tokens)`` tuple in
        dollars.
    """
    if model == "GPT-4" and context_length == "8K":
        return 0.03, 0.06
    if model == "GPT-4" and context_length == "32K":
        return 0.06, 0.12
    # The dropdown offers "GPT-3.5 Turbo", so matching only the bare "GPT-3.5"
    # spelling made this tier unreachable from the UI and priced the 4K
    # context at the catch-all rate below. The short spelling is still
    # accepted so existing callers keep working.
    if model in ("GPT-3.5", "GPT-3.5 Turbo") and context_length == "4K":
        return 0.0015, 0.002
    # Catch-all tier, unchanged from the original: every other combination
    # (in practice GPT-3.5 Turbo with the 16K context).
    return 0.003, 0.004
93
+
94
  self.model = gr.Dropdown(["GPT-4", "GPT-3.5 Turbo"], value="GPT-4",
95
  label="OpenAI models",
96
  interactive=True, visible=False)
97
  self.context_length = gr.Dropdown(["8K", "32K"], value="8K", interactive=True,
98
  label="Context size",
99
  visible=False, info="Number of tokens the model considers when processing text")
100
+ self.input_tokens_cost_per_second = gr.Number(0.03, visible=False,
101
+ label="($) Price/1K input prompt tokens",
102
+ interactive=False
103
+ )
104
+ self.output_tokens_cost_per_second = gr.Number(0.06, visible=False,
105
+ label="($) Price/1K output prompt tokens",
106
+ interactive=False
107
+ )
108
+ self.info = gr.Markdown("The cost per input and output tokens values are from OpenAI's pricing web page [here](https://openai.com/pricing)", interactive=False, visible=False)
109
+ self.model.change(on_model_change, inputs=self.model, outputs=self.context_length).then(define_cost_per_token, inputs=[self.model, self.context_length], outputs=[self.input_tokens_cost_per_second, self.output_tokens_cost_per_second])
110
+ self.context_length.change(define_cost_per_token, inputs=[self.model, self.context_length], outputs=[self.input_tokens_cost_per_second, self.output_tokens_cost_per_second])
111
+
112
+ self.labor = gr.Number(0, visible=False,
113
+ label="($) Labor cost per month",
114
+ interactive=True
115
+ )
116
 
117
def compute_cost_per_token(self, input_tokens_cost_per_second, output_tokens_cost_per_second, labor):
    """Scale the two given prices down by 1000 and pass the labor cost through.

    NOTE(review): despite the ``_per_second`` parameter names, the values
    are presumably the "Price/1K tokens" figures, so the results are
    per-token costs -- confirm against the caller.

    Returns:
        ``(cost_per_input_token, cost_per_output_token, labor)``.
    """
    per_input, per_output = (
        price / 1000
        for price in (input_tokens_cost_per_second, output_tokens_cost_per_second)
    )
    return per_input, per_output, labor
122
 
123
  class OpenSourceLlama2Model(BaseTCOModel):
124
 
125
  def __init__(self):
126
+ self.set_name("(Open source) Llama 2 70B")
127
+ self.set_formula(r"""$CR = \frac{CIT\_1K \times IT + COT\_1K \times OT}{1000}$ <br>
128
  with: <br>
129
+ CR = Cost per Request <br>
130
+ CIT_1K = Cost per 1000 Input Tokens <br>
131
+ COT_1K = Cost per 1000 Output Tokens <br>
 
 
132
  IT = Input Tokens <br>
133
  OT = Output Tokens
134
  """)
 
136
  super().__init__()
137
 
138
  def render(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
+ self.vm = gr.Textbox(value="2x A100 80GB NVLINK",
 
 
141
  visible=False,
142
  label="Instance of VM with GPU",
 
143
  )
144
+ self.vm_cost_per_hour = gr.Number(2.21, label="VM instance cost ($) per hour", info="Note that this is the cost for a single VM instance, it is doubled in our case since two GPUs are needed",
145
  interactive=False, visible=False)
146
+ self.input_tokens_cost_per_second = gr.Number(0.00052, visible=False,
147
+ label="($) Price/1K input prompt tokens",
148
  interactive=False
149
  )
150
+ self.output_tokens_cost_per_second = gr.Number(0.06656, visible=False,
151
+ label="($) Price/1K output prompt tokens",
152
  interactive=False
153
  )
154
+ self.info = gr.Markdown("For the Llama2-70B model, we took the cost per input and output tokens values from the benchmark results [here](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper)", interactive=False, visible=False)
155
 
156
+ self.labor = gr.Number(1000, visible=False,
157
+ label="($) Labor cost per month",
158
+ interactive=True
159
+ )
160
+
161
+ # self.used = gr.Slider(minimum=0.01, value=30., step=0.01, label="% used",
162
+ # info="Percentage of time the GPU is used",
163
+ # interactive=True,
164
+ # visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
def compute_cost_per_token(self, input_tokens_cost_per_second, output_tokens_cost_per_second, labor):
    """Divide both given prices by 1000 and return them with the labor cost.

    NOTE(review): the ``_per_second`` parameter names look like leftovers;
    the inputs are presumably prices per 1K tokens -- confirm against the
    caller.

    Returns:
        ``(cost_per_input_token, cost_per_output_token, labor)``; ``labor``
        is returned unchanged.
    """
    tokens_per_price_unit = 1000
    return (
        input_tokens_cost_per_second / tokens_per_price_unit,
        output_tokens_cost_per_second / tokens_per_price_unit,
        labor,
    )
170
 
171
  class CohereModel(BaseTCOModel):
172
 
 
192
  self.model: gr.Dropdown.update(choices=["Default", "Custom"])
193
  else:
194
  self.model: gr.Dropdown.update(choices=["Default", "Custom"])
195
+
196
+ self.labor = gr.Number(0, visible=False,
197
+ label="($) Labor cost per month",
198
+ interactive=True
199
+ )
200
 
201
+ def compute_cost_per_token(self, model, labor):
202
  """Cost per token = """
203
  use_case = self.use_case
204
 
 
214
  cost_per_input_token = cost_per_1M_tokens / 1000000
215
  cost_per_output_token = cost_per_1M_tokens / 1000000
216
 
217
+ return cost_per_input_token, cost_per_output_token, labor
218
 
219
  class ModelPage:
220
 
 
241
  output += model.get_components_for_cost_computing()
242
  return output
243
 
244
+ def make_model_visible(self, name:str, use_case: gr.Dropdown):
245
  # First decide which indexes
246
  output = []
247
  for model in self.models:
 
249
  output+= [gr.update(visible=True)] * len(model.get_components())
250
  # Set use_case and num_users values in the model
251
  model.use_case = use_case
 
 
 
252
  else:
253
  output+= [gr.update(visible=False)] * len(model.get_components())
254
  return output
255
 
256
def compute_cost_per_token(self, *args):
    """Compute the cost per request for the currently selected model.

    ``args`` is a flat sequence: the cost-computing component values of
    every model in ``self.models`` order, followed by the selected model
    name, the input-token count and the output-token count.

    Returns:
        A 5-tuple ``(summary, cost_per_request, formula, latency_text,
        labor_cost)`` for the model whose name matches, or ``None``
        (implicitly) when no model matches.
    """
    selected_name = args[-3]
    n_input_tokens = args[-2]
    n_output_tokens = args[-1]

    # Running index of the first value belonging to the model being examined.
    offset = 0
    for candidate in self.models:
        width = len(candidate.get_components_for_cost_computing())
        if selected_name != candidate.get_name():
            # Not the selected model: skip past its slice of values.
            offset = offset + width
            continue

        per_input, per_output, labor_cost = candidate.compute_cost_per_token(
            *args[offset:offset + width]
        )
        model_tco = per_input * n_input_tokens + per_output * n_output_tokens
        formula = candidate.get_formula()
        latency = candidate.get_latency()
        return (
            f"Model {selected_name} has a cost/request of: ${model_tco}",
            model_tco,
            formula,
            f"The average latency of this model is {latency}",
            labor_cost,
        )