jadehardouin committed on
Commit 29078ea
1 Parent(s): 044dd38

Update models.py

Files changed (1):
1. models.py +126 -97

models.py CHANGED
@@ -1,5 +1,6 @@
 from gradio.components import Component
 import gradio as gr
 from abc import ABC, abstractclassmethod
 import inspect

@@ -12,6 +13,10 @@ class BaseTCOModel(ABC):

    def __init__(self):
        super(BaseTCOModel, self).__setattr__("_components", [])

    def get_components(self) -> list[Component]:
        return self._components
@@ -42,25 +47,36 @@ class BaseTCOModel(ABC):

    def get_formula(self):
        return self.formula

class OpenAIModel(BaseTCOModel):

    def __init__(self):
        self.set_name("(SaaS) OpenAI")
-       self.set_formula(r"""$CT = \frac{CT\_1K \times 1000}{L}$ <br>
        with: <br>
-       CT = Cost per output Token <br>
-       CT_1K = Cost per 1000 Tokens (from OpenAI's pricing web page) <br>
-       L = Input Length
        """)
        super().__init__()

    def render(self):
        def on_model_change(model):

            if model == "GPT-4":
                return gr.Dropdown.update(choices=["8K", "32K"])
            else:
                return gr.Dropdown.update(choices=["4K", "16K"], value="4K")

        self.model = gr.Dropdown(["GPT-4", "GPT-3.5 Turbo"], value="GPT-4",
@@ -70,196 +86,201 @@ class OpenAIModel(BaseTCOModel):
                        label="Context size",
                        visible=False, info="Number of tokens the model considers when processing text")
        self.model.change(on_model_change, inputs=self.model, outputs=self.context_length)
-       self.input_length = gr.Number(350, label="Average number of input tokens",
-                                     interactive=True, visible=False)

-   def compute_cost_per_token(self, model, context_length, input_length):
        """Cost per token = """
-       model = model[0]
-       context_length = context_length[0]

        if model == "GPT-4" and context_length == "8K":
            cost_per_1k_input_tokens = 0.03
        elif model == "GPT-4" and context_length == "32K":
            cost_per_1k_input_tokens = 0.06
        elif model == "GPT-3.5" and context_length == "4K":
            cost_per_1k_input_tokens = 0.0015
        else:
            cost_per_1k_input_tokens = 0.003

-       cost_per_output_token = cost_per_1k_input_tokens * input_length / 1000
-
-       return cost_per_output_token
 
class OpenSourceLlama2Model(BaseTCOModel):

    def __init__(self):
        self.set_name("(Open source) Llama 2")
-       self.set_formula(r"""$CT = \frac{VM\_CH}{TS \times 3600 \times MO \times U}$<br>
        with: <br>
        CT = Cost per Token <br>
        VM_CH = VM Cost per Hour <br>
-       TS = Tokens per Second <br>
-       MO = Maxed Out <br>
-       U = Used
        """)
        super().__init__()

    def render(self):
        vm_choices = ["1x Nvidia A100 (Azure NC24ads A100 v4)",
-                     "2x Nvidia A100 (Azure NC48ads A100 v4)",
-                     "4x Nvidia A100 (Azure NC48ads A100 v4)"]

        def on_model_change(model):
            if model == "Llama 2 7B":
                return [gr.Dropdown.update(choices=vm_choices),
-                       gr.Markdown.update(value="To see the script used to benchmark the Llama2-7B model, [click here](https://example.com/script)"),
                        gr.Number.update(value=3.6730),
                        gr.Number.update(value=694.38),
-                       gr.Number.update(visible=True)
                ]
            else:
-               not_supported_vm = ["1x Nvidia A100 (Azure NC24ads A100 v4)", "2x Nvidia A100 (Azure NC48ads A100 v4)"]
                choices = [x for x in vm_choices if x not in not_supported_vm]
-               return [gr.Dropdown.update(choices=choices, value="4x Nvidia A100 (Azure NC48ads A100 v4)"),
                        gr.Markdown.update(value="To see the benchmark results used for the Llama2-70B model, [click here](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper)"),
-                       gr.Number.update(value=14.692),
-                       gr.Number.update(value=18.6),
-                       gr.Number.update(visible=False)
                ]

        def on_vm_change(model, vm):
            # TO DO: load info from CSV
            if model == "Llama 2 7B" and vm == "1x Nvidia A100 (Azure NC24ads A100 v4)":
-               return [gr.Number.update(value=3.6730), gr.Number.update(value=694.38)]
-           elif model == "Llama 2 7B" and vm == "2x Nvidia A100 (Azure NC48ads A100 v4)":
-               return [gr.Number.update(value=7.346), gr.Number.update(value=1388.76)]
-           elif model == "Llama 2 7B" and vm == "4x Nvidia A100 (Azure NC48ads A100 v4)":
-               return [gr.Number.update(value=14.692), gr.Number.update(value=2777.52)]
-           elif model == "Llama 2 70B" and vm == "4x Nvidia A100 (Azure NC48ads A100 v4)":
-               return [gr.Number.update(value=14.692), gr.Number.update(value=18.6)]

-       self.model = gr.Dropdown(["Llama 2 7B", "Llama 2 70B"], value="Llama 2 7B", label="OpenSource models", visible=False)
-       self.vm = gr.Dropdown(vm_choices,
-                             value="1x Nvidia A100 (Azure NC24ads A100 v4)",
                              visible=False,
                              label="Instance of VM with GPU",
                              info="Your options for this choice depend on the model you previously chose"
                              )
-       self.vm_cost_per_hour = gr.Number(3.6730, label="VM instance cost per hour",
                                          interactive=False, visible=False)
-       self.tokens_per_second = gr.Number(694.38, visible=False,
-                                          label="Number of tokens per second for this specific model and VM instance",
                                           interactive=False
                                           )
-       self.input_length = gr.Number(233, label="Average number of input tokens", info="This is the number of input tokens used when the model was benchmarked to get the number of tokens/second it processes",
-                                     interactive=False, visible=False)
-       self.info = gr.Markdown("To see the script used to benchmark the Llama2-7B model, [click here](https://example.com/script)", interactive=False, visible=False)

-       self.model.change(on_model_change, inputs=self.model, outputs=[self.vm, self.info, self.vm_cost_per_hour, self.tokens_per_second, self.input_length])
-       self.vm.change(on_vm_change, inputs=[self.model, self.vm], outputs=[self.vm_cost_per_hour, self.tokens_per_second])
-       self.maxed_out = gr.Slider(minimum=0.01, value=50., step=0.01, label="% maxed out",
-                                  info="How much the GPU is fully used",
-                                  interactive=True,
-                                  visible=False)
-       self.used = gr.Slider(minimum=0.01, value=50., step=0.01, label="% used",
                              info="Percentage of time the GPU is used",
                              interactive=True,
                              visible=False)

-   def compute_cost_per_token(self, vm_cost_per_hour, tokens_per_second, maxed_out, used):
-       cost_per_token = vm_cost_per_hour * 10000 / (tokens_per_second * 3600 * maxed_out * used)
-       return cost_per_token
 
class OpenSourceDIY(BaseTCOModel):

    def __init__(self):
        self.set_name("(Open source) DIY")
-       self.set_formula(r"""$CT = \frac{VM\_CH}{TS \times 3600 \times MO \times U}$<br>
        with: <br>
        CT = Cost per Token <br>
        VM_CH = VM Cost per Hour <br>
-       TS = Tokens per Second <br>
-       MO = Maxed Out <br>
-       U = Used
        """)
        super().__init__()

    def render(self):
        self.info = gr.Markdown("Compute the cost/token based on our formula below, using your own parameters", visible=False)
-       self.display_formula = gr.Markdown(r"""$CT = \frac{VM\_CH}{TS \times 3600 \times MO \times U}$<br>
        with: <br>
        CT = Cost per Token <br>
        VM_CH = VM Cost per Hour <br>
-       TS = Tokens per Second <br>
-       MO = Maxed Out <br>
-       U = Used
        """, visible=False)
        self.vm_cost_per_hour = gr.Number(3.5, label="VM instance cost per hour",
                                          interactive=True, visible=False)
-       self.tokens_per_second = gr.Number(700, visible=False,
-                                          label="Number of tokens per second for this specific model and VM instance",
                                           interactive=True
                                           )
-       self.maxed_out = gr.Slider(minimum=0.01, value=50., step=0.01, label="% maxed out",
-                                  info="How much the GPU is fully used",
-                                  interactive=True,
-                                  visible=False)
        self.used = gr.Slider(minimum=0.01, value=50., step=0.01, label="% used",
                              info="Percentage of time the GPU is used",
                              interactive=True,
                              visible=False)

-   def compute_cost_per_token(self, vm_cost_per_hour, tokens_per_second, maxed_out, used):
-       cost_per_token = vm_cost_per_hour * 10000 / (tokens_per_second * 3600 * maxed_out * used)
-       return cost_per_token
 
class CohereModel(BaseTCOModel):

    def __init__(self):
        self.set_name("(SaaS) Cohere")
-       self.set_formula(r"""$CT = \frac{CT\_1K \times 1000}{L}$ <br>
        with: <br>
-       CT = Cost per output Token <br>
        CT_1M = Cost per one million Tokens (from Cohere's pricing web page) <br>
-       L = Input Length
        """)
        super().__init__()

    def render(self):
-       def on_use_case_change(use_case):
-           if use_case == "Summarize":
-               return gr.Dropdown.update(choices=["Default"])
-           else:
-               return gr.Dropdown.update(choices=["Default", "Custom"])
-
-       self.use_case = gr.Dropdown(["Generate", "Summarize"], value="Generate",
-                                   label="API",
-                                   interactive=True, visible=False)
        self.model = gr.Dropdown(["Default", "Custom"], value="Default",
                                 label="Model",
                                 interactive=True, visible=False)
-       self.use_case.change(on_use_case_change, inputs=self.use_case, outputs=self.model)
-       self.input_length = gr.Number(350, label="Average number of input tokens",
-                                     interactive=True, visible=False)

-   def compute_cost_per_token(self, use_case, model, input_length):
        """Cost per token = """
-       use_case = use_case[0]
-       model = model[0]

        if use_case == "Generate":
            if model == "Default":
-               cost_per_1M_input_tokens = 15
            else:
-               cost_per_1M_input_tokens = 30
-       else:
-           cost_per_1M_input_tokens = 15

-       cost_per_output_token = cost_per_1M_input_tokens * input_length / 1000000

-       return cost_per_output_token
-
class ModelPage:

    def __init__(self, Models: BaseTCOModel):
@@ -285,12 +306,17 @@ class ModelPage:
        output += model.get_components_for_cost_computing()
        return output

-   def make_model_visible(self, name: str):
        # First decide which indexes
        output = []
        for model in self.models:
            if model.get_name() == name:
-               output += [gr.update(visible=True)] * len(model.get_components())
            else:
                output += [gr.update(visible=False)] * len(model.get_components())
        return output
@@ -303,8 +329,11 @@ class ModelPage:
            if current_model == model.get_name():

                model_args = args[begin:begin+model_n_args]
-               model_tco = model.compute_cost_per_token(*model_args)
                formula = model.get_formula()
-               return f"Model {current_model} has a TCO of: ${model_tco}", model_tco, formula

            begin = begin+model_n_args
 
from gradio.components import Component
import gradio as gr
+import pandas as pd
from abc import ABC, abstractclassmethod
import inspect

    def __init__(self):
        super(BaseTCOModel, self).__setattr__("_components", [])
+       self.use_case = None
+       self.num_users = None
+       self.input_tokens = None
+       self.output_tokens = None

    def get_components(self) -> list[Component]:
        return self._components

    def get_formula(self):
        return self.formula
+
+   def set_latency(self, latency):
+       self.latency = latency
+
+   def get_latency(self):
+       return self.latency
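# A minimal sketch of how a concrete model is meant to plug into the
# accessors above; the class name and every value here are illustrative
# assumptions, not part of models.py.
class ExampleModel(BaseTCOModel):
    def __init__(self):
        self.set_name("(Example) Demo")
        self.set_formula(r"$CR = ...$")
        self.set_latency("10s")  # read back later through get_latency()
        super().__init__()

    def render(self):
        pass

    def compute_cost_per_token(self):
        # the reworked models return (cost per input token, cost per output token)
        return 0.00003, 0.00006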
 
class OpenAIModel(BaseTCOModel):

    def __init__(self):
        self.set_name("(SaaS) OpenAI")
+       self.set_formula(r"""$CR = \frac{CIT\_1K \times IT + COT\_1K \times OT}{1000}$ <br>
        with: <br>
+       CR = Cost per Request <br>
+       CIT_1K = Cost per 1000 Input Tokens (from OpenAI's pricing web page) <br>
+       COT_1K = Cost per 1000 Output Tokens <br>
+       IT = Input Tokens <br>
+       OT = Output Tokens
        """)
+       self.latency = "15s"  # Default value for GPT-4
        super().__init__()

    def render(self):
        def on_model_change(model):

            if model == "GPT-4":
+               self.latency = "15s"
                return gr.Dropdown.update(choices=["8K", "32K"])
            else:
+               self.latency = "5s"
                return gr.Dropdown.update(choices=["4K", "16K"], value="4K")

        self.model = gr.Dropdown(["GPT-4", "GPT-3.5 Turbo"], value="GPT-4",

                        label="Context size",
                        visible=False, info="Number of tokens the model considers when processing text")
        self.model.change(on_model_change, inputs=self.model, outputs=self.context_length)

+   def compute_cost_per_token(self, model, context_length):
        """Cost per token = """

        if model == "GPT-4" and context_length == "8K":
            cost_per_1k_input_tokens = 0.03
+           cost_per_1k_output_tokens = 0.06
        elif model == "GPT-4" and context_length == "32K":
            cost_per_1k_input_tokens = 0.06
+           cost_per_1k_output_tokens = 0.12
        elif model == "GPT-3.5" and context_length == "4K":
            cost_per_1k_input_tokens = 0.0015
+           cost_per_1k_output_tokens = 0.002
        else:
            cost_per_1k_input_tokens = 0.003
+           cost_per_1k_output_tokens = 0.004
+       cost_per_input_token = cost_per_1k_input_tokens / 1000
+       cost_per_output_token = cost_per_1k_output_tokens / 1000

+       return cost_per_input_token, cost_per_output_token
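# A quick numeric check of the cost-per-request formula above, using the
# GPT-4 8K prices hardcoded in compute_cost_per_token; the request size is
# an illustrative assumption.
cost_in, cost_out = 0.03 / 1000, 0.06 / 1000  # per-token costs returned above
it, ot = 350, 200                             # assumed input/output tokens per request
cr = cost_in * it + cost_out * ot             # (0.03*350 + 0.06*200)/1000 = $0.0225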
 
class OpenSourceLlama2Model(BaseTCOModel):

    def __init__(self):
        self.set_name("(Open source) Llama 2")
+       self.set_formula(r"""$CT = \frac{VM\_CH \times 100}{3600 \times U} \times (\frac{IT}{ITS} + \frac{OT}{OTS})$<br>
        with: <br>
        CT = Cost per Token <br>
        VM_CH = VM Cost per Hour <br>
+       ITS = Input Tokens per Second <br>
+       OTS = Output Tokens per Second <br>
+       U = Used <br>
+       IT = Input Tokens <br>
+       OT = Output Tokens
        """)
+       self.set_latency("27s")
        super().__init__()

    def render(self):
        vm_choices = ["1x Nvidia A100 (Azure NC24ads A100 v4)",
+                     "2x Nvidia A100 (Azure NC24ads A100 v4)",
+                     "2x Nvidia A100 (Azure ND96amsr A100 v4)"]

        def on_model_change(model):
            if model == "Llama 2 7B":
                return [gr.Dropdown.update(choices=vm_choices),
+                       gr.Markdown.update(value="To see the benchmark results used for the Llama2-7B model, [click here](https://example.com/script)"),
                        gr.Number.update(value=3.6730),
                        gr.Number.update(value=694.38),
+                       gr.Number.update(value=694.38),
                ]
            else:
+               not_supported_vm = ["1x Nvidia A100 (Azure NC24ads A100 v4)", "2x Nvidia A100 (Azure NC24ads A100 v4)"]
                choices = [x for x in vm_choices if x not in not_supported_vm]
+               return [gr.Dropdown.update(choices=choices, value="2x Nvidia A100 (Azure ND96amsr A100 v4)"),
                        gr.Markdown.update(value="To see the benchmark results used for the Llama2-70B model, [click here](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper)"),
+                       gr.Number.update(value=2*37.186),
+                       gr.Number.update(value=2860),
+                       gr.Number.update(value=18.545),
                ]

        def on_vm_change(model, vm):
            # TO DO: load info from CSV
            if model == "Llama 2 7B" and vm == "1x Nvidia A100 (Azure NC24ads A100 v4)":
+               return [gr.Number.update(value=4.777), gr.Number.update(value=694.38), gr.Number.update(value=694.38)]
+           elif model == "Llama 2 7B" and vm == "2x Nvidia A100 (Azure NC24ads A100 v4)":
+               return [gr.Number.update(value=2*4.777), gr.Number.update(value=1388.76), gr.Number.update(value=1388.76)]
+           elif model == "Llama 2 7B" and vm == "2x Nvidia A100 (Azure ND96amsr A100 v4)":
+               return [gr.Number.update(value=2*37.186), gr.Number.update(value=2777.52), gr.Number.update(value=2777.52)]
+           elif model == "Llama 2 70B" and vm == "2x Nvidia A100 (Azure ND96amsr A100 v4)":
+               return [gr.Number.update(value=2*37.186), gr.Number.update(value=2860), gr.Number.update(value=18.449)]

+       self.model = gr.Dropdown(["Llama 2 7B", "Llama 2 70B"], value="Llama 2 70B", label="OpenSource models", visible=False)
+       self.vm = gr.Dropdown(choices=["2x Nvidia A100 (Azure ND96amsr A100 v4)"],
+                             value="2x Nvidia A100 (Azure ND96amsr A100 v4)",
                              visible=False,
                              label="Instance of VM with GPU",
                              info="Your options for this choice depend on the model you previously chose"
                              )
+       self.vm_cost_per_hour = gr.Number(2*37.186, label="VM instance cost per hour",
                                          interactive=False, visible=False)
+       self.input_tokens_per_second = gr.Number(2860, visible=False,
+                                                label="Number of input tokens per second for this specific model and VM instance",
                                                 interactive=False
                                                 )
+       self.output_tokens_per_second = gr.Number(18.449, visible=False,
+                                                 label="Number of output tokens per second for this specific model and VM instance",
+                                                 interactive=False
+                                                 )
+       self.info = gr.Markdown("To see the script used to benchmark the Llama2-70B model, [click here](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper)", interactive=False, visible=False)

+       self.model.change(on_model_change, inputs=self.model, outputs=[self.vm, self.info, self.vm_cost_per_hour, self.input_tokens_per_second, self.output_tokens_per_second])
+       self.vm.change(on_vm_change, inputs=[self.model, self.vm], outputs=[self.vm_cost_per_hour, self.input_tokens_per_second, self.output_tokens_per_second])
+       self.used = gr.Slider(minimum=0.01, value=30., step=0.01, label="% used",
                              info="Percentage of time the GPU is used",
                              interactive=True,
                              visible=False)

+   def compute_cost_per_token(self, vm_cost_per_hour, input_tokens_per_second, output_tokens_per_second, used):
+       cost_per_input_token = vm_cost_per_hour * 100 / (3600 * used * input_tokens_per_second)
+       cost_per_output_token = vm_cost_per_hour * 100 / (3600 * used * output_tokens_per_second)
+       return cost_per_input_token, cost_per_output_token
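# A rough sanity check of the GPU cost formula above, evaluated at the
# Llama 2 70B defaults set in render(); the 350/200 request size is an
# illustrative assumption.
vm_ch, its, ots, used = 2 * 37.186, 2860, 18.449, 30.0
cost_in = vm_ch * 100 / (3600 * used * its)   # ~$0.000024 per input token
cost_out = vm_ch * 100 / (3600 * used * ots)  # ~$0.00373 per output token
cr = cost_in * 350 + cost_out * 200           # ~$0.755 per request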
 
class OpenSourceDIY(BaseTCOModel):

    def __init__(self):
        self.set_name("(Open source) DIY")
+       self.set_formula(r"""$CT = \frac{VM\_CH \times 100}{3600 \times U} \times (\frac{IT}{ITS} + \frac{OT}{OTS})$<br>
        with: <br>
        CT = Cost per Token <br>
        VM_CH = VM Cost per Hour <br>
+       ITS = Input Tokens per Second <br>
+       OTS = Output Tokens per Second <br>
+       U = Used <br>
+       IT = Input Tokens <br>
+       OT = Output Tokens
        """)
+       self.set_latency("The latency can't be estimated in the DIY scenario since the model isn't defined")
        super().__init__()

    def render(self):
        self.info = gr.Markdown("Compute the cost/token based on our formula below, using your own parameters", visible=False)
+       self.display_formula = gr.Markdown(r"""$CT = \frac{VM\_CH \times 100}{3600 \times U} \times (\frac{IT}{ITS} + \frac{OT}{OTS})$<br>
        with: <br>
        CT = Cost per Token <br>
        VM_CH = VM Cost per Hour <br>
+       ITS = Input Tokens per Second <br>
+       OTS = Output Tokens per Second <br>
+       U = Used <br>
+       IT = Input Tokens <br>
+       OT = Output Tokens
        """, visible=False)
        self.vm_cost_per_hour = gr.Number(3.5, label="VM instance cost per hour",
                                          interactive=True, visible=False)
+       self.input_tokens_per_second = gr.Number(300, visible=False,
+                                                label="Number of input tokens per second processed for this specific model and VM instance",
+                                                interactive=True
+                                                )
+       self.output_tokens_per_second = gr.Number(300, visible=False,
+                                                 label="Number of output tokens per second processed for this specific model and VM instance",
                                                  interactive=True
                                                  )
        self.used = gr.Slider(minimum=0.01, value=50., step=0.01, label="% used",
                              info="Percentage of time the GPU is used",
                              interactive=True,
                              visible=False)

+   def compute_cost_per_token(self, vm_cost_per_hour, input_tokens_per_second, output_tokens_per_second, used):
+       cost_per_input_token = vm_cost_per_hour * 100 / (3600 * used * input_tokens_per_second)
+       cost_per_output_token = vm_cost_per_hour * 100 / (3600 * used * output_tokens_per_second)
+       return cost_per_input_token, cost_per_output_token
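# The same formula at the DIY defaults above ($3.5/h VM, 300 tokens/s in and
# out, GPU used 50% of the time); the 350/200 request size is assumed.
cost_per_tok = 3.5 * 100 / (3600 * 50 * 300)  # ~$0.0000065 per token
cr = cost_per_tok * (350 + 200)               # ~$0.0036 per request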
 
class CohereModel(BaseTCOModel):

    def __init__(self):
        self.set_name("(SaaS) Cohere")
+       self.set_formula(r"""$CR = \frac{CT\_1M \times (IT + OT)}{1000000}$ <br>
        with: <br>
+       CR = Cost per Request <br>
        CT_1M = Cost per one million Tokens (from Cohere's pricing web page) <br>
+       IT = Input Tokens <br>
+       OT = Output Tokens
        """)
+       self.set_latency("")
        super().__init__()

    def render(self):
+       # Restrict the model choices to the ones supported by the selected use case
+       if self.use_case == "Summarize":
+           model_choices = ["Default"]
+       else:  # "Generate" and "Question-answering"
+           model_choices = ["Default", "Custom"]
        self.model = gr.Dropdown(model_choices, value="Default",
                                 label="Model",
                                 interactive=True, visible=False)

+   def compute_cost_per_token(self, model):
        """Cost per token = """
+       use_case = self.use_case

        if use_case == "Generate":
            if model == "Default":
+               cost_per_1M_tokens = 15
            else:
+               cost_per_1M_tokens = 30
+       elif use_case == "Summarize":
+           cost_per_1M_tokens = 15
+       else:
+           cost_per_1M_tokens = 200

+       cost_per_input_token = cost_per_1M_tokens / 1000000
+       cost_per_output_token = cost_per_1M_tokens / 1000000

+       return cost_per_input_token, cost_per_output_token
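# A quick check of the Cohere pricing branches above: the Generate API with
# the Default model costs $15 per 1M tokens, so a request with 1000 input and
# 200 output tokens (assumed sizes) comes to:
cost_per_token = 15 / 1_000_000
cr = cost_per_token * (1000 + 200)  # = $0.018 per request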
 
 
 
class ModelPage:

    def __init__(self, Models: BaseTCOModel):

        output += model.get_components_for_cost_computing()
        return output

+   def make_model_visible(self, name: str, use_case: gr.Dropdown, num_users: gr.Number, input_tokens: gr.Slider, output_tokens: gr.Slider):
        # First decide which indexes
        output = []
        for model in self.models:
            if model.get_name() == name:
+               output += [gr.update(visible=True)] * len(model.get_components())
+               # Set use_case and num_users values in the model
+               model.use_case = use_case
+               model.num_users = num_users
+               model.input_tokens = input_tokens
+               model.output_tokens = output_tokens
            else:
                output += [gr.update(visible=False)] * len(model.get_components())
        return output

            if current_model == model.get_name():

                model_args = args[begin:begin+model_n_args]
+               cost_per_input_token, cost_per_output_token = model.compute_cost_per_token(*model_args)
+               model_tco = cost_per_input_token * model.input_tokens + cost_per_output_token * model.output_tokens
                formula = model.get_formula()
+               latency = model.get_latency()
+
+               return f"Model {current_model} has a cost/request of: ${model_tco}", model_tco, formula, f"The average latency of this model is {latency}"

            begin = begin+model_n_args
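# A sketch (with assumed values) of how the dispatch above slices the flat
# args tuple: each model consumes as many entries as it exposes for cost
# computing, and `begin` advances past them until the selected model matches.
args = ("GPT-4", "8K", 2 * 37.186, 2860, 18.449, 30.0)  # all models' inputs, flattened
begin, model_n_args = 0, 2                              # OpenAI's compute_cost_per_token takes 2 args
model_args = args[begin:begin + model_n_args]           # -> ("GPT-4", "8K")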