ayushi-lamini committed on
Commit 5c9a1c4 • 1 Parent(s): 93bc043

More changes (#4)

app.py CHANGED
@@ -262,92 +262,92 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16" if DEVICE != "cpu" else "float32",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        #
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+        #
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+        #
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16" if DEVICE != "cpu" else "float32",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+        #
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
src/backend/eval/evaluators/openai_eval/main.py CHANGED
@@ -1,11 +1,14 @@
 from tqdm import tqdm
-from lamini import MistralRunner
+from lamini import MistralRunner, Lamini
 import lamini
 
 from src.envs import LAMINI_API_KEY
 
 lamini.api_key = LAMINI_API_KEY
 
+lamini.max_workers = 40
+lamini.batch_size = 50
+
 
 async def evaluate_openai_model(model, dataset, max_examples):
     for example in tqdm(dataset, desc="Evaluating", total=max_examples):
@@ -17,26 +20,26 @@ def evaluate(model, example):
     prompt = example.get_prompt()
 
     response = model(prompt=prompt, output_type=example.get_output_type())
+    response["score"] = score(
+        example.format_response(response),
+        example.get_response(response),
+        example.get_question(),
+        example.get_rubric()
+    )
 
     result = {
         "prompt": prompt,
         "response": response,
         "reference_response": example.get_response_json(),
-        "is_exact_match": example.is_exact_match(response),
-        "score": score(
-            example.format_response(response),
-            example.get_response(response),
-            example.get_question(),
-            example.get_rubric()
-        ),
+        "is_exact_match": example.is_exact_match(response)
     }
     return result
 
 
 def score(response, expected_response, question, rubric):
-    runner = MistralRunner(system_prompt=" ")
+    api = Lamini(model_name="mistralai/Mistral-7B-Instruct-v0.2")
 
-    prompt = "A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n\n"
+    prompt = "<s>[INST] A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n\n"
     prompt += f"Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n"
     prompt += rubric
     prompt += "Use the full range. Read the gold answer carefully. "
@@ -49,12 +52,18 @@ def score(response, expected_response, question, rubric):
     prompt += f"========== model answer =========\n{response}\n\n"
     prompt += "=" * 40 + "\n\n"
    prompt += f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n"
+    prompt += " [/INST]"
 
-    score = runner(
-        prompt=prompt,
-        output_type={"explanation": "str", "score": "int"}
-    )
+    try:
+        lamini_score = api.generate(
+            prompt=prompt,
+            output_type={"explanation": "str", "score": "int"},
+            max_new_tokens=150
+        )
+    except Exception as e:
+        print("hit lamini scoring exception:", e)
+        lamini_score = {"explanation": "", "score": 0}
 
-    print(prompt, "\nModel score:", score)
+    print(prompt, "\nModel score:", lamini_score)
 
-    return score["score"]
+    return lamini_score["score"]
src/backend/eval/evaluators/openai_eval/openai_earnings_evaluator.py CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import EARNINGS_DATASET_PATH
+from src.envs import EARNINGS_DATASET_PATH, MAX_EXAMPLES
 
 
 def load_earnings_dataset():
@@ -18,6 +18,8 @@ class EarningsCallsDataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield EarningsCallsExample(index, obj)
 
     def get_length(self):
src/backend/eval/evaluators/openai_eval/openai_ecommerce_evaluator.py CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import SHOPPING_DATASET_PATH
+from src.envs import SHOPPING_DATASET_PATH, MAX_EXAMPLES
 
 
 def load_shopping_dataset():
@@ -18,6 +18,8 @@ class ShoppingDataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield ShoppingExample(index, obj)
 
     def get_length(self):
@@ -73,12 +75,17 @@ class ShoppingExample:
         }
 
     def format_response(self, response):
-        formatted_response = f"Product ID: {response['product_id']}\n"
+        formatted_response = ""
 
-        if response['product_name'] != "N/A":
+        if 'product_id' in response:
+            formatted_response += f"Product ID: {response['product_id']}\n"
+        else:
+            formatted_response += "Product ID: N/A\n"
+
+        if ("product_name" in response) and response['product_name'] != "N/A":
             formatted_response += f"Product Name: {response['product_name']}\n"
 
-        if response['product_description'] != "N/A":
+        if ("product_description" in response) and response['product_description'] != "N/A":
             formatted_response += f"Description: {response['product_description']}"
 
         return formatted_response
src/backend/eval/evaluators/openai_eval/openai_icd_evaluator.py CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import ICD_DATASET_PATH
+from src.envs import ICD_DATASET_PATH, MAX_EXAMPLES
 
 
 def load_icd_dataset():
@@ -18,6 +18,8 @@ class ICD11Dataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield ICD11Example(index, obj)
 
     def get_length(self):
@@ -43,7 +45,7 @@ class ICD11Example:
         return prompt
 
     def is_exact_match(self, response):
-        return self.example["entity"]["code"] == response["product_id"]
+        return self.example["entity"]["code"] == response["icd11_code"]
 
     def get_question(self):
         return self.example["question"]
@@ -69,9 +71,13 @@ class ICD11Example:
         }
 
     def format_response(self, response):
-        formatted_response = f"ICD11 Code: {response['icd11_code']}\n"
+        formatted_response = ""
+        if 'icd11_code' in response:
+            formatted_response += f"ICD11 Code: {response['icd11_code']}\n"
+        else:
+            formatted_response += "ICD11 Code: N/A\n"
 
-        if response['answer'] != "N/A":
+        if ('answer' in response) and response['answer'] != "N/A":
             formatted_response += f"Answer: {response['answer']}\n"
 
         return formatted_response
src/backend/eval/models/gpt4_model.py CHANGED
@@ -36,7 +36,7 @@ class GPT4Model(LM):
 
         json_object = json.loads(response.choices[0].message.content)
 
-        validate_output(json_object, output_type)
+        # validate_output(json_object, output_type)
 
         return json_object
 
@@ -46,7 +46,11 @@ class GPT4Model(LM):
         try:
             question = request.doc["question"]
             result = self.__call__(prompt=question, output_type={"answer": "str"})
-            response = result['answer']
+            response = result['answer'].strip()
+            print("\n\ntask_name: ", request.task_name)
+            print("helm prompt: ", question)
+            print("helm response: ", response)
+            print("\n\n")
         except Exception as e:
             print("Error fetching gpt4 response: ", e)
             # select random answer
@@ -56,8 +60,13 @@ class GPT4Model(LM):
         try:
             question = request.arguments[0]
             op_type = {"explanation": "str", "answer": "str"}
-            result = self.__call__(prompt=question, output_type=op_type)
-            response = f"({result['answer']})"
+            prompt = f"The 'answer' should only include the option name. {question}"
+            result = self.__call__(prompt=prompt, output_type=op_type)
+            response = f"({result['answer'].strip()})"
+            print("\n\ntask_name: ", request.task_name)
+            print("helm prompt: ", question)
+            print("helm response: ", response)
+            print("\n\n")
         except Exception as e:
             print("Error fetching gpt4 response: ", e)
             # select random answer
src/backend/run_eval_suite.py CHANGED
@@ -23,12 +23,14 @@ async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, bat
                                          batch_size, device, no_cache, limit, write_out=True,
                                          output_base_path='logs')
     lamini_results = await custom_evaluator.evaluate()
+    print("lamini_results:", lamini_results)
 
     print(f"Selected Harness Tasks: {task_names}")
     harness_evaluator = HarnessEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
                                          batch_size, device, no_cache, limit, write_out=True,
                                          output_base_path='logs')
     harness_results = harness_evaluator.evaluate(task_names)
+    print("harness_results:", harness_results)
 
 
     results_trimmed = {