ayushi-lamini committed
Commit 5c9a1c4 • 1 Parent(s): 93bc043
More changes (#4)
Browse files
- app.py +86 -86
- src/backend/eval/evaluators/openai_eval/main.py +25 -16
- src/backend/eval/evaluators/openai_eval/openai_earnings_evaluator.py +3 -1
- src/backend/eval/evaluators/openai_eval/openai_ecommerce_evaluator.py +11 -4
- src/backend/eval/evaluators/openai_eval/openai_icd_evaluator.py +10 -4
- src/backend/eval/models/gpt4_model.py +13 -4
- src/backend/run_eval_suite.py +2 -0
app.py
CHANGED
@@ -262,92 +262,92 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-        … (85 further removed lines not rendered in this view)
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        #
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+        #
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+        #
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16" if DEVICE != "cpu" else "float32",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+        #
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
src/backend/eval/evaluators/openai_eval/main.py
CHANGED
@@ -1,11 +1,14 @@
 from tqdm import tqdm
-from lamini import MistralRunner
+from lamini import MistralRunner, Lamini
 import lamini

 from src.envs import LAMINI_API_KEY

 lamini.api_key = LAMINI_API_KEY

+lamini.max_workers = 40
+lamini.batch_size = 50
+

 async def evaluate_openai_model(model, dataset, max_examples):
     for example in tqdm(dataset, desc="Evaluating", total=max_examples):
@@ -17,26 +20,26 @@ def evaluate(model, example):
     prompt = example.get_prompt()

     response = model(prompt=prompt, output_type=example.get_output_type())
+    response["score"] = score(
+        example.format_response(response),
+        example.get_response(response),
+        example.get_question(),
+        example.get_rubric()
+    )

     result = {
         "prompt": prompt,
         "response": response,
         "reference_response": example.get_response_json(),
-        "is_exact_match": example.is_exact_match(response)
-        "score": score(
-            example.format_response(response),
-            example.get_response(response),
-            example.get_question(),
-            example.get_rubric()
-        ),
+        "is_exact_match": example.is_exact_match(response)
     }
     return result


 def score(response, expected_response, question, rubric):
-
+    api = Lamini(model_name="mistralai/Mistral-7B-Instruct-v0.2")

-    prompt = "A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. …
+    prompt = "<s>[INST] A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n\n"
     prompt += f"Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n"
     prompt += rubric
     prompt += "Use the full range. Read the gold answer carefully. "
@@ -49,12 +52,18 @@ def score(response, expected_response, question, rubric):
     prompt += f"========== model answer =========\n{response}\n\n"
     prompt += "=" * 40 + "\n\n"
     prompt += f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n"
+    prompt += " [/INST]"

-
-
-
-
+    try:
+        lamini_score = api.generate(
+            prompt=prompt,
+            output_type={"explanation": "str", "score": "int"},
+            max_new_tokens=150
+        )
+    except Exception as e:
+        print("hit lamini scoring exception:", e)
+        lamini_score = {"explanation": "", "score": 0}

-    print(prompt, "\nModel score:", …
+    print(prompt, "\nModel score:", lamini_score)

-    return …
+    return lamini_score["score"]
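For readers following the scoring change: the new score() asks a Mistral-7B-Instruct judge, via Lamini's structured-output API, to grade the model answer against the gold answer on a 1-5 scale. Below is a minimal, self-contained sketch of that pattern; the API key, the rubric text, and the middle of the prompt (the hunk above shows only its beginning and end) are illustrative placeholders, not values from this repository.

import lamini
from lamini import Lamini

lamini.api_key = "YOUR_LAMINI_API_KEY"  # placeholder; the app loads this from src.envs

def score_with_judge(model_answer, gold_answer, question, rubric):
    # Ask a Mistral judge for a structured {"explanation", "score"} verdict,
    # mirroring the score() added in this commit.
    api = Lamini(model_name="mistralai/Mistral-7B-Instruct-v0.2")
    prompt = (
        "<s>[INST] A model is going to answer a question. Your job is to score the answer, "
        "comparing it to a golden reference. You are an expert scorer.\n\n"
        "Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n"
        f"{rubric}\n"
        # the question/gold-answer sections below are illustrative; their exact wording is not shown in the diff
        f"========== question =========\n{question}\n\n"
        f"========== gold answer =========\n{gold_answer}\n\n"
        f"========== model answer =========\n{model_answer}\n\n"
        "How would you score the model's answer compared to the gold answer "
        "(using the 1-5 scale defined above)?\n\n [/INST]"
    )
    try:
        verdict = api.generate(
            prompt=prompt,
            output_type={"explanation": "str", "score": "int"},
            max_new_tokens=150,
        )
    except Exception as e:
        # Fall back to a zero score on API errors, as the evaluator does.
        print("hit lamini scoring exception:", e)
        verdict = {"explanation": "", "score": 0}
    return verdict["score"]

# Illustrative call with made-up strings:
# score_with_judge("Paris", "Paris, France", "What is the capital of France?", "Score factual accuracy only.")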
src/backend/eval/evaluators/openai_eval/openai_earnings_evaluator.py
CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import EARNINGS_DATASET_PATH
+from src.envs import EARNINGS_DATASET_PATH, MAX_EXAMPLES


 def load_earnings_dataset():
@@ -18,6 +18,8 @@ class EarningsCallsDataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield EarningsCallsExample(index, obj)

     def get_length(self):
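The only functional change here is a cap on how many rows each dataset iterator yields. A minimal sketch of the pattern with a placeholder MAX_EXAMPLES value (the real constant comes from src.envs and its value is not shown in this diff):

from itertools import islice

MAX_EXAMPLES = 100  # placeholder value for illustration

def capped_iter(records):
    # Same early-exit pattern as the __iter__ methods in this commit.
    for index, obj in enumerate(records):
        if index == MAX_EXAMPLES:
            break
        yield obj

# Equivalent formulation with itertools.islice:
assert list(capped_iter(range(1000))) == list(islice(range(1000), MAX_EXAMPLES))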
src/backend/eval/evaluators/openai_eval/openai_ecommerce_evaluator.py
CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import SHOPPING_DATASET_PATH
+from src.envs import SHOPPING_DATASET_PATH, MAX_EXAMPLES


 def load_shopping_dataset():
@@ -18,6 +18,8 @@ class ShoppingDataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield ShoppingExample(index, obj)

     def get_length(self):
@@ -73,12 +75,17 @@ class ShoppingExample:
         }

     def format_response(self, response):
-        formatted_response = …
+        formatted_response = ""

-        if …
+        if 'product_id' in response:
+            formatted_response += f"Product ID: {response['product_id']}\n"
+        else:
+            formatted_response += "Product ID: N/A\n"
+
+        if ("product_name" in response) and response['product_name'] != "N/A":
             formatted_response += f"Product Name: {response['product_name']}\n"

-        if response['product_description'] != "N/A":
+        if ("product_description" in response) and response['product_description'] != "N/A":
             formatted_response += f"Description: {response['product_description']}"

         return formatted_response
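format_response now tolerates responses that omit product_id, product_name, or product_description. A minimal sketch of the same defensive idea using dict.get; the function name and the sample response are illustrative, not taken from the dataset:

def format_shopping_response(response: dict) -> str:
    # Never raise KeyError on a missing field; fall back to "N/A" like the commit does.
    formatted = f"Product ID: {response.get('product_id', 'N/A')}\n"
    name = response.get("product_name", "N/A")
    if name != "N/A":
        formatted += f"Product Name: {name}\n"
    description = response.get("product_description", "N/A")
    if description != "N/A":
        formatted += f"Description: {description}"
    return formatted

# Illustrative call with a made-up, partially filled response:
print(format_shopping_response({"product_name": "Desk Lamp"}))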
src/backend/eval/evaluators/openai_eval/openai_icd_evaluator.py
CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import ICD_DATASET_PATH
+from src.envs import ICD_DATASET_PATH, MAX_EXAMPLES


 def load_icd_dataset():
@@ -18,6 +18,8 @@ class ICD11Dataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield ICD11Example(index, obj)

     def get_length(self):
@@ -43,7 +45,7 @@ class ICD11Example:
         return prompt

     def is_exact_match(self, response):
-        return self.example["entity"]["code"] == response[" …
+        return self.example["entity"]["code"] == response["icd11_code"]

     def get_question(self):
         return self.example["question"]
@@ -69,9 +71,13 @@ class ICD11Example:
         }

     def format_response(self, response):
-        formatted_response = …
+        formatted_response = ""
+        if 'icd11_code' in response:
+            formatted_response += f"ICD11 Code: {response['icd11_code']}\n"
+        else:
+            formatted_response += "ICD11 Code: N/A\n"

-        if response['answer'] != "N/A":
+        if ('answer' in response) and response['answer'] != "N/A":
             formatted_response += f"Answer: {response['answer']}\n"

         return formatted_response
src/backend/eval/models/gpt4_model.py
CHANGED
@@ -36,7 +36,7 @@ class GPT4Model(LM):

         json_object = json.loads(response.choices[0].message.content)

-        validate_output(json_object, output_type)
+        # validate_output(json_object, output_type)

         return json_object

@@ -46,7 +46,11 @@ class GPT4Model(LM):
         try:
             question = request.doc["question"]
             result = self.__call__(prompt=question, output_type={"answer": "str"})
-            response = result['answer']
+            response = result['answer'].strip()
+            print("\n\ntask_name: ", request.task_name)
+            print("helm prompt: ", question)
+            print("helm response: ", response)
+            print("\n\n")
         except Exception as e:
             print("Error fetching gpt4 response: ", e)
             # select random answer
@@ -56,8 +60,13 @@ class GPT4Model(LM):
         try:
             question = request.arguments[0]
             op_type = {"explanation": "str", "answer": "str"}
-
-
+            prompt = f"The 'answer' should only include the option name. {question}"
+            result = self.__call__(prompt=prompt, output_type=op_type)
+            response = f"({result['answer'].strip()})"
+            print("\n\ntask_name: ", request.task_name)
+            print("helm prompt: ", question)
+            print("helm response: ", response)
+            print("\n\n")
         except Exception as e:
             print("Error fetching gpt4 response: ", e)
             # select random answer
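In the multiple-choice path, the prompt now tells GPT-4 to put only the option name in the structured 'answer' field, and the result is wrapped in parentheses before being handed back to the harness. A minimal sketch of that post-processing with a stub in place of GPT4Model.__call__ (the stub and the function name are illustrative):

def format_choice(question: str, call_model) -> str:
    # call_model stands in for GPT4Model.__call__(prompt=..., output_type=...).
    op_type = {"explanation": "str", "answer": "str"}
    prompt = f"The 'answer' should only include the option name. {question}"
    result = call_model(prompt=prompt, output_type=op_type)
    # Strip whitespace and wrap in parentheses, matching the "(X)" form the commit produces.
    return f"({result['answer'].strip()})"

# Illustrative usage with a stubbed model call:
stub = lambda prompt, output_type: {"explanation": "B is correct.", "answer": " B "}
print(format_choice("Which option is correct? (A) foo (B) bar", stub))  # prints "(B)"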
src/backend/run_eval_suite.py
CHANGED
@@ -23,12 +23,14 @@ async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, bat
                                           batch_size, device, no_cache, limit, write_out=True,
                                           output_base_path='logs')
     lamini_results = await custom_evaluator.evaluate()
+    print("lamini_results:", lamini_results)

     print(f"Selected Harness Tasks: {task_names}")
     harness_evaluator = HarnessEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
                                          batch_size, device, no_cache, limit, write_out=True,
                                          output_base_path='logs')
     harness_results = harness_evaluator.evaluate(task_names)
+    print("harness_results:", harness_results)


     results_trimmed = {