ayushi0430 committed on
Commit
283ff70
1 Parent(s): 42030b2
Dockerfile CHANGED
@@ -12,12 +12,8 @@ COPY src /code/src
  COPY app.py /code/app.py
  COPY main_backend.py /code/main_backend.py
  
- # to generate adhoc result files --> to be deleted once generation pipeline is stable
  COPY adhoc.py /code/adhoc.py
  
- # answer and score generation pipeline
- COPY generation.py /code/generation.py
-
  
  EXPOSE 7860
  
adhoc.py CHANGED
@@ -1,54 +1,31 @@
  import asyncio
- import json
-
- from src.backend.harness_evaluator import HarnessEvaluator
- from src.backend.lamini_evaluator import LaminiEvaluator
  from src.backend.manage_requests import EvalRequest
- from src.envs import LIMIT
+ from src.envs import LIMIT, EVAL_RESULTS_PATH_BACKEND, RESULTS_REPO, DEVICE
+ from src.backend.run_eval_suite import run_evaluation
+ from src.about import HarnessTasks
  
  
  async def run_adhoc_eval(eval_request: EvalRequest):
-     batch_size = 10
-
-     lamini_evaluator = LaminiEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
-                                        batch_size, "cpu", True, LIMIT, write_out=True,
-                                        output_base_path='logs')
-     lamini_results = await lamini_evaluator.evaluate()
-     print(f"lamini_results: {lamini_results}")
-
-     task_names = ["mmlu_flan_n_shot_generative_global_facts", "mmlu_flan_n_shot_generative_formal_logic", "truthfulqa_gen"]
-     # mmlu_flan_n_shot_generative
-     # truthfulqa_gen
-     # babi
-     evaluator = HarnessEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
-                                  batch_size, "", True, 100, write_out=True,
-                                  output_base_path='logs')
-     results = evaluator.evaluate(task_names)
-     print("harness results:", results)
-
-     results_trimmed = {
-         "config": results["config"],
-         "results": {
-             "mmlu_flan_n_shot_generative_global_facts": results["results"]["mmlu_flan_n_shot_generative_global_facts"],
-             "mmlu_flan_n_shot_generative_formal_logic": results["results"]["mmlu_flan_n_shot_generative_formal_logic"],
-             "truthfulqa_gen": results["results"]["truthfulqa_gen"],
-             "response_subjective_score": lamini_results["results"]["response_subjective_score"],
-             "product_id_precision_score": lamini_results["results"]["product_id_precision_score"],
-         }
-     }
-     results_trimmed["config"]["model_dtype"] = eval_request.precision
-     results_trimmed["config"]["model_name"] = eval_request.model
-     results_trimmed["config"]["model_sha"] = eval_request.revision
-
-     output = json.dumps(results_trimmed, indent=4)
-     print("output:", output)
-
-     return output
+     # This job runs lamini tasks and harness tasks
+
+     TASKS_HARNESS = [task.value.benchmark for task in HarnessTasks]
+
+     await run_evaluation(
+         eval_request=eval_request,
+         task_names=TASKS_HARNESS,
+         num_fewshot=0,
+         local_dir=EVAL_RESULTS_PATH_BACKEND,
+         results_repo=RESULTS_REPO,
+         batch_size=1,
+         device=DEVICE,
+         no_cache=True,
+         limit=LIMIT
+     )
  
  
  def main():
      # eval_request: EvalRequest(model='meta-llama/Llama-2-7b-chat-hf', private=False, status='FINISHED', json_filepath='', weight_type='Original', model_type='\ud83d\udfe2 : pretrained', precision='bfloat16', base_model='', revision='main', submitted_time='2023-11-21T18:10:08Z', likes=0, params=0.1, license='custom')
-     vals = {"model": "NousResearch/Genstruct-7B", "json_filepath": "", "base_model": "", "revision": "main",
+     vals = {"model": "hf-internal-testing/tiny-random-gpt2", "json_filepath": "", "base_model": "", "revision": "main",
              "private": False,
              "precision": "bfloat16", "weight_type": "Original", "status": "PENDING",
              "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0,
main_backend.py CHANGED
@@ -15,10 +15,10 @@ from src.backend.sort_queue import sort_models_by_priority
  
  from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, \
      LIMIT, TOKEN, RUN_MODE
- from src.about import Tasks, NUM_FEWSHOT
+ from src.about import NUM_FEWSHOT, HarnessTasks
  import asyncio
  
- TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+ TASKS_HARNESS = [task.value.benchmark for task in HarnessTasks]
  
  logging.basicConfig(level=logging.ERROR)
  pp = pprint.PrettyPrinter(width=80)
output-data/test.txt ADDED
File without changes
src/about.py CHANGED
@@ -8,6 +8,12 @@ class Task:
      metric: str
      col_name: str
  
+ class HarnessTasks(Enum):
+     mmlu_flan_n_shot_generative_global_facts = Task("mmlu_flan_n_shot_generative_global_facts",
+                                                     "exact_match,strict-match", "MMLU (Global facts)")
+     mmlu_flan_n_shot_generative_formal_logic = Task("mmlu_flan_n_shot_generative_formal_logic",
+                                                     "exact_match,strict-match", "MMLU (Formal logic)")
+     truthfulqa_gen = Task("truthfulqa_gen", "bleu_max,none", "Truthful QA")
  
  # Select your tasks here
  # ---------------------------------------------------
@@ -20,6 +26,7 @@ class Tasks(Enum):
      mmlu_flan_n_shot_generative_formal_logic = Task("mmlu_flan_n_shot_generative_formal_logic",
                                                      "exact_match,strict-match", "MMLU (Formal logic)")
      truthfulqa_gen = Task("truthfulqa_gen", "bleu_max,none", "Truthful QA")
+
      response_subjective_score = Task("response_subjective_score", "response_subjective_score",
                                       "Subjective Response Score")
      product_id_precision_score = Task("product_id_precision_score", "product_id_precision_score",
@@ -42,19 +49,94 @@ This leaderboard evaluates and benchmarks LLM hallucinations when answering ques
  LLM_BENCHMARKS_TEXT = f"""
  ## How it works
  
+ All the requests are executed on the Lamini servers. Get your API key from [Lamini](https://app.lamini.ai/) today!
+
+ We run the following evaluation metrics:
+
+ 1. **MMLU (Global facts)**: Evaluates the model's ability to answer questions about global facts.
+ 2. **MMLU (Formal logic)**: Evaluates the model's ability to answer questions about formal logic.
+ 3. **Truthful QA**: Evaluates the model's ability to generate truthful answers.
+
+ ### Custom benchmarks:
+
+ We introduce 2 custom benchmarks run on an open-source product catalog dataset.
+
+ 4. **Subjective Response Score**: Subjectively scores and evaluates the model's ability to answer a customer's question about a product.
+ 5. **Product ID Precision Score**: An exact match of the product id answered by the model in response to a customer's question. This evaluates how well the model learns and its recall abilities.
+
+
+ Task:
+ Given a product catalog dataset, we evaluate the models on their ability to remember the product information correctly and answer a customer's question based on the product information.
+ The expectation here is that the models finetuned over this dataset should be able to generate the correct product information compared to the base model.
+
+ Prompt used to generate product information:
+ ```
+ You are an expert shopper at Instacart.
+ You are helping a customer find a product.
+ Include the product name, id, and detailed description in your answer.
+ A product id is a number between 0 and 100,000.
+ The customer asks\n
+ <question>
+ ```
+
+
+ We use Mistral (mistralai/Mistral-7B-Instruct-v0.1) to score the model answers.
+
+ Prompt to get the scoring rubric:
+ ```
+ Read this scoring rubric carefully and follow the instructions precisely:\n
+ A score of 5 means that model's id is the same as the gold answer's id.\n
+ A score of 4 means that the model's product name is the same or a paraphrase of the gold answer, but the id may be wrong. For example, the product names 'Tuna' and 'Canned Tuna' are similar\n
+ A score of 3 means that the model's description is similar as the gold answer's description, but the id and product name may be wrong. For example, lemonade and iced tea are different products.\n
+ A score of 2 means that the model's description is not similar to the gold answer, but the answer is plausible.\n
+ A score of 1 means that the model's description is not similar to the gold answer, and the answer doesn't make sense.\n
+
+ Here are three examples of how to score the model's response:\n
+ gold answer == Product ID: 1234, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Tuna, Description: Canned Tuna, score == 5\n
+ gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Canned Tuna, Description: Tuna, score == 4\n
+ gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Bubble Gum, Description: Delicious treat, score == 1\n
+ Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n
+ ```
+
+ Prompt used to score the model answers:
+ ```
+ A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n
+ Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n
+ <rubric prompt>
+
+ Use the full range. Read the gold answer carefully.
+ Explain your score in 2-3 short sentences not exceeding 100 words each, then assign a score.
+ Output your score as a JSON object in the format <"explanation\" : str, \"score\" : int>
+ Use single quotes within your explanation. End your explanation with a double quote.\n
+ Prefer answers that are most similar to the gold answer, even if the gold answer refused to answer the question.\n\n
+ ========== question =========\n<question>\n\n
+ f"========== gold answer =========\n<expected response>\n\n
+ f"========== model answer =========\n<model response>\n\n
+ "=" * 40 + "\n\n
+ f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n
+ ```
+
+
  ## Reproducibility
- To run this code locally:
  
- 1. Change the environment variables in `src/env` and `src/about`.
+ First change the environment variables in `src/env`.
+
+ Now, there are 2 ways to run this code locally:
  
- 2. Run the following command to start the server locally:
+ 1. To run the backend server + UI locally, run the following command:
  ```
  ./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="<any huggingface model you want to try>"
  
  Eg:
  ./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="hf-internal-testing/tiny-random-gpt2"
  ```
- This does not upload/download any requests/results files.
+ This does not upload/download any requests/results files. It will run the eval metrics for the model specified and save the results to the output-data folder.
+
+ 2. To run only the backend server locally, make changes to adhoc.py to specify your model name and run the following command:
+ ```
+ ./run-adhoc.sh
+ ```
+ This does not upload/download any requests/results files. It will run the eval metrics for the model specified and save the results to the output-data folder.
  
  """
  
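Since the scoring prompt asks the judge for a JSON object of the form `{"explanation": str, "score": int}`, here is a minimal sketch of how those per-question judge outputs reduce to the two custom leaderboard numbers. It assumes the record shape written by `save_results` (the real aggregation lives in `EcommerceEvaluator.evaluate_hallucination`; the helper name below is illustrative only):

```python
# Sketch: aggregate judge outputs into the two custom metrics.
# Each record is assumed to look like the entries save_results writes:
# {"question": ..., "answer": {"explanation": str, "score": int}, "is_exact_match": bool}

def summarize_scores(scored_answers: list[dict]) -> dict:
    if not scored_answers:
        raise ValueError("No results to evaluate")
    # Mean of the 1-5 subjective scores assigned by the Mistral judge.
    response_subjective_score = sum(a["answer"]["score"] for a in scored_answers) / len(scored_answers)
    # Fraction of answers whose product id exactly matched the gold answer.
    product_id_precision_score = sum(int(a["is_exact_match"]) for a in scored_answers) / len(scored_answers)
    return {
        "response_subjective_score": response_subjective_score,
        "product_id_precision_score": product_id_precision_score,
    }
```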
 
src/backend/lamini_eval/evaluators/ecommerce_evaluator.py CHANGED
@@ -7,11 +7,10 @@ import lamini
  from lamini.generation.base_prompt_object import PromptObject
  from lamini.generation.generation_node import GenerationNode
  from lamini.generation.generation_pipeline import GenerationPipeline
- from tqdm import tqdm
- import hashlib
  from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
  from src.backend.lamini_eval.models.lamini_model import load_lamini_model
  from src.envs import DATASET_PATH, LAMINI_API_KEY
+ from src.backend.lamini_eval.evaluators.utils.ecommerce_utils import format_results, save_results
  
  lamini.api_key = LAMINI_API_KEY
  
@@ -46,70 +45,25 @@ class EcommerceEvaluator:
              i += 1
              yield PromptObject(prompt="", data=line)
  
-     async def save_results(self, answers, model_name):
-         path = f"/code/data/{model_name}"
-         os.makedirs(path, exist_ok=True)
-         path = path + "/model-answers.jsonl"
-         short_answers = []
-         with jsonlines.open(path, "w") as writer:
-             pbar = tqdm(desc="Saving answers", unit=" answers")
-             async for answer in answers:
-                 answer = {
-                     "prompt": answer.prompt,
-                     "question": answer.data["question"],
-                     "answer": answer.response,
-                     "is_exact_match": answer.data["is_exact_match"]
-                 }
-                 short_answers.append(answer)
-                 print(f"\n\n=======\n{answer}\n\n")
-                 writer.write(answer)
-                 pbar.update()
-         return short_answers
-
      async def evaluate_hallucination(self):
          dataset = self.load_shopping_dataset(DATASET_PATH)
          answers = AnswerScorePipeline(answer_model=self.model_name).call(dataset)
-         results = await self.save_results(answers, model_name=self.model_name)
-         print(f"evaluate_hallucination results: {results}")
+         results = await save_results(answers, model_name=self.model_name)
  
          try:
              mean_response_score = sum([item["answer"]["score"] for item in results]) / len(results)
             product_id_precision_score = sum([int(item["is_exact_match"]) for item in results]) / len(results)
-             results = self.format_results(model_name=self.model_name,
-                                           response_subjective_score=mean_response_score,
-                                           product_id_precision_score=product_id_precision_score)
+             results = format_results(model_name=self.model_name,
+                                      response_subjective_score=mean_response_score,
+                                      product_id_precision_score=product_id_precision_score)
          except ZeroDivisionError:
              raise ValueError("No results to evaluate")
          return results
  
-     def compute_hash(self, input):
-         m = hashlib.md5()
-         m.update(str(input).encode("utf-8"))
-         return m.hexdigest()
-
-     def format_results(self, model_name: str, response_subjective_score: float,
-                        product_id_precision_score: float) -> dict:
-         results = {
-             "config": {
-                 "model_dtype": "100",  # Precision with which you ran the evaluation
-                 "model_name": model_name,  # Name of the model
-                 "model_sha": self.compute_hash(model_name)  # Hash of the model
-             },
-             "results": {
-                 "response_subjective_score": {
-                     "response_subjective_score": response_subjective_score
-                 },
-                 "product_id_precision_score": {
-                     "product_id_precision_score": product_id_precision_score
-                 }
-             }
-         }
-
-         return results
-
  
  class AnswerScorePipeline(GenerationPipeline):
-     def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1", score_model="mistralai/Mistral-7B-Instruct-v0.1"):
+     def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1",
+                  score_model="mistralai/Mistral-7B-Instruct-v0.1"):
          super(AnswerScorePipeline, self).__init__()
  
          self.answer_generator = AnswerGenerator(model_name=answer_model)
src/backend/lamini_eval/evaluators/utils/ecommerce_utils.py ADDED
@@ -0,0 +1,52 @@
+ import hashlib
+ import os
+ import jsonlines
+ from tqdm import tqdm
+
+
+ def format_results(model_name: str, response_subjective_score: float,
+                    product_id_precision_score: float) -> dict:
+     results = {
+         "config": {
+             "model_dtype": "100",  # Precision with which you ran the evaluation
+             "model_name": model_name,  # Name of the model
+             "model_sha": compute_hash(model_name)  # Hash of the model
+         },
+         "results": {
+             "response_subjective_score": {
+                 "response_subjective_score": response_subjective_score
+             },
+             "product_id_precision_score": {
+                 "product_id_precision_score": product_id_precision_score
+             }
+         }
+     }
+
+     return results
+
+
+ def compute_hash(input):
+     m = hashlib.md5()
+     m.update(str(input).encode("utf-8"))
+     return m.hexdigest()
+
+
+ async def save_results(answers, model_name):
+     path = f"/code/data/{model_name}"
+     os.makedirs(path, exist_ok=True)
+     path = path + "/lamini-answers.jsonl"
+     short_answers = []
+     with jsonlines.open(path, "w") as writer:
+         pbar = tqdm(desc="Saving answers", unit=" answers")
+         async for answer in answers:
+             answer = {
+                 "prompt": answer.prompt,
+                 "question": answer.data["question"],
+                 "answer": answer.response,
+                 "is_exact_match": answer.data["is_exact_match"]
+             }
+             short_answers.append(answer)
+             print(f"\n\n=======\n{answer}\n\n")
+             writer.write(answer)
+             pbar.update()
+     return short_answers
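For context, a short sketch of how the new `format_results` helper is used; the output shape follows directly from the function above, while the input values here are illustrative placeholders, not results from this PR:

```python
# Illustrative only: format_results builds the leaderboard-shaped payload
# from the two aggregate scores computed over the saved answers.
from src.backend.lamini_eval.evaluators.utils.ecommerce_utils import format_results

payload = format_results(
    model_name="hf-internal-testing/tiny-random-gpt2",  # example model name used elsewhere in this PR
    response_subjective_score=4.2,                      # mean 1-5 judge score (placeholder)
    product_id_precision_score=0.85,                    # exact-match rate (placeholder)
)
print(payload["results"]["product_id_precision_score"])
# {'product_id_precision_score': 0.85}
```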
src/backend/lamini_eval/models/lamini_model.py CHANGED
@@ -1,6 +1,6 @@
  import os
  from typing import List
-
+ import jsonlines
  import lamini
  from lamini import MistralRunner
  from lm_eval.api.model import LM
@@ -28,35 +28,48 @@ class LaminiModel(LM):
      def __call__(self, prompt, output_type):
          return self.runner(prompt=prompt, output_type=output_type)
  
+     def get_helm_response(self, request):
+         question = request.arguments[0]
+         if request.task_name == "truthfulqa_gen":
+             obj = TruthfulQA_Evaluator()
+             prompt = obj.get_prompt(question)
+             try:
+                 response = self.runner(prompt=prompt)
+             except Exception as e:
+                 # select random answer
+                 print("Error fetching lamini response: ", e)
+                 response = "\nA: none"
+
+         else:
+             obj = MMLU_Evaluator()
+             prompt = obj.get_prompt(question)
+             try:
+                 op_type = {"explanation": "str", "answer": "str"}
+                 answer = self.runner(prompt=prompt, output_type=op_type)
+                 response = f"({answer['answer']})"
+             except Exception as e:
+                 # select random answer
+                 print("Error fetching lamini response: ", e)
+                 # random answer
+                 response = "(A)"
+         return response
+
      def generate_until(self, requests) -> List[str]:
          print("inside generate_until")
  
          res = []
-         for request in tqdm(requests):
-             question = request.arguments[0]
-             if request.task_name == "truthfulqa_gen":
-                 obj = TruthfulQA_Evaluator()
-                 prompt = obj.get_prompt(question)
-                 try:
-                     response = self.runner(prompt=prompt)
-                 except Exception as e:
-                     # select random answer
-                     print("Error fetching lamini response: ", e)
-                     response = "\nA: none"
-             else:
-                 obj = MMLU_Evaluator()
-                 prompt = obj.get_prompt(question)
-                 try:
-                     op_type = {"explanation": "str", "answer": "str"}
-                     answer = self.runner(prompt=prompt, output_type=op_type)
-                     response = f"({answer['answer']})"
-                 except Exception as e:
-                     # select random answer
-                     print("Error fetching lamini response: ", e)
-                     # random answer
-                     response = "(A)"
-             res.append(response)
-             self.cache_hook.add_partial("generate_until", request, response)
+         path = f"/code/data/{self.model}"
+         os.makedirs(path, exist_ok=True)
+         path = path + "/helm-answers.jsonl"
+
+         with jsonlines.open(path, "w") as writer:
+             for request in tqdm(requests):
+                 write_dict = request.__dict__
+                 response = self.get_helm_response(request)
+                 write_dict["model_response"] = response
+                 writer.write(write_dict)
+                 res.append(response)
+                 self.cache_hook.add_partial("generate_until", request, response)
          return res
  
      def loglikelihood_rolling(self, requests):
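Since `generate_until` now logs every request and response to `/code/data/<model>/helm-answers.jsonl`, a quick spot-check of a run might look like the sketch below; the path and field names come from the diff, but the snippet itself is not part of the PR:

```python
# Inspect the per-request log written by LaminiModel.generate_until.
import jsonlines

model = "hf-internal-testing/tiny-random-gpt2"  # example model name from this PR
with jsonlines.open(f"/code/data/{model}/helm-answers.jsonl") as reader:
    for record in reader:
        # Each record is the request's __dict__ plus the generated text.
        print(record.get("task_name"), "->", record["model_response"])
```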
src/backend/run_eval_suite.py CHANGED
@@ -10,7 +10,7 @@ from src.backend.harness_evaluator import HarnessEvaluator
  
  logging.getLogger("openai").setLevel(logging.WARNING)
  
-
+ # This function runs lamini tasks and harness tasks
  async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str,
                           results_repo: str, no_cache=True, limit=None):
      if limit:
@@ -29,12 +29,12 @@ async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, bat
                                          output_base_path='logs')
      results = harness_evaluator.evaluate(task_names)
  
-     print("results:", results)
      results_trimmed = {
          "config": results["config"],
          "results": {
-             "mmlu_flan_n_shot_generative_astronomy": results["results"]["mmlu_flan_n_shot_generative_astronomy"],
-             "mmlu_flan_n_shot_generative_anatomy": results["results"]["mmlu_flan_n_shot_generative_anatomy"],
+             "mmlu_flan_n_shot_generative_global_facts": results["results"]["mmlu_flan_n_shot_generative_global_facts"],
+             "mmlu_flan_n_shot_generative_formal_logic": results["results"]["mmlu_flan_n_shot_generative_formal_logic"],
+             "truthfulqa_gen": results["results"]["truthfulqa_gen"],
              "response_subjective_score": lamini_results["results"]["response_subjective_score"],
              "product_id_precision_score": lamini_results["results"]["product_id_precision_score"],
          }