ayushi0430 committed on
Commit
283ff70
1 Parent(s): 42030b2
Dockerfile CHANGED
@@ -12,12 +12,8 @@ COPY src /code/src
  COPY app.py /code/app.py
  COPY main_backend.py /code/main_backend.py
  
- # to generate adhoc result files --> to be deleted once generation pipeline is stable
  COPY adhoc.py /code/adhoc.py
  
- # answer and score generation pipeline
- COPY generation.py /code/generation.py
-
  
  EXPOSE 7860
  
adhoc.py CHANGED
@@ -1,54 +1,31 @@
  import asyncio
- import json
-
- from src.backend.harness_evaluator import HarnessEvaluator
- from src.backend.lamini_evaluator import LaminiEvaluator
  from src.backend.manage_requests import EvalRequest
- from src.envs import LIMIT
+ from src.envs import LIMIT, EVAL_RESULTS_PATH_BACKEND, RESULTS_REPO, DEVICE
+ from src.backend.run_eval_suite import run_evaluation
+ from src.about import HarnessTasks
  
  
  async def run_adhoc_eval(eval_request: EvalRequest):
-     batch_size = 10
-
-     lamini_evaluator = LaminiEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
-                                        batch_size, "cpu", True, LIMIT, write_out=True,
-                                        output_base_path='logs')
-     lamini_results = await lamini_evaluator.evaluate()
-     print(f"lamini_results: {lamini_results}")
-
-     task_names = ["mmlu_flan_n_shot_generative_global_facts", "mmlu_flan_n_shot_generative_formal_logic", "truthfulqa_gen"]
-     # mmlu_flan_n_shot_generative
-     # truthfulqa_gen
-     # babi
-     evaluator = HarnessEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
-                                  batch_size, "", True, 100, write_out=True,
-                                  output_base_path='logs')
-     results = evaluator.evaluate(task_names)
-     print("harness results:", results)
-
-     results_trimmed = {
-         "config": results["config"],
-         "results": {
-             "mmlu_flan_n_shot_generative_global_facts": results["results"]["mmlu_flan_n_shot_generative_global_facts"],
-             "mmlu_flan_n_shot_generative_formal_logic": results["results"]["mmlu_flan_n_shot_generative_formal_logic"],
-             "truthfulqa_gen": results["results"]["truthfulqa_gen"],
-             "response_subjective_score": lamini_results["results"]["response_subjective_score"],
-             "product_id_precision_score": lamini_results["results"]["product_id_precision_score"],
-         }
-     }
-     results_trimmed["config"]["model_dtype"] = eval_request.precision
-     results_trimmed["config"]["model_name"] = eval_request.model
-     results_trimmed["config"]["model_sha"] = eval_request.revision
-
-     output = json.dumps(results_trimmed, indent=4)
-     print("output:", output)
-
-     return output
+     # This job runs lamini tasks and harness tasks
+
+     TASKS_HARNESS = [task.value.benchmark for task in HarnessTasks]
+
+     await run_evaluation(
+         eval_request=eval_request,
+         task_names=TASKS_HARNESS,
+         num_fewshot=0,
+         local_dir=EVAL_RESULTS_PATH_BACKEND,
+         results_repo=RESULTS_REPO,
+         batch_size=1,
+         device=DEVICE,
+         no_cache=True,
+         limit=LIMIT
+     )
  
  
  def main():
      # eval_request: EvalRequest(model='meta-llama/Llama-2-7b-chat-hf', private=False, status='FINISHED', json_filepath='', weight_type='Original', model_type='\ud83d\udfe2 : pretrained', precision='bfloat16', base_model='', revision='main', submitted_time='2023-11-21T18:10:08Z', likes=0, params=0.1, license='custom')
-     vals = {"model": "NousResearch/Genstruct-7B", "json_filepath": "", "base_model": "", "revision": "main",
+     vals = {"model": "hf-internal-testing/tiny-random-gpt2", "json_filepath": "", "base_model": "", "revision": "main",
              "private": False,
              "precision": "bfloat16", "weight_type": "Original", "status": "PENDING",
              "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0,
main_backend.py CHANGED
@@ -15,10 +15,10 @@ from src.backend.sort_queue import sort_models_by_priority
  
  from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, \
      LIMIT, TOKEN, RUN_MODE
- from src.about import Tasks, NUM_FEWSHOT
+ from src.about import NUM_FEWSHOT, HarnessTasks
  import asyncio
  
- TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+ TASKS_HARNESS = [task.value.benchmark for task in HarnessTasks]
  
  logging.basicConfig(level=logging.ERROR)
  pp = pprint.PrettyPrinter(width=80)
output-data/test.txt ADDED
File without changes
src/about.py CHANGED
@@ -8,6 +8,12 @@ class Task:
      metric: str
      col_name: str
  
+ class HarnessTasks(Enum):
+     mmlu_flan_n_shot_generative_global_facts = Task("mmlu_flan_n_shot_generative_global_facts",
+                                                     "exact_match,strict-match", "MMLU (Global facts)")
+     mmlu_flan_n_shot_generative_formal_logic = Task("mmlu_flan_n_shot_generative_formal_logic",
+                                                     "exact_match,strict-match", "MMLU (Formal logic)")
+     truthfulqa_gen = Task("truthfulqa_gen", "bleu_max,none", "Truthful QA")
  
  # Select your tasks here
  # ---------------------------------------------------
@@ -20,6 +26,7 @@ class Tasks(Enum):
      mmlu_flan_n_shot_generative_formal_logic = Task("mmlu_flan_n_shot_generative_formal_logic",
                                                      "exact_match,strict-match", "MMLU (Formal logic)")
      truthfulqa_gen = Task("truthfulqa_gen", "bleu_max,none", "Truthful QA")
+
      response_subjective_score = Task("response_subjective_score", "response_subjective_score",
                                       "Subjective Response Score")
      product_id_precision_score = Task("product_id_precision_score", "product_id_precision_score",
@@ -42,19 +49,94 @@ This leaderboard evaluates and benchmarks LLM hallucinations when answering ques
  LLM_BENCHMARKS_TEXT = f"""
  ## How it works
  
+ All the requests are executed on the Lamini servers. Get your API key from [Lamini](https://app.lamini.ai/) today!
+
+ We run the following evaluation metrics:
+
+ 1. **MMLU (Global facts)**: Evaluates the model's ability to answer questions about global facts.
+ 2. **MMLU (Formal logic)**: Evaluates the model's ability to answer questions about formal logic.
+ 3. **Truthful QA**: Evaluates the model's ability to generate truthful answers.
+
+ ### Custom benchmarks:
+
+ We introduce 2 custom benchmarks run on an open-source product catalog dataset.
+
+ 4. **Subjective Response Score**: Subjectively scores and evaluates the model's ability to answer a customer's question about a product.
+ 5. **Product ID Precision Score**: An exact match of the product id answered by the model in response to a customer's question. This evaluates how well the model learns and its recall abilities.
+
+
+ Task:
+ Given a product catalog dataset, we evaluate the models on their ability to remember the product information correctly and answer a customer's question based on the product information.
+ The expectation here is that the models finetuned over this dataset should be able to generate the correct product information compared to the base model.
+
+ Prompt used to generate product information:
+ ```
+ You are an expert shopper at Instacart.
+ You are helping a customer find a product.
+ Include the product name, id, and detailed description in your answer.
+ A product id is a number between 0 and 100,000.
+ The customer asks\n
+ <question>
+ ```
+
+
+ We use Mistral (mistralai/Mistral-7B-Instruct-v0.1) to score the model answers.
+
+ Prompt to get the scoring rubric:
+ ```
+ Read this scoring rubric carefully and follow the instructions precisely:\n
+ A score of 5 means that model's id is the same as the gold answer's id.\n
+ A score of 4 means that the model's product name is the same or a paraphrase of the gold answer, but the id may be wrong. For example, the product names 'Tuna' and 'Canned Tuna' are similar\n
+ A score of 3 means that the model's description is similar as the gold answer's description, but the id and product name may be wrong. For example, lemonade and iced tea are different products.\n
+ A score of 2 means that the model's description is not similar to the gold answer, but the answer is plausible.\n
+ A score of 1 means that the model's description is not similar to the gold answer, and the answer doesn't make sense.\n
+
+ Here are three examples of how to score the model's response:\n
+ gold answer == Product ID: 1234, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Tuna, Description: Canned Tuna, score == 5\n
+ gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Canned Tuna, Description: Tuna, score == 4\n
+ gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Bubble Gum, Description: Delicious treat, score == 1\n
+ Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n
+ ```
+
+ Prompt used to score the model answers:
+ ```
+ A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n
+ Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n
+ <rubric prompt>
+
+ Use the full range. Read the gold answer carefully.
+ Explain your score in 2-3 short sentences not exceeding 100 words each, then assign a score.
+ Output your score as a JSON object in the format <"explanation\" : str, \"score\" : int>
+ Use single quotes within your explanation. End your explanation with a double quote.\n
+ Prefer answers that are most similar to the gold answer, even if the gold answer refused to answer the question.\n\n
+ ========== question =========\n<question>\n\n
+ f"========== gold answer =========\n<expected response>\n\n
+ f"========== model answer =========\n<model response>\n\n
+ "=" * 40 + "\n\n
+ f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n
+ ```
+
+
  ## Reproducibility
- To run this code locally:
  
- 1. Change the environment variables in `src/env` and `src/about`.
+ First change the environment variables in `src/env`.
+
+ Now, there are 2 ways to run this code locally:
  
- 2. Run the following command to start the server locally:
+ 1. To run the backend server + UI locally, run the following command:
  ```
  ./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="<any huggingface model you want to try>"
  
  Eg:
  ./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="hf-internal-testing/tiny-random-gpt2"
  ```
- This does not upload/download any requests/results files.
+ This does not upload/download any requests/results files. It will run the eval metrics for the model specified and save the results to the output-data folder.
+
+ 2. To run only the backend server locally, make changes to adhoc.py to specify your model name and run the following command:
+ ```
+ ./run-adhoc.sh
+ ```
+ This does not upload/download any requests/results files. It will run the eval metrics for the model specified and save the results to the output-data folder.
  
  """
  
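Since the scoring prompt asks the judge for a JSON object of the form `{"explanation": str, "score": int}`, here is a minimal sketch of how those per-question judge outputs reduce to the two custom leaderboard numbers. It assumes the record shape written by `save_results` (the real aggregation lives in `EcommerceEvaluator.evaluate_hallucination`; the helper name below is illustrative only):

```python
# Sketch: aggregate judge outputs into the two custom metrics.
# Each record is assumed to look like the entries save_results writes:
# {"question": ..., "answer": {"explanation": str, "score": int}, "is_exact_match": bool}

def summarize_scores(scored_answers: list[dict]) -> dict:
    if not scored_answers:
        raise ValueError("No results to evaluate")
    # Mean of the 1-5 subjective scores assigned by the Mistral judge.
    response_subjective_score = sum(a["answer"]["score"] for a in scored_answers) / len(scored_answers)
    # Fraction of answers whose product id exactly matched the gold answer.
    product_id_precision_score = sum(int(a["is_exact_match"]) for a in scored_answers) / len(scored_answers)
    return {
        "response_subjective_score": response_subjective_score,
        "product_id_precision_score": product_id_precision_score,
    }
```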
 
src/backend/lamini_eval/evaluators/ecommerce_evaluator.py CHANGED
@@ -7,11 +7,10 @@ import lamini
  from lamini.generation.base_prompt_object import PromptObject
  from lamini.generation.generation_node import GenerationNode
  from lamini.generation.generation_pipeline import GenerationPipeline
- from tqdm import tqdm
- import hashlib
  from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
  from src.backend.lamini_eval.models.lamini_model import load_lamini_model
  from src.envs import DATASET_PATH, LAMINI_API_KEY
+ from src.backend.lamini_eval.evaluators.utils.ecommerce_utils import format_results, save_results
  
  lamini.api_key = LAMINI_API_KEY
  
@@ -46,70 +45,25 @@ class EcommerceEvaluator:
              i += 1
              yield PromptObject(prompt="", data=line)
  
-     async def save_results(self, answers, model_name):
-         path = f"/code/data/{model_name}"
-         os.makedirs(path, exist_ok=True)
-         path = path + "/model-answers.jsonl"
-         short_answers = []
-         with jsonlines.open(path, "w") as writer:
-             pbar = tqdm(desc="Saving answers", unit=" answers")
-             async for answer in answers:
-                 answer = {
-                     "prompt": answer.prompt,
-                     "question": answer.data["question"],
-                     "answer": answer.response,
-                     "is_exact_match": answer.data["is_exact_match"]
-                 }
-                 short_answers.append(answer)
-                 print(f"\n\n=======\n{answer}\n\n")
-                 writer.write(answer)
-                 pbar.update()
-         return short_answers
-
      async def evaluate_hallucination(self):
          dataset = self.load_shopping_dataset(DATASET_PATH)
          answers = AnswerScorePipeline(answer_model=self.model_name).call(dataset)
-         results = await self.save_results(answers, model_name=self.model_name)
-         print(f"evaluate_hallucination results: {results}")
+         results = await save_results(answers, model_name=self.model_name)
  
          try:
              mean_response_score = sum([item["answer"]["score"] for item in results]) / len(results)
             product_id_precision_score = sum([int(item["is_exact_match"]) for item in results]) / len(results)
-             results = self.format_results(model_name=self.model_name,
-                                           response_subjective_score=mean_response_score,
-                                           product_id_precision_score=product_id_precision_score)
+             results = format_results(model_name=self.model_name,
+                                      response_subjective_score=mean_response_score,
+                                      product_id_precision_score=product_id_precision_score)
          except ZeroDivisionError:
              raise ValueError("No results to evaluate")
          return results
  
-     def compute_hash(self, input):
-         m = hashlib.md5()
-         m.update(str(input).encode("utf-8"))
-         return m.hexdigest()
-
-     def format_results(self, model_name: str, response_subjective_score: float,
-                        product_id_precision_score: float) -> dict:
-         results = {
-             "config": {
-                 "model_dtype": "100",  # Precision with which you ran the evaluation
-                 "model_name": model_name,  # Name of the model
-                 "model_sha": self.compute_hash(model_name)  # Hash of the model
-             },
-             "results": {
-                 "response_subjective_score": {
-                     "response_subjective_score": response_subjective_score
-                 },
-                 "product_id_precision_score": {
-                     "product_id_precision_score": product_id_precision_score
-                 }
-             }
-         }
-
-         return results
-
  
  class AnswerScorePipeline(GenerationPipeline):
-     def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1", score_model="mistralai/Mistral-7B-Instruct-v0.1"):
+     def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1",
+                  score_model="mistralai/Mistral-7B-Instruct-v0.1"):
          super(AnswerScorePipeline, self).__init__()
  
          self.answer_generator = AnswerGenerator(model_name=answer_model)
src/backend/lamini_eval/evaluators/utils/ecommerce_utils.py ADDED
@@ -0,0 +1,52 @@
+ import hashlib
+ import os
+ import jsonlines
+ from tqdm import tqdm
+
+
+ def format_results(model_name: str, response_subjective_score: float,
+                    product_id_precision_score: float) -> dict:
+     results = {
+         "config": {
+             "model_dtype": "100",  # Precision with which you ran the evaluation
+             "model_name": model_name,  # Name of the model
+             "model_sha": compute_hash(model_name)  # Hash of the model
+         },
+         "results": {
+             "response_subjective_score": {
+                 "response_subjective_score": response_subjective_score
+             },
+             "product_id_precision_score": {
+                 "product_id_precision_score": product_id_precision_score
+             }
+         }
+     }
+
+     return results
+
+
+ def compute_hash(input):
+     m = hashlib.md5()
+     m.update(str(input).encode("utf-8"))
+     return m.hexdigest()
+
+
+ async def save_results(answers, model_name):
+     path = f"/code/data/{model_name}"
+     os.makedirs(path, exist_ok=True)
+     path = path + "/lamini-answers.jsonl"
+     short_answers = []
+     with jsonlines.open(path, "w") as writer:
+         pbar = tqdm(desc="Saving answers", unit=" answers")
+         async for answer in answers:
+             answer = {
+                 "prompt": answer.prompt,
+                 "question": answer.data["question"],
+                 "answer": answer.response,
+                 "is_exact_match": answer.data["is_exact_match"]
+             }
+             short_answers.append(answer)
+             print(f"\n\n=======\n{answer}\n\n")
+             writer.write(answer)
+             pbar.update()
+     return short_answers
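For context, a short sketch of how the new `format_results` helper is used; the output shape follows directly from the function above, while the input values here are illustrative placeholders, not results from this PR:

```python
# Illustrative only: format_results builds the leaderboard-shaped payload
# from the two aggregate scores computed over the saved answers.
from src.backend.lamini_eval.evaluators.utils.ecommerce_utils import format_results

payload = format_results(
    model_name="hf-internal-testing/tiny-random-gpt2",  # example model name used elsewhere in this PR
    response_subjective_score=4.2,                      # mean 1-5 judge score (placeholder)
    product_id_precision_score=0.85,                    # exact-match rate (placeholder)
)
print(payload["results"]["product_id_precision_score"])
# {'product_id_precision_score': 0.85}
```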
src/backend/lamini_eval/models/lamini_model.py CHANGED
@@ -1,6 +1,6 @@
  import os
  from typing import List
-
+ import jsonlines
  import lamini
  from lamini import MistralRunner
  from lm_eval.api.model import LM
@@ -28,35 +28,48 @@ class LaminiModel(LM):
      def __call__(self, prompt, output_type):
          return self.runner(prompt=prompt, output_type=output_type)
  
+     def get_helm_response(self, request):
+         question = request.arguments[0]
+         if request.task_name == "truthfulqa_gen":
+             obj = TruthfulQA_Evaluator()
+             prompt = obj.get_prompt(question)
+             try:
+                 response = self.runner(prompt=prompt)
+             except Exception as e:
+                 # select random answer
+                 print("Error fetching lamini response: ", e)
+                 response = "\nA: none"
+
+         else:
+             obj = MMLU_Evaluator()
+             prompt = obj.get_prompt(question)
+             try:
+                 op_type = {"explanation": "str", "answer": "str"}
+                 answer = self.runner(prompt=prompt, output_type=op_type)
+                 response = f"({answer['answer']})"
+             except Exception as e:
+                 # select random answer
+                 print("Error fetching lamini response: ", e)
+                 # random answer
+                 response = "(A)"
+         return response
+
      def generate_until(self, requests) -> List[str]:
          print("inside generate_until")
  
          res = []
-         for request in tqdm(requests):
-             question = request.arguments[0]
-             if request.task_name == "truthfulqa_gen":
-                 obj = TruthfulQA_Evaluator()
-                 prompt = obj.get_prompt(question)
-                 try:
-                     response = self.runner(prompt=prompt)
-                 except Exception as e:
-                     # select random answer
-                     print("Error fetching lamini response: ", e)
-                     response = "\nA: none"
-             else:
-                 obj = MMLU_Evaluator()
-                 prompt = obj.get_prompt(question)
-                 try:
-                     op_type = {"explanation": "str", "answer": "str"}
-                     answer = self.runner(prompt=prompt, output_type=op_type)
-                     response = f"({answer['answer']})"
-                 except Exception as e:
-                     # select random answer
-                     print("Error fetching lamini response: ", e)
-                     # random answer
-                     response = "(A)"
-             res.append(response)
-             self.cache_hook.add_partial("generate_until", request, response)
+         path = f"/code/data/{self.model}"
+         os.makedirs(path, exist_ok=True)
+         path = path + "/helm-answers.jsonl"
+
+         with jsonlines.open(path, "w") as writer:
+             for request in tqdm(requests):
+                 write_dict = request.__dict__
+                 response = self.get_helm_response(request)
+                 write_dict["model_response"] = response
+                 writer.write(write_dict)
+                 res.append(response)
+                 self.cache_hook.add_partial("generate_until", request, response)
          return res
  
      def loglikelihood_rolling(self, requests):
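Since `generate_until` now logs every request and response to `/code/data/<model>/helm-answers.jsonl`, a quick spot-check of a run might look like the sketch below; the path and field names come from the diff, but the snippet itself is not part of the PR:

```python
# Inspect the per-request log written by LaminiModel.generate_until.
import jsonlines

model = "hf-internal-testing/tiny-random-gpt2"  # example model name from this PR
with jsonlines.open(f"/code/data/{model}/helm-answers.jsonl") as reader:
    for record in reader:
        # Each record is the request's __dict__ plus the generated text.
        print(record.get("task_name"), "->", record["model_response"])
```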
src/backend/run_eval_suite.py CHANGED
@@ -10,7 +10,7 @@ from src.backend.harness_evaluator import HarnessEvaluator
  
  logging.getLogger("openai").setLevel(logging.WARNING)
  
-
+ # This function runs lamini tasks and harness tasks
  async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str,
                           results_repo: str, no_cache=True, limit=None):
      if limit:
@@ -29,12 +29,12 @@ async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, bat
                                          output_base_path='logs')
      results = harness_evaluator.evaluate(task_names)
  
-     print("results:", results)
      results_trimmed = {
          "config": results["config"],
          "results": {
-             "mmlu_flan_n_shot_generative_astronomy": results["results"]["mmlu_flan_n_shot_generative_astronomy"],
-             "mmlu_flan_n_shot_generative_anatomy": results["results"]["mmlu_flan_n_shot_generative_anatomy"],
+             "mmlu_flan_n_shot_generative_global_facts": results["results"]["mmlu_flan_n_shot_generative_global_facts"],
+             "mmlu_flan_n_shot_generative_formal_logic": results["results"]["mmlu_flan_n_shot_generative_formal_logic"],
+             "truthfulqa_gen": results["results"]["truthfulqa_gen"],
              "response_subjective_score": lamini_results["results"]["response_subjective_score"],
              "product_id_precision_score": lamini_results["results"]["product_id_precision_score"],
          }