ayushi0430 committed
Commit 6c9df03
1 Parent(s): efe0c89

add sys tags, pass model

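The "sys tags" in the commit message refer to the Mistral-7B-Instruct chat template (`<s>[INST] <<SYS>> ... <</SYS>> ... [/INST]`) that the prompt builders below now emit, and "pass model" refers to threading a model name through the generation pipeline. A minimal sketch of that wrapping, using a hypothetical helper name that is not part of this repo:

```python
# Illustrative only: wrap_instruct mirrors the template inlined into
# make_prompt/get_rubric in the diff below.
def wrap_instruct(system: str, user: str) -> str:
    return f"<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]"

print(wrap_instruct("You are an expert shopper at Instacart.",
                    "The customer asks\nWhere can I find canned tuna?"))
```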
generation.py CHANGED
@@ -2,7 +2,6 @@ from lamini.generation.generation_node import GenerationNode
 from lamini.generation.generation_pipeline import GenerationPipeline
 from lamini.generation.base_prompt_object import PromptObject
 from typing import Union, Iterator, AsyncIterator
-from src.backend.lamini_eval.datasets.ecommerce.shopping_dataset import load_shopping_dataset
 from src.envs import DATASET_PATH, LAMINI_API_KEY
 import logging
 import asyncio
main_backend.py CHANGED
@@ -28,11 +28,13 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
-
 # TODO: uncomment
 if RUN_MODE != "LOCAL":
-    snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
+    snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset",
+                      max_workers=60, token=TOKEN)
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset",
+                      max_workers=60, token=TOKEN)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
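The reformatted calls above use `huggingface_hub.snapshot_download` to sync the results and queue dataset repos before evaluation, skipped when running locally. A standalone sketch of the same pattern, with placeholder repo ids, local dirs, and token handling rather than this repo's `src.envs` values:

```python
# Placeholder repo ids, paths, and env vars; only the call pattern matches the diff.
import os
from huggingface_hub import snapshot_download

if os.environ.get("RUN_MODE", "LOCAL") != "LOCAL":
    for repo_id, local_dir in [("org/eval-results", "./eval-results-backend"),
                               ("org/eval-queue", "./eval-requests-backend")]:
        snapshot_download(repo_id=repo_id, revision="main", local_dir=local_dir,
                          repo_type="dataset", max_workers=60,
                          token=os.environ.get("HF_TOKEN"))
```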
src/backend/lamini_eval/evaluators/ecommerce_evaluator.py CHANGED
@@ -7,10 +7,9 @@ from lamini.generation.base_prompt_object import PromptObject
 from lamini.generation.generation_node import GenerationNode
 from lamini.generation.generation_pipeline import GenerationPipeline
 from tqdm import tqdm
-
-from src.backend.lamini_eval.datasets.ecommerce.shopping_dataset import load_shopping_dataset
-from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
-from src.backend.lamini_eval.models.lamini_model import load_lamini_model
+import hashlib
+# from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
+# from src.backend.lamini_eval.models.lamini_model import load_lamini_model
 from src.envs import DATASET_PATH, LAMINI_API_KEY
 
 lamini.api_key = LAMINI_API_KEY
@@ -31,11 +30,11 @@ class EcommerceEvaluator:
         self.max_examples = max_examples
         self.batch_size = batch_size
 
-    def load_model(self):
-        if self.model_type == "gpt4":
-            return load_gpt4_model()
-
-        return load_lamini_model(self.model_name)
+    # def load_model(self):
+    #     if self.model_type == "gpt4":
+    #         return load_gpt4_model()
+    #
+    #     return load_lamini_model(self.model_name)
 
     async def load_shopping_dataset(self, path):
         i = 0
@@ -59,20 +58,15 @@ class EcommerceEvaluator:
                     "is_exact_match": answer.data["is_exact_match"]
                 }
                 short_answers.append(answer)
-                print(f"\n\n=======\n{answer}\n\n")
+                # print(f"\n\n=======\n{answer}\n\n")
                 writer.write(answer)
                 pbar.update()
         return short_answers
 
-    async def evaluate(self):
-        dataset = load_shopping_dataset(DATASET_PATH)
-
-        answers = AnswerScorePipeline().call(dataset)
-        short_answers = await self.save_results(answers)
-        return short_answers
-
     async def evaluate_hallucination(self):
-        results = await self.evaluate()
+        dataset = self.load_shopping_dataset(DATASET_PATH)
+        answers = AnswerScorePipeline(answer_model=self.model_name).call(dataset)
+        results = await self.save_results(answers)
         mean_response_score = sum([item["answer"]["score"] for item in results]) / len(results)
         product_id_precision_score = sum([int(item["is_exact_match"]) for item in results]) / len(results)
         results = self.format_results(model_name=self.model_name,
@@ -80,13 +74,18 @@ class EcommerceEvaluator:
                                       product_id_precision_score=product_id_precision_score)
         return results
 
+    def compute_hash(self, input):
+        m = hashlib.md5()
+        m.update(str(input).encode("utf-8"))
+        return m.hexdigest()
+
     def format_results(self, model_name: str, response_subjective_score: float,
                        product_id_precision_score: float) -> dict:
         results = {
             "config": {
                 "model_dtype": "100", # Precision with which you ran the evaluation
                 "model_name": model_name, # Name of the model
-                "model_sha": "xuz" # Hash of the model
+                "model_sha": self.compute_hash(model_name) # Hash of the model
             },
             "results": {
                 "response_subjective_score": {
@@ -102,11 +101,11 @@ class EcommerceEvaluator:
 
 
 class AnswerScorePipeline(GenerationPipeline):
-    def __init__(self):
+    def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1", score_model="mistralai/Mistral-7B-Instruct-v0.1"):
         super(AnswerScorePipeline, self).__init__()
 
-        self.answer_generator = AnswerGenerator()
-        self.score_generator = ScoreGenerator()
+        self.answer_generator = AnswerGenerator(model_name=answer_model)
+        self.score_generator = ScoreGenerator(model_name=score_model)
 
     def forward(self, x):
         ans = self.answer_generator(x)
@@ -115,9 +114,10 @@ class AnswerScorePipeline(GenerationPipeline):
 
 
 class AnswerGenerator(GenerationNode):
-    def __init__(self):
+    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.1"):
         super(AnswerGenerator, self).__init__(
-            model_name="mistralai/Mistral-7B-Instruct-v0.1", max_tokens=150
+            model_name,
+            max_tokens=150
         )
 
     def generate(
@@ -155,22 +155,25 @@ class AnswerGenerator(GenerationNode):
         yield prompt
 
     def make_prompt(self, chunk):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         prompt = (
-            "You are an expert shopper at Instacart."
+            "<s>[INST] <<SYS>>\nYou are an expert shopper at Instacart.\n<</SYS>>\n\n"
         )
         prompt += "You are helping a customer find a product. "
         prompt += "Include the product name, id, and detailed description in your answer. "
         prompt += "A product id is a number between 0 and 100,000. "
         prompt += "The customer asks\n"
         prompt += chunk.data["question"]
+        prompt += " [/INST]"
         print("answer prompt:", prompt)
         return prompt
 
 
 class ScoreGenerator(GenerationNode):
-    def __init__(self):
+    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.1"):
         super(ScoreGenerator, self).__init__(
-            model_name="mistralai/Mistral-7B-Instruct-v0.1", max_tokens=150
+            model_name=model_name,
+            max_tokens=150
        )
 
     def generate(
@@ -193,8 +196,9 @@ class ScoreGenerator(GenerationNode):
         yield result
 
     def get_rubric(self):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         rubric = (
-            "Read this scoring rubric carefully and follow the instructions precisely:\n\n"
+            " <s>[INST] <<SYS>>\nRead this scoring rubric carefully and follow the instructions precisely:\n<</SYS>>\n\n"
         )
         rubric += "A score of 5 means that model's id is the same as the gold answer's id.\n"
         rubric += "A score of 4 means that the model's product name is the same or a paraphrase of the gold answer, but the id may be wrong. For example, the product names 'Tuna' and 'Canned Tuna' are similar\n"
@@ -207,7 +211,7 @@ class ScoreGenerator(GenerationNode):
         rubric += "gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Canned Tuna, Description: Tuna, score == 4\n"
         rubric += "gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Bubble Gum, Description: Delicious treat, score == 1\n"
         rubric += "Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n"
-
+        rubric += " [/INST]"
         return rubric
 
     def is_exact_match(self, prompt):
@@ -247,8 +251,9 @@ class ScoreGenerator(GenerationNode):
         return formatted_response
 
     def make_prompt(self, chunk):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         prompt = (
-            "A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n\n"
+            "<s>[INST] <<SYS>>\nA model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n"
         )
         prompt += f"Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n"
         prompt += chunk.data["rubric"]
@@ -262,6 +267,6 @@ class ScoreGenerator(GenerationNode):
         prompt += f"========== model answer =========\n{chunk.data['response']}\n\n"
         prompt += "=" * 40 + "\n\n"
         prompt += f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n"
-
+        prompt += " [/INST]"
         print("score prompt:", prompt)
         return prompt
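A hypothetical wiring of the updated classes, based only on the constructor signatures and the `evaluate_hallucination` flow introduced in this diff; the dataset iterator and evaluator construction are elided:

```python
# Model names are examples; presumably any model name the backend serves can be passed.
pipeline = AnswerScorePipeline(
    answer_model="mistralai/Mistral-7B-Instruct-v0.1",  # model under evaluation
    score_model="mistralai/Mistral-7B-Instruct-v0.1",   # judge model for the rubric
)
# answers = pipeline.call(dataset)                 # dataset: iterator of PromptObject items
# results = await evaluator.save_results(answers)  # as in evaluate_hallucination above
```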
src/leaderboard/read_evals.py CHANGED
@@ -74,7 +74,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
         return self(
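The only change here scales the per-task mean accuracy into a percentage before it is stored for the leaderboard. A quick numeric check of the new expression, with made-up accuracies:

```python
import numpy as np

accs = np.array([0.25, 0.75])     # hypothetical per-metric accuracies in [0, 1]
mean_acc = np.mean(accs) * 100.0  # 0.5 -> 50.0, i.e. a percentage
print(mean_acc)                   # 50.0
```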
start.sh CHANGED
@@ -9,6 +9,8 @@
 # -o pipefail | produces a failure code if any stage fails
 set -Eeuoxa pipefail
 
+LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
 for ARGUMENT in "$@"
 do
     KEY=$(echo $ARGUMENT | cut -f1 -d=)
@@ -24,4 +26,4 @@ echo "Run mode is: $RUN_MODE"
 echo "Model passed is: $LOCAL_MODEL_NAME"
 
 docker buildx build --platform=linux/amd64 -t ldr .
-docker run -it --rm -p 7860:7860 --platform=linux/amd64 -e RUN_MODE=$RUN_MODE -e LOCAL_MODEL_NAME=$LOCAL_MODEL_NAME ldr python app.py
+docker run -it --rm -p 7860:7860 --platform=linux/amd64 -v $LOCAL_DIRECTORY/output-data:/code/data -e RUN_MODE=$RUN_MODE -e LOCAL_MODEL_NAME=$LOCAL_MODEL_NAME ldr python app.py