ayushi0430 committed • 6c9df03 • 1 Parent(s): efe0c89

add sys tags, pass model

Files changed:
- generation.py +0 -1
- main_backend.py +5 -3
- src/backend/lamini_eval/evaluators/ecommerce_evaluator.py +36 -31
- src/leaderboard/read_evals.py +1 -1
- start.sh +3 -1
generation.py
CHANGED
@@ -2,7 +2,6 @@ from lamini.generation.generation_node import GenerationNode
 from lamini.generation.generation_pipeline import GenerationPipeline
 from lamini.generation.base_prompt_object import PromptObject
 from typing import Union, Iterator, AsyncIterator
-from src.backend.lamini_eval.datasets.ecommerce.shopping_dataset import load_shopping_dataset
 from src.envs import DATASET_PATH, LAMINI_API_KEY
 import logging
 import asyncio

main_backend.py
CHANGED
@@ -28,11 +28,13 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
-
 # TODO: uncomment
 if RUN_MODE != "LOCAL":
-    snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset",
-
+    snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset",
+                      max_workers=60, token=TOKEN)
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset",
+                      max_workers=60, token=TOKEN)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]

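Note: snapshot_download here is the standard huggingface_hub call that mirrors a Hub repo into a local directory. A minimal self-contained sketch of the two calls added above; the repo IDs, paths, and token below are placeholders standing in for RESULTS_REPO, QUEUE_REPO, the backend paths, and TOKEN:

    # Sketch only: placeholder values stand in for the Space's env config.
    from huggingface_hub import snapshot_download

    for repo_id, local_dir in [
        ("my-org/eval-results", "./eval-results"),  # placeholder for RESULTS_REPO
        ("my-org/eval-queue", "./eval-queue"),      # placeholder for QUEUE_REPO
    ]:
        snapshot_download(
            repo_id=repo_id,
            revision="main",
            local_dir=local_dir,
            repo_type="dataset",
            max_workers=60,   # parallel download workers, as in the diff
            token="hf_...",   # placeholder for TOKEN
        )
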
src/backend/lamini_eval/evaluators/ecommerce_evaluator.py
CHANGED
@@ -7,10 +7,9 @@ from lamini.generation.base_prompt_object import PromptObject
 from lamini.generation.generation_node import GenerationNode
 from lamini.generation.generation_pipeline import GenerationPipeline
 from tqdm import tqdm
-
-from src.backend.lamini_eval.datasets.ecommerce.shopping_dataset import load_shopping_dataset
-from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
-from src.backend.lamini_eval.models.lamini_model import load_lamini_model
+import hashlib
+# from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
+# from src.backend.lamini_eval.models.lamini_model import load_lamini_model
 from src.envs import DATASET_PATH, LAMINI_API_KEY
 
 lamini.api_key = LAMINI_API_KEY
@@ -31,11 +30,11 @@ class EcommerceEvaluator:
         self.max_examples = max_examples
         self.batch_size = batch_size
 
-    def load_model(self):
-        if self.model_type == "gpt4":
-            return load_gpt4_model()
-
-        return load_lamini_model(self.model_name)
+    # def load_model(self):
+    #     if self.model_type == "gpt4":
+    #         return load_gpt4_model()
+    #
+    #     return load_lamini_model(self.model_name)
 
     async def load_shopping_dataset(self, path):
         i = 0
@@ -59,20 +58,15 @@ class EcommerceEvaluator:
                 "is_exact_match": answer.data["is_exact_match"]
             }
             short_answers.append(answer)
-            print(f"\n\n=======\n{answer}\n\n")
+            # print(f"\n\n=======\n{answer}\n\n")
             writer.write(answer)
             pbar.update()
         return short_answers
 
-    async def evaluate(self):
-        dataset = load_shopping_dataset(DATASET_PATH)
-
-        answers = AnswerScorePipeline().call(dataset)
-        short_answers = await self.save_results(answers)
-        return short_answers
-
     async def evaluate_hallucination(self):
-
+        dataset = self.load_shopping_dataset(DATASET_PATH)
+        answers = AnswerScorePipeline(answer_model=self.model_name).call(dataset)
+        results = await self.save_results(answers)
         mean_response_score = sum([item["answer"]["score"] for item in results]) / len(results)
         product_id_precision_score = sum([int(item["is_exact_match"]) for item in results]) / len(results)
         results = self.format_results(model_name=self.model_name,
@@ -80,13 +74,18 @@ class EcommerceEvaluator:
                                       product_id_precision_score=product_id_precision_score)
         return results
 
+    def compute_hash(self, input):
+        m = hashlib.md5()
+        m.update(str(input).encode("utf-8"))
+        return m.hexdigest()
+
     def format_results(self, model_name: str, response_subjective_score: float,
                        product_id_precision_score: float) -> dict:
         results = {
             "config": {
                 "model_dtype": "100",  # Precision with which you ran the evaluation
                 "model_name": model_name,  # Name of the model
-                "model_sha":
+                "model_sha": self.compute_hash(model_name)  # Hash of the model
             },
             "results": {
                 "response_subjective_score": {
@@ -102,11 +101,11 @@ class EcommerceEvaluator:
 
 
 class AnswerScorePipeline(GenerationPipeline):
-    def __init__(self):
+    def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1", score_model="mistralai/Mistral-7B-Instruct-v0.1"):
         super(AnswerScorePipeline, self).__init__()
 
-        self.answer_generator = AnswerGenerator()
-        self.score_generator = ScoreGenerator()
+        self.answer_generator = AnswerGenerator(model_name=answer_model)
+        self.score_generator = ScoreGenerator(model_name=score_model)
 
     def forward(self, x):
         ans = self.answer_generator(x)
@@ -115,9 +114,10 @@ class AnswerScorePipeline(GenerationPipeline):
 
 
 class AnswerGenerator(GenerationNode):
-    def __init__(self):
+    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.1"):
         super(AnswerGenerator, self).__init__(
-            model_name
+            model_name,
+            max_tokens=150
         )
 
     def generate(
@@ -155,22 +155,25 @@ class AnswerGenerator(GenerationNode):
         yield prompt
 
     def make_prompt(self, chunk):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         prompt = (
-            "
+            "<s>[INST] <<SYS>>\nYou are an expert shopper at Instacart.\n<</SYS>>\n\n"
         )
         prompt += "You are helping a customer find a product. "
        prompt += "Include the product name, id, and detailed description in your answer. "
         prompt += "A product id is a number between 0 and 100,000. "
         prompt += "The customer asks\n"
         prompt += chunk.data["question"]
+        prompt += " [/INST]"
         print("answer prompt:", prompt)
         return prompt
 
 
 class ScoreGenerator(GenerationNode):
-    def __init__(self):
+    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.1"):
         super(ScoreGenerator, self).__init__(
-            model_name=
+            model_name=model_name,
+            max_tokens=150
         )
 
     def generate(
@@ -193,8 +196,9 @@ class ScoreGenerator(GenerationNode):
         yield result
 
     def get_rubric(self):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         rubric = (
-            "
+            " <s>[INST] <<SYS>>\nRead this scoring rubric carefully and follow the instructions precisely:\n<</SYS>>\n\n"
         )
         rubric += "A score of 5 means that model's id is the same as the gold answer's id.\n"
         rubric += "A score of 4 means that the model's product name is the same or a paraphrase of the gold answer, but the id may be wrong. For example, the product names 'Tuna' and 'Canned Tuna' are similar\n"
@@ -207,7 +211,7 @@ class ScoreGenerator(GenerationNode):
         rubric += "gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Canned Tuna, Description: Tuna, score == 4\n"
         rubric += "gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Bubble Gum, Description: Delicious treat, score == 1\n"
         rubric += "Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n"
-
+        rubric += " [/INST]"
         return rubric
 
     def is_exact_match(self, prompt):
@@ -247,8 +251,9 @@ class ScoreGenerator(GenerationNode):
         return formatted_response
 
     def make_prompt(self, chunk):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         prompt = (
-            "
+            "<s>[INST] <<SYS>>\nA model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n"
         )
         prompt += f"Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n"
         prompt += chunk.data["rubric"]
@@ -262,6 +267,6 @@ class ScoreGenerator(GenerationNode):
         prompt += f"========== model answer =========\n{chunk.data['response']}\n\n"
         prompt += "=" * 40 + "\n\n"
         prompt += f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n"
-
+        prompt += " [/INST]"
         print("score prompt:", prompt)
         return prompt

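Note: the bulk of this diff hand-wraps every prompt in the Mistral/Llama-2 instruct template (the "sys tags" of the commit message) and threads model_name through the pipeline instead of hardcoding it. A minimal sketch of that template and of the md5 "model_sha" scheme; make_instruct_prompt is a hypothetical helper for illustration, not a function in this repo:

    import hashlib

    def make_instruct_prompt(system: str, user: str) -> str:
        # Mistral/Llama-2 instruct format, as in the comments in the diff:
        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
        return f"<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]"

    def compute_hash(value) -> str:
        # Same md5 scheme the commit uses to fill "model_sha".
        m = hashlib.md5()
        m.update(str(value).encode("utf-8"))
        return m.hexdigest()

    print(make_instruct_prompt("You are an expert shopper at Instacart.",
                               "Help me find canned tuna."))
    print(compute_hash("mistralai/Mistral-7B-Instruct-v0.1"))
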
src/leaderboard/read_evals.py
CHANGED
@@ -74,7 +74,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
         return self(

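Note: this one-line change rescales per-task accuracy from a 0-1 fraction to the 0-100 percentage the leaderboard displays. With illustrative values:

    import numpy as np

    accs = np.array([0.5, 0.7, 0.9])   # illustrative per-subtask accuracies
    mean_acc = np.mean(accs) * 100.0   # 70.0 instead of 0.7
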
start.sh
CHANGED
@@ -9,6 +9,8 @@
 # -o pipefail | produces a failure code if any stage fails
 set -Eeuoxa pipefail
 
+LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
 for ARGUMENT in "$@"
 do
     KEY=$(echo $ARGUMENT | cut -f1 -d=)
@@ -24,4 +26,4 @@ echo "Run mode is: $RUN_MODE"
 echo "Model passed is: $LOCAL_MODEL_NAME"
 
 docker buildx build --platform=linux/amd64 -t ldr .
-docker run -it --rm -p 7860:7860 --platform=linux/amd64 -e RUN_MODE=$RUN_MODE -e LOCAL_MODEL_NAME=$LOCAL_MODEL_NAME ldr python app.py
+docker run -it --rm -p 7860:7860 --platform=linux/amd64 -v $LOCAL_DIRECTORY/output-data:/code/data -e RUN_MODE=$RUN_MODE -e LOCAL_MODEL_NAME=$LOCAL_MODEL_NAME ldr python app.py
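Note: the script now resolves its own directory and bind-mounts output-data into the container at /code/data, so results written during the eval survive the --rm'd container. A rough Python equivalent via the docker SDK, for illustration only (the Space itself drives docker from this shell script, and the env values below are placeholders):

    # Rough equivalent of the updated `docker run`, using the docker SDK
    # (pip install docker). Assumes the `ldr` image has already been built.
    from pathlib import Path
    import docker

    local_directory = Path(__file__).resolve().parent  # like LOCAL_DIRECTORY

    docker.from_env().containers.run(
        "ldr",
        "python app.py",
        ports={"7860/tcp": 7860},
        volumes={str(local_directory / "output-data"): {"bind": "/code/data", "mode": "rw"}},
        environment={"RUN_MODE": "LOCAL", "LOCAL_MODEL_NAME": "my-model"},  # placeholders
        remove=True,
    )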