ayushi-lamini committed
Commit 5c9a1c4 • 1 Parent(s): 93bc043
More changes (#4)
Browse files
- app.py +86 -86
- src/backend/eval/evaluators/openai_eval/main.py +25 -16
- src/backend/eval/evaluators/openai_eval/openai_earnings_evaluator.py +3 -1
- src/backend/eval/evaluators/openai_eval/openai_ecommerce_evaluator.py +11 -4
- src/backend/eval/evaluators/openai_eval/openai_icd_evaluator.py +10 -4
- src/backend/eval/models/gpt4_model.py +13 -4
- src/backend/run_eval_suite.py +2 -0
app.py
CHANGED
@@ -262,92 +262,92 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-        … (85 further removed lines not rendered in this view)
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        #
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+        #
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+        #
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16" if DEVICE != "cpu" else "float32",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+        #
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
src/backend/eval/evaluators/openai_eval/main.py
CHANGED
@@ -1,11 +1,14 @@
 from tqdm import tqdm
-from lamini import MistralRunner
+from lamini import MistralRunner, Lamini
 import lamini

 from src.envs import LAMINI_API_KEY

 lamini.api_key = LAMINI_API_KEY

+lamini.max_workers = 40
+lamini.batch_size = 50
+

 async def evaluate_openai_model(model, dataset, max_examples):
     for example in tqdm(dataset, desc="Evaluating", total=max_examples):
@@ -17,26 +20,26 @@ def evaluate(model, example):
     prompt = example.get_prompt()

     response = model(prompt=prompt, output_type=example.get_output_type())
+    response["score"] = score(
+        example.format_response(response),
+        example.get_response(response),
+        example.get_question(),
+        example.get_rubric()
+    )

     result = {
         "prompt": prompt,
         "response": response,
         "reference_response": example.get_response_json(),
-        "is_exact_match": example.is_exact_match(response)
-        "score": score(
-            example.format_response(response),
-            example.get_response(response),
-            example.get_question(),
-            example.get_rubric()
-        ),
+        "is_exact_match": example.is_exact_match(response)
     }
     return result


 def score(response, expected_response, question, rubric):
-
+    api = Lamini(model_name="mistralai/Mistral-7B-Instruct-v0.2")

-    prompt = "A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. …
+    prompt = "<s>[INST] A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n\n"
     prompt += f"Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n"
     prompt += rubric
     prompt += "Use the full range. Read the gold answer carefully. "
@@ -49,12 +52,18 @@ def score(response, expected_response, question, rubric):
     prompt += f"========== model answer =========\n{response}\n\n"
     prompt += "=" * 40 + "\n\n"
     prompt += f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n"
+    prompt += " [/INST]"

-
-
-
-
+    try:
+        lamini_score = api.generate(
+            prompt=prompt,
+            output_type={"explanation": "str", "score": "int"},
+            max_new_tokens=150
+        )
+    except Exception as e:
+        print("hit lamini scoring exception:", e)
+        lamini_score = {"explanation": "", "score": 0}

-    print(prompt, "\nModel score:", …
+    print(prompt, "\nModel score:", lamini_score)

-    return …
+    return lamini_score["score"]
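For readers following the scoring change: the new score() asks a Mistral-7B-Instruct judge, via Lamini's structured-output API, to grade the model answer against the gold answer on a 1-5 scale. Below is a minimal, self-contained sketch of that pattern; the API key, the rubric text, and the middle of the prompt (the hunk above shows only its beginning and end) are illustrative placeholders, not values from this repository.

import lamini
from lamini import Lamini

lamini.api_key = "YOUR_LAMINI_API_KEY"  # placeholder; the app loads this from src.envs

def score_with_judge(model_answer, gold_answer, question, rubric):
    # Ask a Mistral judge for a structured {"explanation", "score"} verdict,
    # mirroring the score() added in this commit.
    api = Lamini(model_name="mistralai/Mistral-7B-Instruct-v0.2")
    prompt = (
        "<s>[INST] A model is going to answer a question. Your job is to score the answer, "
        "comparing it to a golden reference. You are an expert scorer.\n\n"
        "Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n"
        f"{rubric}\n"
        # the question/gold-answer sections below are illustrative; their exact wording is not shown in the diff
        f"========== question =========\n{question}\n\n"
        f"========== gold answer =========\n{gold_answer}\n\n"
        f"========== model answer =========\n{model_answer}\n\n"
        "How would you score the model's answer compared to the gold answer "
        "(using the 1-5 scale defined above)?\n\n [/INST]"
    )
    try:
        verdict = api.generate(
            prompt=prompt,
            output_type={"explanation": "str", "score": "int"},
            max_new_tokens=150,
        )
    except Exception as e:
        # Fall back to a zero score on API errors, as the evaluator does.
        print("hit lamini scoring exception:", e)
        verdict = {"explanation": "", "score": 0}
    return verdict["score"]

# Illustrative call with made-up strings:
# score_with_judge("Paris", "Paris, France", "What is the capital of France?", "Score factual accuracy only.")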
src/backend/eval/evaluators/openai_eval/openai_earnings_evaluator.py
CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import EARNINGS_DATASET_PATH
+from src.envs import EARNINGS_DATASET_PATH, MAX_EXAMPLES


 def load_earnings_dataset():
@@ -18,6 +18,8 @@ class EarningsCallsDataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield EarningsCallsExample(index, obj)

     def get_length(self):
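The only functional change here is a cap on how many rows each dataset iterator yields. A minimal sketch of the pattern with a placeholder MAX_EXAMPLES value (the real constant comes from src.envs and its value is not shown in this diff):

from itertools import islice

MAX_EXAMPLES = 100  # placeholder value for illustration

def capped_iter(records):
    # Same early-exit pattern as the __iter__ methods in this commit.
    for index, obj in enumerate(records):
        if index == MAX_EXAMPLES:
            break
        yield obj

# Equivalent formulation with itertools.islice:
assert list(capped_iter(range(1000))) == list(islice(range(1000), MAX_EXAMPLES))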
src/backend/eval/evaluators/openai_eval/openai_ecommerce_evaluator.py
CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import SHOPPING_DATASET_PATH
+from src.envs import SHOPPING_DATASET_PATH, MAX_EXAMPLES


 def load_shopping_dataset():
@@ -18,6 +18,8 @@ class ShoppingDataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield ShoppingExample(index, obj)

     def get_length(self):
@@ -73,12 +75,17 @@ class ShoppingExample:
         }

     def format_response(self, response):
-        formatted_response = …
+        formatted_response = ""

-        if …
+        if 'product_id' in response:
+            formatted_response += f"Product ID: {response['product_id']}\n"
+        else:
+            formatted_response += "Product ID: N/A\n"
+
+        if ("product_name" in response) and response['product_name'] != "N/A":
             formatted_response += f"Product Name: {response['product_name']}\n"

-        if response['product_description'] != "N/A":
+        if ("product_description" in response) and response['product_description'] != "N/A":
             formatted_response += f"Description: {response['product_description']}"

         return formatted_response
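format_response now tolerates responses that omit product_id, product_name, or product_description. A minimal sketch of the same defensive idea using dict.get; the function name and the sample response are illustrative, not taken from the dataset:

def format_shopping_response(response: dict) -> str:
    # Never raise KeyError on a missing field; fall back to "N/A" like the commit does.
    formatted = f"Product ID: {response.get('product_id', 'N/A')}\n"
    name = response.get("product_name", "N/A")
    if name != "N/A":
        formatted += f"Product Name: {name}\n"
    description = response.get("product_description", "N/A")
    if description != "N/A":
        formatted += f"Description: {description}"
    return formatted

# Illustrative call with a made-up, partially filled response:
print(format_shopping_response({"product_name": "Desk Lamp"}))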
src/backend/eval/evaluators/openai_eval/openai_icd_evaluator.py
CHANGED
@@ -1,5 +1,5 @@
 import jsonlines
-from src.envs import ICD_DATASET_PATH
+from src.envs import ICD_DATASET_PATH, MAX_EXAMPLES


 def load_icd_dataset():
@@ -18,6 +18,8 @@ class ICD11Dataset:
     def __iter__(self):
         with jsonlines.open(self.path) as reader:
             for index, obj in enumerate(reader):
+                if index == MAX_EXAMPLES:
+                    break
                 yield ICD11Example(index, obj)

     def get_length(self):
@@ -43,7 +45,7 @@ class ICD11Example:
         return prompt

     def is_exact_match(self, response):
-        return self.example["entity"]["code"] == response[" …
+        return self.example["entity"]["code"] == response["icd11_code"]

     def get_question(self):
         return self.example["question"]
@@ -69,9 +71,13 @@ class ICD11Example:
         }

     def format_response(self, response):
-        formatted_response = …
+        formatted_response = ""
+        if 'icd11_code' in response:
+            formatted_response += f"ICD11 Code: {response['icd11_code']}\n"
+        else:
+            formatted_response += "ICD11 Code: N/A\n"

-        if response['answer'] != "N/A":
+        if ('answer' in response) and response['answer'] != "N/A":
             formatted_response += f"Answer: {response['answer']}\n"

         return formatted_response
src/backend/eval/models/gpt4_model.py
CHANGED
@@ -36,7 +36,7 @@ class GPT4Model(LM):

         json_object = json.loads(response.choices[0].message.content)

-        validate_output(json_object, output_type)
+        # validate_output(json_object, output_type)

         return json_object

@@ -46,7 +46,11 @@ class GPT4Model(LM):
         try:
             question = request.doc["question"]
             result = self.__call__(prompt=question, output_type={"answer": "str"})
-            response = result['answer']
+            response = result['answer'].strip()
+            print("\n\ntask_name: ", request.task_name)
+            print("helm prompt: ", question)
+            print("helm response: ", response)
+            print("\n\n")
         except Exception as e:
             print("Error fetching gpt4 response: ", e)
             # select random answer
@@ -56,8 +60,13 @@ class GPT4Model(LM):
         try:
             question = request.arguments[0]
             op_type = {"explanation": "str", "answer": "str"}
-
-
+            prompt = f"The 'answer' should only include the option name. {question}"
+            result = self.__call__(prompt=prompt, output_type=op_type)
+            response = f"({result['answer'].strip()})"
+            print("\n\ntask_name: ", request.task_name)
+            print("helm prompt: ", question)
+            print("helm response: ", response)
+            print("\n\n")
         except Exception as e:
             print("Error fetching gpt4 response: ", e)
             # select random answer
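In the multiple-choice path, the prompt now tells GPT-4 to put only the option name in the structured 'answer' field, and the result is wrapped in parentheses before being handed back to the harness. A minimal sketch of that post-processing with a stub in place of GPT4Model.__call__ (the stub and the function name are illustrative):

def format_choice(question: str, call_model) -> str:
    # call_model stands in for GPT4Model.__call__(prompt=..., output_type=...).
    op_type = {"explanation": "str", "answer": "str"}
    prompt = f"The 'answer' should only include the option name. {question}"
    result = call_model(prompt=prompt, output_type=op_type)
    # Strip whitespace and wrap in parentheses, matching the "(X)" form the commit produces.
    return f"({result['answer'].strip()})"

# Illustrative usage with a stubbed model call:
stub = lambda prompt, output_type: {"explanation": "B is correct.", "answer": " B "}
print(format_choice("Which option is correct? (A) foo (B) bar", stub))  # prints "(B)"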
src/backend/run_eval_suite.py
CHANGED
@@ -23,12 +23,14 @@ async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, bat
                                           batch_size, device, no_cache, limit, write_out=True,
                                           output_base_path='logs')
     lamini_results = await custom_evaluator.evaluate()
+    print("lamini_results:", lamini_results)

     print(f"Selected Harness Tasks: {task_names}")
     harness_evaluator = HarnessEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
                                          batch_size, device, no_cache, limit, write_out=True,
                                          output_base_path='logs')
     harness_results = harness_evaluator.evaluate(task_names)
+    print("harness_results:", harness_results)


     results_trimmed = {