# LlemmaFT / mse_deepeval_dataset.py
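"""Build a DeepEval EvaluationDataset of LLMTestCases from the MSE
question-answer test split (mse_text_img_QA_ds_test.jsonl).

For each selected question, the actual output is generated with the
Hudson/llemma:7b model served through Ollama (optionally prefixed with n-shot
examples), the expected output is the best answer from the dataset, and the
runner-up answer is kept as retrieval context. The resulting dataset is saved
as JSON under ./deepeval-test-dataset.

Example invocation (the dataset name is only illustrative):
    python mse_deepeval_dataset.py -n 50 -s 3 -d mse_llemma_3shot
"""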
import argparse
import json

import jsonlines
import ollama
# from deepeval.scorer import Scorer
# import docx
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import (
    ContextualRelevancyMetric,
    ContextualRecallMetric,
    ContextualPrecisionMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric,
)
from deepeval.models import OllamaModel
from deepeval.test_case import LLMTestCase
from transformers import AutoModelForCausalLM, AutoTokenizer

from Llemma_Finetuned import Llemma_Finetuned

# Run beforehand:
#   ollama run Hudson/llemma:7b
#   deepeval set-ollama Hudson/llemma:7b
if __name__ == "__main__":
    # Parse command-line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--num", help="Number of test cases to use")
    parser.add_argument("-s", "--shot", help="Number of n-shot inference examples")
    parser.add_argument("-d", "--dataset", help="Path to test case dataset")
    args = parser.parse_args()

    test_case_num = int(args.num)
    num_shot = int(args.shot)
    dataset_name = str(args.dataset)

    # Original (base) model served through Ollama:
    # model = ollama.pull(model="Hudson/llemma:7b")
    # OllamaModel(model="Hudson/llemma:7b")

    # Finetuned model:
    # llemma_model = AutoModelForCausalLM.from_pretrained("./train_llemma/merged_models/llemma_lora_merged")
    # tokenizer = AutoTokenizer.from_pretrained("./train_llemma/merged_models/llemma_lora_merged")
    # model = Llemma_Finetuned(model=llemma_model, tokenizer=tokenizer)

    # Row numbers used as n-shot examples (shorter examples, excluded from the
    # test cases below).
    sorted_rows = []
    with open("dataset_row_stl.txt", "r") as file:
        sorted_rows = file.readlines()
    # print(sorted_rows)
    sorted_rows = [int(x) for x in sorted_rows[0:num_shot]]
    print("Read in sorted rows.")
examples = "Here are " + str(num_shot) + " examples of math questions (Q) with given answers (A).\n"
with jsonlines.open("mse_text_img_QA_ds_test.jsonl", mode='r') as fp:
#with open("mse_text_img_QA_ds_test.jsonl", mode='r') as fp:
n = 0
for j, data in enumerate(fp):
if j + 1 in sorted_rows:
print("Num shot row " + str(j + 1))
# data = json.loads(line)
examples += "Q: " + data["body"] + "\n\n"
is_accepted = False
best_score = float('-inf')
output_text = ""
for i in range(len(data["answers"])):
if bool(data["answers"][i]["accepted"]) == True:
if is_accepted == False:
is_accepted = True
best_score = int(data["answers"][i]["score"])
output_text = data["answers"][i]["body"]
elif int(data["answers"][i]["score"]) > best_score:
best_score = int(data["answers"][i]["score"])
output_text = data["answers"][i]["body"]
elif int(data["answers"][i]["score"]) > best_score:
best_score = int(data["answers"][i]["score"])
output_text = data["answers"][i]["body"]
examples += "A: " + output_text + "\n\n"
if n == (num_shot - 1):
examples += "Provide an answer (A) to the following math question (Q) in a similar manner to the previous example(s) given.\n\nQ: "
# 26th line
n += 1
elif n >= num_shot:
break
else:
continue
print("Generated examples for", str(num_shot), "shot.")
    # Collect the test cases.
    mse_dataset = []
    # Rows explicitly excluded from the test cases.
    skip_rows = {9, 24, 26, 27, 33, 36, 37, 54, 55, 66, 69, 76, 77, 80, 81, 84, 87, 97, 106, 115, 138}
    with jsonlines.open("mse_text_img_QA_ds_test.jsonl", mode="r") as reader:
        count = 0
        curr_row = 0
        for row in reader.iter(type=dict, skip_invalid=True):
            curr_row += 1
            if curr_row in skip_rows:
                print("Skipped row " + str(curr_row))
                continue
            elif curr_row in sorted_rows:
                print("Skipped row " + str(curr_row) + " because it is a shorter example")
                continue
            # question_path = "output/" + row["id"]
            # if count <= 0:
            #     print(obj)
            if count >= test_case_num:
                break
            else:
                input_text = row["body"]
                # Pick the best answer (prefer accepted, otherwise highest score)
                # as the expected output; keep the runner-up as retrieval context.
                is_accepted = False
                best_score = float("-inf")
                output_text = ""
                next_best_answer = ""
                for answer in row["answers"]:
                    if bool(answer["accepted"]):
                        if not is_accepted:
                            is_accepted = True
                            next_best_answer = output_text
                            best_score = int(answer["score"])
                            output_text = answer["body"]
                        elif int(answer["score"]) > best_score:
                            next_best_answer = output_text
                            best_score = int(answer["score"])
                            output_text = answer["body"]
                    elif not is_accepted and int(answer["score"]) > best_score:
                        next_best_answer = output_text
                        best_score = int(answer["score"])
                        output_text = answer["body"]
                # Fall back to the question title if no runner-up answer exists.
                if next_best_answer == "" or next_best_answer is None:
                    next_best_answer = row["title"]
                # Build the prompt (prefixed with the n-shot examples when requested)
                # and generate the model's answer through Ollama.
                if num_shot == 0:
                    i_text = json.dumps(input_text)
                else:
                    i_text = json.dumps(examples + input_text)
                e_output = json.dumps(output_text)
                r_context = json.dumps(next_best_answer)
                gen_answer = ollama.generate(model="Hudson/llemma:7b", prompt=i_text)
                a_output = json.dumps(gen_answer.response)
                # print("i_text = ", i_text)
                # print("a_output = ", a_output)
                # print("e_output = ", e_output)
                # print("r_context = ", r_context)
                mse_dataset.append(LLMTestCase(
                    input=i_text,
                    actual_output=a_output,
                    expected_output=e_output,
                    retrieval_context=[r_context],
                ))
                count = count + 1
                print("At", str(count), "out of", str(test_case_num), "- current row =", str(curr_row))
    # Save the collected test cases as a DeepEval dataset.
    dataset = EvaluationDataset(test_cases=mse_dataset)
    dataset.save_as(
        file_type="json",
        directory="./deepeval-test-dataset",
        file_name=dataset_name,
        include_test_cases=True,
    )
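    # A possible follow-up step (sketch only, not run here): score the saved test
    # cases with the DeepEval metrics imported above, using an Ollama-served model
    # as the judge. The judge model name below is an assumption.
    #
    # judge = OllamaModel(model="Hudson/llemma:7b")
    # evaluate(
    #     test_cases=dataset.test_cases,
    #     metrics=[
    #         AnswerRelevancyMetric(model=judge),
    #         FaithfulnessMetric(model=judge),
    #         ContextualRelevancyMetric(model=judge),
    #         ContextualRecallMetric(model=judge),
    #         ContextualPrecisionMetric(model=judge),
    #     ],
    # )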