import argparse
import json

import jsonlines
import ollama
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import (
    ContextualRelevancyMetric,
    ContextualRecallMetric,
    ContextualPrecisionMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric,
)
from deepeval.models import OllamaModel
from deepeval.test_case import LLMTestCase
from transformers import AutoModelForCausalLM, AutoTokenizer

from Llemma_Finetuned import Llemma_Finetuned


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--num", help="Number of test cases to use")
    parser.add_argument("-s", "--shot", help="n-shot inference examples")
    parser.add_argument("-d", "--dataset", help="Path to test case dataset")

    args = parser.parse_args()
    test_case_num = int(args.num)
    num_shot = int(args.shot)
    dataset_name = str(args.dataset)
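
    # Example invocation (the script name here is a placeholder, flag values illustrative):
    #   python build_mse_deepeval_dataset.py -n 50 -s 2 -d mse_llemma_testcases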

    # Row numbers (one integer per line) of the questions used as n-shot examples.
    sorted_rows = []
    with open('dataset_row_stl.txt', 'r') as file:
        sorted_rows = file.readlines()

    sorted_rows = sorted_rows[0:num_shot]
    sorted_rows = [int(x) for x in sorted_rows]

    print("Read in sorted rows.")

    # Build the n-shot prompt prefix: for each selected row, append its question
    # and its best answer (the accepted one if present, otherwise the highest-scored).
    examples = "Here are " + str(num_shot) + " examples of math questions (Q) with given answers (A).\n"
    with jsonlines.open("mse_text_img_QA_ds_test.jsonl", mode='r') as fp:

        n = 0
        for j, data in enumerate(fp):
            if j + 1 in sorted_rows:
                print("Num shot row " + str(j + 1))

                examples += "Q: " + data["body"] + "\n\n"

                is_accepted = False
                best_score = float('-inf')
                output_text = ""
                for i in range(len(data["answers"])):
                    if bool(data["answers"][i]["accepted"]):
                        if not is_accepted:
                            is_accepted = True
                            best_score = int(data["answers"][i]["score"])
                            output_text = data["answers"][i]["body"]
                        elif int(data["answers"][i]["score"]) > best_score:
                            best_score = int(data["answers"][i]["score"])
                            output_text = data["answers"][i]["body"]
                    # Unaccepted answers only compete when no accepted answer has been seen.
                    elif not is_accepted and int(data["answers"][i]["score"]) > best_score:
                        best_score = int(data["answers"][i]["score"])
                        output_text = data["answers"][i]["body"]

                examples += "A: " + output_text + "\n\n"

                # After the last example, append the instruction that introduces the test question.
                if n == (num_shot - 1):
                    examples += "Provide an answer (A) to the following math question (Q) in a similar manner to the previous example(s) given.\n\nQ: "

                n += 1
            elif n >= num_shot:
                break
            else:
                continue

    print("Generated examples for", str(num_shot), "shot.")

    # Build DeepEval test cases from the remaining rows of the test split.
    mse_dataset = []
    with jsonlines.open("mse_text_img_QA_ds_test.jsonl", mode='r') as reader:

        # Rows excluded from evaluation.
        skip_rows = {9, 24, 26, 27, 33, 36, 37, 54, 55, 66, 69, 76, 77,
                     80, 81, 84, 87, 97, 106, 115, 138}

        count = 0
        curr_row = 0
        for row in reader.iter(type=dict, skip_invalid=True):
            curr_row += 1
            if curr_row in skip_rows:
                print("Skipped row " + str(curr_row))
                continue
            elif curr_row in sorted_rows:
                print("Skipped row " + str(curr_row) + " because it is a shorter example")
                continue

            if count >= test_case_num:
                break
            else:
                input_text = row["body"]

                # Choose the best answer (accepted if present, otherwise highest-scored)
                # as the expected output, and keep the runner-up as retrieval context.
                is_accepted = False
                best_score = float('-inf')
                output_text = ""
                next_best_answer = ""
                for i in range(len(row["answers"])):
                    if bool(row["answers"][i]["accepted"]):
                        if not is_accepted:
                            is_accepted = True
                            next_best_answer = output_text
                            best_score = int(row["answers"][i]["score"])
                            output_text = row["answers"][i]["body"]
                        elif int(row["answers"][i]["score"]) > best_score:
                            next_best_answer = output_text
                            best_score = int(row["answers"][i]["score"])
                            output_text = row["answers"][i]["body"]
                    # Unaccepted answers only compete when no accepted answer has been seen.
                    elif not is_accepted and int(row["answers"][i]["score"]) > best_score:
                        next_best_answer = output_text
                        best_score = int(row["answers"][i]["score"])
                        output_text = row["answers"][i]["body"]

                # Fall back to the question title when there is no runner-up answer.
                if next_best_answer == "" or next_best_answer is None:
                    next_best_answer = row["title"]

                # Prepend the n-shot examples to the prompt when num_shot > 0.
                if num_shot == 0:
                    i_text = json.dumps(input_text)
                else:
                    i_text = json.dumps(examples + input_text)
                e_output = json.dumps(output_text)
                r_context = json.dumps(next_best_answer)

                # Generate the model's answer with the Llemma model served by Ollama.
                gen_answer = ollama.generate(model="Hudson/llemma:7b", prompt=i_text)
                a_output = json.dumps(gen_answer.response)

                mse_dataset.append(LLMTestCase(input=i_text,
                                               actual_output=a_output,
                                               expected_output=e_output,
                                               retrieval_context=[r_context]))

                count = count + 1
                print("At", str(count), "out of", str(test_case_num), " current row =", str(curr_row))

    # Persist the assembled test cases as a DeepEval dataset on disk.
    dataset = EvaluationDataset(test_cases=mse_dataset)
    dataset.save_as(file_type="json", directory="./deepeval-test-dataset",
                    file_name=dataset_name, include_test_cases=True)
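
    # A possible follow-up step, left as a commented sketch rather than executed here:
    # score the saved test cases with the DeepEval metrics imported above, using an
    # Ollama-served judge model. The judge model name ("llama3") is an assumption,
    # not something this script configures.
    #
    #     judge = OllamaModel(model="llama3")
    #     metrics = [
    #         AnswerRelevancyMetric(model=judge),
    #         FaithfulnessMetric(model=judge),
    #         ContextualRelevancyMetric(model=judge),
    #         ContextualRecallMetric(model=judge),
    #         ContextualPrecisionMetric(model=judge),
    #     ]
    #     evaluate(test_cases=dataset.test_cases, metrics=metrics)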