import math

import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from tqdm import tqdm

log_txt = open("log.txt", "a")


def calculate(human_scores, model_scores):
    """
    Calculate the metrics (pairwise accuracy, Kendall's tau, Spearman's rho)
    based on the model evaluation results and the human annotation results
    for the 7 different answers to the same question.
    """
    acc = 0
    total = 0
    score = 0
    # Pairwise accuracy: for every pair of answers that humans score differently,
    # check whether the model orders them the same way (model ties count as 0.5).
    for i in range(0, len(human_scores)):
        for j in range(i + 1, len(human_scores)):
            A_human = human_scores[i]
            B_human = human_scores[j]
            if A_human != B_human:
                total += 1
                A_model = model_scores[i]
                B_model = model_scores[j]
                if A_model == B_model:
                    score += 0.5
                elif A_human > B_human and A_model > B_model:
                    score += 1
                elif A_human < B_human and A_model < B_model:
                    score += 1
    if total != 0:
        acc = score / total
    else:
        acc = 1

    x = np.array(human_scores)
    y = np.array(model_scores)
    kendall, _kendall_p_value = kendalltau(x, y)
    if math.isnan(kendall):
        kendall = 0
    spearman, _spearman_p_value = spearmanr(x, y)
    if math.isnan(spearman):
        spearman = 0
    return acc, abs(kendall), abs(spearman)


def eval(path):
    """
    Obtain the metric scores of the results from the specified path.
    """
    # Each line of the result file: taskId questionId answerId score rank
    with open(path, "r") as file:
        lines = file.readlines()
        data = [line.strip().split() for line in lines]
        sorted_data = sorted(data, key=lambda x: (int(x[0]), int(x[1]), int(x[2])))

    # Read the human annotation files, one per task.
    annotations = ["output_dialog.csv", "output_story.csv", "output_Xsum.csv", "output_NFCATS.csv"]
    for annotation in annotations:
        df = pd.read_csv(f"human_annotation/{annotation}")
        row_labels = df.index
        test_total_num = 20  # number of test-set questions per task (questionId 20-39)
        average_acc = 0
        average_kendall = 0
        average_spearman = 0
        now_questionId = -1
        answerId = -1
        human_scores = []  # reset per annotation file
        model_scores = []
        for row in tqdm(row_labels):
            taskId = int(df.loc[row, "taskId"])
            questionId = int(df.loc[row, "questionId"])
            # Only questionIds 20-39 belong to the test set.
            if questionId < 20 or questionId > 39:
                continue
            human_score = df.loc[row, "score"]
            answerId = (answerId + 1) % 7  # 7 answers per question
            # Look up the model score for this (task, question, answer) triple;
            # convert to float so scores are compared numerically, not as strings.
            model_score = float(sorted_data[taskId * 140 + (questionId - 20) * 7 + answerId][3])
            if questionId == now_questionId:
                human_scores.append(human_score)
                model_scores.append(model_score)
            else:
                if now_questionId != -1:
                    # A new question starts: score the previous one.
                    acc, kendall, spearman = calculate(human_scores, model_scores)
                    log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
                    average_acc += acc
                    average_kendall += kendall
                    average_spearman += spearman
                human_scores = [human_score]
                model_scores = [model_score]
                now_questionId = questionId

        # Score the last question of this annotation file, if any test rows were seen.
        if now_questionId != -1:
            acc, kendall, spearman = calculate(human_scores, model_scores)
            log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
            average_acc += acc
            average_kendall += kendall
            average_spearman += spearman
            log_txt.write(
                f"On task{taskId}, average acc is {average_acc / test_total_num}, "
                f"average kendall is {average_kendall / test_total_num}, "
                f"average spearman is {average_spearman / test_total_num}\n"
            )


if __name__ == "__main__":
    eval("output/baseline1_chatglm3_6B.txt")
    log_txt.close()