import math

import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from tqdm import tqdm

log_txt = open("log.txt", "a")


def calculate(human_scores, model_scores):
    """
    Compute the agreement between the model's evaluation results and the human
    annotations for the 7 different answers to the same question: pairwise
    accuracy, |Kendall's tau|, and |Spearman's rho|.
    """
    acc = 0
    total = 0
    score = 0
    # Pairwise accuracy: over every pair of answers that the humans score
    # differently, award 1 point when the model orders the pair the same way
    # and 0.5 points when the model gives the pair equal scores.
    for i in range(len(human_scores)):
        for j in range(i + 1, len(human_scores)):
            A_human = human_scores[i]
            B_human = human_scores[j]
            if A_human != B_human:
                total += 1
                A_model = model_scores[i]
                B_model = model_scores[j]
                if A_model == B_model:
                    score += 0.5
                elif A_human > B_human and A_model > B_model:
                    score += 1
                elif A_human < B_human and A_model < B_model:
                    score += 1
    if total != 0:
        acc = score / total
    else:
        acc = 1
    # Rank correlations; both return NaN when either score list is constant,
    # which is treated here as zero correlation.
    x = np.array(human_scores, dtype=float)
    y = np.array(model_scores, dtype=float)
    kendall, kendall_p_value = kendalltau(x, y)
    if math.isnan(kendall):
        kendall = 0
    spearman, spearman_p_value = spearmanr(x, y)
    if math.isnan(spearman):
        spearman = 0
    return acc, abs(kendall), abs(spearman)
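
# A minimal worked example with hypothetical scores (not taken from any
# annotation file): human scores [3, 1, 2] and model scores [2.5, 0.5, 1.0]
# order every pair identically, so calculate([3, 1, 2], [2.5, 0.5, 1.0])
# returns (1.0, 1.0, 1.0), i.e. pairwise accuracy, |Kendall|, and |Spearman|
# are all 1.0.
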
def eval(path):
    """
    Compute the metric scores for the model results stored at the given path.
    Each line of the result file holds: taskId questionId answerId score rank.
    """
    # Read the result file and sort its records by taskId, questionId, answerId.
    with open(path, "r") as file:
        lines = file.readlines()
    data = [line.strip().split() for line in lines]
    sorted_data = sorted(data, key=lambda x: (int(x[0]), int(x[1]), int(x[2])))
    # Read the human annotation files, one per task.
    annotations = ["output_dialog.csv", "output_story.csv", "output_Xsum.csv", "output_NFCATS.csv"]
    for annotation in annotations:
        df = pd.read_csv(f"human_annotation/{annotation}")
        row_labels = df.index
        test_total_num = 20  # number of test-set questions per task (questionId 20-39)
        average_acc = 0
        average_kendall = 0
        average_spearman = 0
        now_questionId = -1
        answerId = -1
        for row in tqdm(row_labels):
            taskId = df.loc[row, "taskId"]
            questionId = df.loc[row, "questionId"]
            # Only questionId 20-39 belong to the test set.
            if int(questionId) < 20 or int(questionId) > 39:
                continue
            human_score = df.loc[row, "score"]
            answerId = (answerId + 1) % 7  # 7 answers per question
            # Each task spans 140 consecutive records (20 questions x 7 answers).
            model_score = float(sorted_data[taskId * 140 + (questionId - 20) * 7 + answerId][3])
            if questionId == now_questionId:
                human_scores.append(human_score)
                model_scores.append(model_score)
            else:
                # A new question starts: score the previous one before resetting.
                if now_questionId != -1:
                    acc, kendall, spearman = calculate(human_scores, model_scores)
                    log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
                    average_acc += acc
                    average_kendall += kendall
                    average_spearman += spearman
                human_scores = [human_score]
                model_scores = [model_score]
                now_questionId = questionId
        # Score the last question of the annotation file.
        acc, kendall, spearman = calculate(human_scores, model_scores)
        log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
        average_acc += acc
        average_kendall += kendall
        average_spearman += spearman
        log_txt.write(f"On task {taskId}, average acc is {average_acc/test_total_num}, average kendall is {average_kendall/test_total_num}, average spearman is {average_spearman/test_total_num}\n")
if __name__ == "__main__":
    eval("output/baseline1_chatglm3_6B.txt")
    log_txt.close()  # flush the metric log before exiting