import math

import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from tqdm import tqdm

# Append-mode log that collects every metric line written during evaluation.
log_txt = open("log.txt", "a")


def calculate(human_scores, model_scores):
    """
    Compute the agreement between the model scores and the human annotations
    for the 7 answers to a single question: pairwise ranking accuracy,
    |Kendall's tau| and |Spearman's rho|. See the illustrative example after
    this function.
    """
    # Pairwise ranking accuracy over the pairs whose human scores differ:
    # 1 point if the model orders the pair the same way, 0.5 if the model ties it.
    acc = 0
    total = 0
    score = 0
    for i in range(len(human_scores)):
        for j in range(i + 1, len(human_scores)):
            A_human = human_scores[i]
            B_human = human_scores[j]
            if A_human != B_human:
                total += 1
                A_model = model_scores[i]
                B_model = model_scores[j]
                if A_model == B_model:
                    score += 0.5
                elif A_human > B_human and A_model > B_model:
                    score += 1
                elif A_human < B_human and A_model < B_model:
                    score += 1
    # If the human scores are all tied there is no informative pair; count that as perfect agreement.
    acc = score / total if total != 0 else 1

    # Rank correlations are undefined (NaN) when either score list is constant; fall back to 0.
    x = np.array(human_scores)
    y = np.array(model_scores)
    kendall, _ = kendalltau(x, y)
    if math.isnan(kendall):
        kendall = 0
    spearman, _ = spearmanr(x, y)
    if math.isnan(spearman):
        spearman = 0

    return acc, abs(kendall), abs(spearman)
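
# Illustrative example (not part of the evaluation pipeline): if the model's
# ranking of the seven answers is the exact reverse of the human ranking, every
# informative pair is ordered wrongly, so the pairwise accuracy is 0 while the
# absolute rank correlations are still 1:
#
#     calculate([1, 2, 3, 4, 5, 6, 7], [7, 6, 5, 4, 3, 2, 1])
#     # -> accuracy 0.0, |kendall| 1.0, |spearman| 1.0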


def eval(path):
    """
    Compute the agreement metrics for the model scores stored at `path` and
    write them to log.txt.
    """
    with open(path, 'r') as file:
        lines = file.readlines()
    # Each line holds whitespace-separated fields: the first three are integer
    # ids used for sorting and the fourth is the model's score.
    data = [line.strip().split() for line in lines]
    sorted_data = sorted(data, key=lambda x: (int(x[0]), int(x[1]), int(x[2])))
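
    # Layout assumption (inferred from the indexing below, not checked here):
    # after sorting, each task occupies 140 consecutive rows of sorted_data,
    # i.e. 20 evaluated questions (ids 20-39) x 7 answers per question.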

    annotations = ["output_dialog.csv", "output_story.csv", "output_Xsum.csv", "output_NFCATS.csv"]
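    # Each human-annotation CSV is expected to provide "taskId", "questionId"
    # and "score" columns, with the 7 answers to a question appearing on
    # consecutive rows (the answer index below simply cycles 0-6).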
    for annotation in annotations:
        df = pd.read_csv(f"human_annotation/{annotation}")
        row_labels = df.index
        test_total_num = 20  # 20 evaluated questions per task (question ids 20-39)
        average_acc = 0
        average_kendall = 0
        average_spearman = 0
        now_questionId = -1
        answerId = -1
        human_scores = []
        model_scores = []
        for row in tqdm(row_labels):
            taskId = df.loc[row, "taskId"]
            questionId = df.loc[row, "questionId"]
            # Only question ids 20-39 are evaluated.
            if int(questionId) < 20 or int(questionId) > 39:
                continue
            human_score = df.loc[row, "score"]
            answerId = (answerId + 1) % 7
            # The model score is read from text, so cast it before comparing.
            model_score = float(sorted_data[taskId * 140 + (questionId - 20) * 7 + answerId][3])
            if questionId == now_questionId:
                # Still collecting the 7 answers of the current question.
                human_scores.append(human_score)
                model_scores.append(model_score)
            else:
                # A new question starts: score the previous one, then reset the buffers.
                if now_questionId != -1:
                    acc, kendall, spearman = calculate(human_scores, model_scores)
                    log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
                    average_acc += acc
                    average_kendall += kendall
                    average_spearman += spearman
                human_scores = [human_score]
                model_scores = [model_score]
                now_questionId = questionId

        # Score the last question of this annotation file.
        if now_questionId != -1:
            acc, kendall, spearman = calculate(human_scores, model_scores)
            log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
            average_acc += acc
            average_kendall += kendall
            average_spearman += spearman
        # taskId is the last task id seen in this annotation file.
        log_txt.write(f"On task {taskId}, average acc is {average_acc/test_total_num}, average kendall is {average_kendall/test_total_num}, average spearman is {average_spearman/test_total_num}\n")


if __name__ == "__main__":
    eval("output/baseline1_chatglm3_6B.txt")
    log_txt.close()  # flush the evaluation log