import pandas as pd
from tqdm import tqdm
import math

import numpy as np
from scipy.stats import kendalltau, spearmanr

# Per-question metrics and per-task averages are appended to log.txt.
log_txt = open("log.txt", "a")

def calculate(human_scores, model_scores):
    """
    Calculate the metrics based on the model evaluation results and human annotation results for the 7 different answers to the same question.
    """
    acc = 0
    total = 0
    score = 0
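    # Pairwise ranking accuracy: for every pair of answers the humans scored
    # differently, award 1 point if the model orders the pair the same way
    # and 0.5 if the model scores them as a tie.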
    for i in range(len(human_scores)):
        for j in range(i + 1, len(human_scores)):
            A_human = human_scores[i]
            B_human = human_scores[j]
            if A_human != B_human:
                total += 1
                A_model = model_scores[i]
                B_model = model_scores[j]
                if A_model == B_model:
                    # The model ties a pair the humans ranked: half credit.
                    score += 0.5
                elif (A_human > B_human) == (A_model > B_model):
                    # The model orders the pair the same way as the humans.
                    score += 1
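    # If the humans gave every pair the same score there is nothing to rank;
    # treat this as full agreement.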
    if total != 0: 
        acc = score / total
    else:
        acc = 1
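    # Rank correlations between human and model scores; NaN results
    # (e.g. constant inputs) are treated as zero correlation.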
    x = np.array(human_scores)
    y = np.array(model_scores)
    kendall, kendall_p_value = kendalltau(x, y)
    if math.isnan(kendall):
        kendall = 0
    spearman, spearman_p_value = spearmanr(x, y)
    if math.isnan(spearman):
        spearman = 0
    
    return acc, abs(kendall), abs(spearman)

def eval(path):
    """
    Obtain the metric scores of the results from the specified path.
    """
    # Read the model result file; each whitespace-separated line is:
    # taskId questionId answerId score rank
    with open(path, 'r') as file:
        lines = file.readlines()
    data = [line.strip().split() for line in lines]
    sorted_data = sorted(data, key=lambda x: (int(x[0]), int(x[1]), int(x[2])))
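    # Rows are now ordered by (taskId, questionId, answerId), so a result row
    # can be located by offset arithmetic below.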

    # Read the human annotation file
    annotations = ["output_dialog.csv", "output_story.csv", "output_Xsum.csv", "output_NFCATS.csv"]
    human_scores = []
    model_scores = []
    for annotation in annotations:
        df = pd.read_csv(f"human_annotation/{annotation}")
        row_labels = df.index
        test_total_num = 20  # questionId 20-39: 20 test questions per task
        average_acc = 0
        average_kendall = 0
        average_spearman = 0
        now_questionId = -1
        answerId = -1
        for row in tqdm(row_labels):
            taskId = df.loc[row, "taskId"]
            questionId = df.loc[row, "questionId"]
            if int(questionId) < 20 or int(questionId) > 39:
                continue
            human_score = df.loc[row, "score"]
            answerId = (answerId + 1) % 7
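            # Each task has 20 test questions (ids 20-39) with 7 answers each,
            # i.e. 140 result rows per task in sorted_data.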
            model_score = float(sorted_data[taskId * 140 + (questionId - 20) * 7 + answerId][3])
            if questionId == now_questionId:
                human_scores.append(human_score)
                model_scores.append(model_score)
            else:
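                # A new question starts: score the answers collected for the
                # previous question (if any), then start a new group.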
                if now_questionId != -1:
                    acc, kendall, spearman = calculate(human_scores, model_scores)
                    log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
                    average_acc += acc
                    average_kendall += kendall
                    average_spearman += spearman
                human_scores = [human_score]
                model_scores = [model_score]
                now_questionId = questionId

        # Score the final question of this annotation file, then log the
        # per-task averages over the 20 test questions.
        acc, kendall, spearman = calculate(human_scores, model_scores)
        log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
        average_acc += acc
        average_kendall += kendall
        average_spearman += spearman
        log_txt.write(f"On task{taskId}, average acc is {average_acc/test_total_num}, average kendall is {average_kendall/test_total_num}, average spearman is {average_spearman/test_total_num}\n")


if __name__ == "__main__":
    eval("output/baseline1_chatglm3_6B.txt")
    log_txt.close()