from typing import Sequence

import numpy
from sklearn.metrics import roc_curve, auc

import datasets
import evaluate

_DESCRIPTION = """
MC-AUROC (Multi-class Area Under the Receiver Operating Characteristic Curve) is a performance metric used in multiclass classification tasks.
 It evaluates the ability of a model to distinguish between positive and negative classes across different threshold values. 
 The curve is generated by plotting the true positive rate (sensitivity) against the false positive rate (1-specificity) at various threshold settings. 
 AUROC provides a single scalar value indicating the overall discriminatory power of the model, with higher values suggesting better performance. 
"""

_KWARGS_DESCRIPTION = """
AUROC metric for binary classification predictions. Here we use one-vs-all strategy to calculate the AUROC for multi-class classification problems. 
The multi-class AUROC is calculated by treating each class as the positive class and the rest as the negative class. 
The final score is the average of the AUROC scores for each class. 

Args:
probabilities: list-like. Predicted probabilities or decision scores for the each class.
true_labels: list-like. True labels indicating the actual class memberships (must be ordinal, starting from 0).
Returns:
auroc_score: float. Multi-class Area Under the Receiver Operating Characteristic Curve (MC-AUROC) score.
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class AVG_MULTICLASS_AUROC(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            inputs_description=_KWARGS_DESCRIPTION,
            citation="",
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Sequence(datasets.Value("float")),
                        "references": datasets.Value("int8")
                    }
                ),
            ],
            reference_urls=[
                "https://en.wikipedia.org/wiki/Receiver_operating_characteristic"
            ],
        )

    def _evaluate_statistics(self, variates, coverage):
        """Evaluates the lower and upper margins for a given Monte Carlo distribution.

        Parameters
        ----------
        variates : numpy.ndarray
            A 1-D array containing the simulated variates.

        coverage : float
            A number between 0 and 1 indicating the desired coverage. Typically,
            this number is set to 0.95 (95% coverage).

        Returns
        -------
        stats : (float, float)
            Lower and upper bounds of the equal-tailed credible interval for the
            input simulation.
        """

        left_half = (1 - coverage) / 2  # mass excluded from each tail
        sorted_variates = numpy.sort(variates)

        # n.b.: we return the equal-tailed range

        # position of the score that excludes the lower tail (left_half of the mass)
        lower_index = int(round(len(variates) * left_half))

        # position of the score that excludes the upper tail (left_half of the mass)
        upper_index = int(round(len(variates) * (1 - left_half)))

        lower = sorted_variates[max(lower_index - 1, 0)]
        upper = sorted_variates[upper_index - 1]

        return lower, upper

    def _compute(self, predictions: Sequence[Sequence[float]], references: Sequence[int], CI=False):
        """
        Computes the average one-vs-rest AUROC score for multi-class classification problems.
        If CI is True, also estimates a 95% credible interval via Monte Carlo simulation.
        """
        probabilities = predictions

        n_classes = len(probabilities[0])
        fpr = dict()
        tpr = dict()
        thresholds = dict()
        roc_auc = dict()
        roc_auc_ci_low = dict()
        roc_auc_ci_high = dict()
        for i in range(n_classes):
            # one-vs-rest ROC curve: class i is the positive class, all others are negative
            fpr[i], tpr[i], thresholds[i] = roc_curve(y_true=[1 if x == i else 0 for x in references],
                                                      y_score=[prob[i] for prob in probabilities])

            if CI:
                confusion_matrices = self._get_CMs(i, probabilities, references, thresholds)

                low_ci_tpr, high_ci_tpr = [0] * len(thresholds[i]), [0] * len(thresholds[i])
                lam = 1.0  # Beta(1, 1) prior pseudo-counts for the per-threshold TPR posterior
                for k in range(len(thresholds[i])):
                    variates = numpy.random.beta(confusion_matrices[k]["TP"] + lam, confusion_matrices[k]["FN"] + lam, 1000000)
                    low_ci_tpr[k], high_ci_tpr[k] = self._evaluate_statistics(variates, 0.95)

                roc_auc_ci_low[i] = auc(fpr[i], low_ci_tpr)
                roc_auc_ci_high[i] = auc(fpr[i], high_ci_tpr)

            roc_auc[i] = auc(fpr[i], tpr[i])

            # if AUC is NaN (e.g. a class with no positive examples), set it to 0
            if numpy.isnan(roc_auc[i]):
                roc_auc[i] = 0

        # Compute the average AUC over all classes
        average_auc = numpy.mean(list(roc_auc.values()))
        if CI:
            average_auc_ci_low = numpy.mean(list(roc_auc_ci_low.values()))
            average_auc_ci_high = numpy.mean(list(roc_auc_ci_high.values()))

        return {
            "mc_auroc_score": average_auc,
            "mc_auroc_ci": (average_auc_ci_low, average_auc_ci_high) if CI else None
        }

    def _get_CMs(self, i, probabilities, references, thresholds):
        """Builds one confusion matrix per ROC threshold, treating class i as positive (one-vs-rest)."""
        confusion_matrices = []
        for threshold in thresholds[i]:
            TP = 0
            FP = 0
            TN = 0
            FN = 0
            for j in range(len(probabilities)):
                if probabilities[j][i] >= threshold:
                    if references[j] == i:
                        TP += 1
                    else:
                        FP += 1
                else:
                    if references[j] == i:
                        FN += 1
                    else:
                        TN += 1
            cm = {"TP": TP, "FP": FP, "TN": TN, "FN": FN, "threshold": threshold, "class": i}
            confusion_matrices.append(cm)

        return confusion_matrices
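
# Minimal local smoke test (an illustrative sketch, not part of the metric API).
# It assumes the module can be instantiated directly and that extra keyword
# arguments such as CI are forwarded to _compute, as in the standard `evaluate`
# module template.
if __name__ == "__main__":
    metric = AVG_MULTICLASS_AUROC()
    result = metric.compute(
        predictions=[[0.9, 0.05, 0.05], [0.2, 0.7, 0.1], [0.1, 0.2, 0.7], [0.4, 0.4, 0.2]],
        references=[0, 1, 2, 0],
        CI=True,
    )
    print(result["mc_auroc_score"])  # average one-vs-rest AUROC
    print(result["mc_auroc_ci"])     # (lower, upper) 95% credible interval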