from compliance_checks.base import ComplianceResult, ComplianceCheck, walk_to_next_heading
from bs4 import BeautifulSoup


class EvaluationResult(ComplianceResult):
    """Result of checking a model card for an evaluation/metrics section."""

    name = "Evaluation and Metrics"

    def __eq__(self, other):
        # Equal only when `other` is also an EvaluationResult and the
        # base-class comparison agrees; everything else compares unequal.
        if isinstance(other, EvaluationResult):
            return super().__eq__(other)
        return False

    def to_string(self):
        if self.status:
            return """\
            It looks like this model card has some documentation for how the model was evaluated! We look for this by \
            searching for headings that say things like:
            - Evaluation
            - Evaluation results
            - Benchmarks
            - Results
            """
        else:
            return """\
            We weren't able to find a section in this model card that reports the evaluation process, but it's easy to \
            add one! You can add the following section to the model card and, once you fill in the \
            `[More Information Needed]` sections, the "Evaluation and Metrics" check should pass 🤗
            
            ```md
            ## Evaluation
            
            <!-- This section describes the evaluation protocols and provides the results. -->
            
            ### Testing Data, Factors & Metrics
            
            #### Testing Data
            
            <!-- This should link to a Data Card if possible. -->
            
            [More Information Needed]
            
            #### Factors
            
            <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
            
            [More Information Needed]
            
            #### Metrics
            
            <!-- These are the evaluation metrics being used, ideally with a description of why. -->
            
            [More Information Needed]
            
            ### Results
            
            [More Information Needed]
            
            #### Summary
            
            [More Information Needed]
            ```
            """


class EvaluationCheck(ComplianceCheck):
    """Checks whether a model card documents how the model was evaluated."""

    name = "Evaluation and Metrics"

    def run_check(self, card: BeautifulSoup):
        # (heading tag, heading text) pairs that indicate an evaluation section.
        combos = [
            ("h1", "Evaluation"), ("h2", "Evaluation"),
            ("h2", "Evaluation results"), ("h2", "Evaluation Results"),
            ("h2", "Benchmarks"),
            ("h2", "Results"),
            ("h1", "Evaluation data"),
            ("h2", "Performance"),
        ]

        for hX, heading in combos:
            if walk_to_next_heading(card, hX, heading):
                return EvaluationResult(status=True)

        return EvaluationResult()
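

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; assumes `ComplianceCheck` can
    # be instantiated without arguments and that `walk_to_next_heading`
    # returns a truthy value when a matching heading with real content below
    # it is found).
    sample_html = "<h2>Evaluation</h2><p>Accuracy: 0.92 on the test split.</p>"
    card = BeautifulSoup(sample_html, "html.parser")
    result = EvaluationCheck().run_check(card)
    print(result.status)       # True when an evaluation heading was found
    print(result.to_string())  # Human-readable summary of the check outcome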