File size: 8,577 Bytes
a8af1a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
{
  "metadata": {
    "Name": "Model A",
    "Provider": "TechCorp",
    "Version": "2.1",
    "Release Date": "2023-09-15",
    "Type": "Large Language Model",
    "Modalities": ["Text-to-Text"]
  },
  "scores": {
    "Bias, Stereotypes, and Representational Harms Evaluation": {
      "Comprehensive Evaluation Methodology": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Evaluations at various stages (data collection, preprocessing, model architecture, training, deployment)",
          "Both intrinsic (e.g., embedding analysis) and extrinsic (e.g., downstream task performance) evaluation methods"
        ]
      },
      "Inclusive Protected Class Consideration": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Evaluation of non-standard protected classes (e.g., socioeconomic status, education level, regional differences)",
          "Consideration of intersectionality and how identity aspects interact",
          "Assessment of potential harms to non-typical groups (e.g., by profession or hobbies)"
        ]
      },
      "Cultural and Linguistic Diversity": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Tests of model performance and biases across languages and cultures",
          "Consideration of how protected categories may shift in meaning across regions"
        ]
      },
      "Stereotype and Harmful Association Detection": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Detection of stereotypical word associations in text models",
          "Sentiment analysis and toxicity measurements, especially regarding specific groups"
        ]
      },
      "Performance Disparities Assessment": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Detailed breakdowns of performance metrics (accuracy, precision, recall) for various subgroups",
          "Performance analysis for disadvantaged subgroups",
          "Intersectionality considerations in performance analysis"
        ]
      }
    },
    "Cultural Values and Sensitive Content Evaluation": {
      "Hate Speech and Toxicity Evaluation": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Assessments of harmful text generation",
          "Evaluations of toxicity, hurtfulness, or offensiveness"
        ]
      },
      "Cultural Value Representation": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Use of pre-existing scholarship (e.g., World Values Survey, Geert Hofstede's work)",
          "Inductive and participatory evaluations grounded in specific cultural contexts",
          "Assessments of ethical scenarios and political value representation"
        ]
      },
      "Diverse Cultural Context": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Assessments that don't equate nationality with cultural context",
          "Representation of differing cultural values within countries"
        ]
      }
    },
    "Disparate Performance": {
      "Subpopulation Performance Analysis": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Non-aggregated (disaggregated) evaluation results with in-depth breakdowns across subpopulations",
          "Metrics such as subgroup accuracy, calibration, AUC, recall, precision, min-max ratios"
        ]
      },
      "Cross-lingual and Dialect Evaluation": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Cross-lingual prompting on standard benchmarks",
          "Examination of performance across dialects",
          "Analysis of hallucination disparity across languages"
        ]
      },
      "Image Generation Quality Assessment": {
        "status": "N/A",
        "source": null,
        "applicable_evaluations": []
      }
    },
    "Environmental Costs and Carbon Emissions Evaluation": {
      "Energy Consumption Measurement": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Measurement of energy used in training, testing, and deploying the system",
          "Evaluation of compute power consumption"
        ]
      },
      "Carbon Footprint Quantification": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Use of tools like CodeCarbon or Carbontracker",
          "Measurement of carbon emissions for training and inference",
          "Conversion of energy consumption to carbon emissions"
        ]
      },
      "Hardware Resource Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of CPU, GPU, and TPU usage",
          "Measurement of FLOPS (Floating Point Operations)"
        ]
      }
    },
    "Privacy and Data Protection Evaluation": {
      "Data Minimization and Consent Practices": {
        "status": "Yes",
        "source": "Both",
        "applicable_evaluations": [
          "Implementation of data minimization practices",
          "Use of opt-in data collection methods",
          "Assessment of active consent for collecting, processing, and sharing data"
        ]
      },
      "Memorization and Data Leakage Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Examination of the maximum amount of discoverable information given training data",
          "Evaluation of extractable information without training data access"
        ]
      },
      "Personal Information Revelation Assessment": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Direct prompting tests to reveal Personally Identifiable Information (PII)",
          "Use of tools like ProPILE to audit PII revelation likelihood",
          "Evaluation of the system's ability to infer personal attributes"
        ]
      }
    },
    "Financial Costs Evaluation": {
      "Comprehensive Cost Evaluation": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Estimation of infrastructure and hardware costs",
          "Calculation of labor hours from researchers, developers, and crowd workers",
          "Tracking of compute costs using low-cost or standard pricing per instance-hour"
        ]
      },
      "Storage and Training Cost Analysis": {
        "status": "Yes",
        "source": "1P",
        "applicable_evaluations": [
          "Assessment of storage costs for both datasets and resulting models",
          "Consideration of in-house vs. cloud storage options",
          "Evaluation of training costs based on in-house GPUs or per-hour-priced instances"
        ]
      },
      "Hosting and Inference Cost Evaluation": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Evaluation of low-latency serving costs",
          "Assessment of inference costs based on token usage",
          "Consideration of factors such as initial prompt length and requested token response length"
        ]
      }
    },
    "Data and Content Moderation Labor Evaluation": {
      "Crowdwork Standards Compliance": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Assessment of compliance with Criteria for Fairer Microwork",
          "Evaluation against Partnership on AI's Responsible Sourcing of Data Enrichment Services guidelines",
          "Comparison with Oxford Internet Institute's Fairwork Principles"
        ]
      },
      "Crowdworker Demographics and Compensation": {
        "status": "Yes",
        "source": "3P",
        "applicable_evaluations": [
          "Documentation of crowd workers' demographics",
          "Transparency in reporting instructions given to crowdworkers",
          "Assessment of how crowdworkers were evaluated and compensated"
        ]
      },
      "Psychological Support and Content Exposure": {
        "status": "No",
        "source": null,
        "applicable_evaluations": [
          "Documentation of immediate trauma support availability",
          "Assessment of long-term professional psychological support provision",
          "Evaluation of practices for controlling exposure to traumatic material"
        ]
      }
    }
  }
}