Ankit15nov committed on
Commit
8cfdceb
1 Parent(s): 6bb0190

week 3 assignment 2

Files changed (1)
  1. Stanford's_HELM_LM_Evaluation.ipynb +717 -0
Stanford's_HELM_LM_Evaluation.ipynb ADDED
@@ -0,0 +1,717 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "hcPOD7h1s68l"
7
+ },
8
+ "source": [
9
+ "### Stanford's Holistic Evaluation of Language Models (HELM)\n",
10
+ "\n",
11
+ "Based on the work in this [repository](https://github.com/stanford-crfm/helm), we'll be using HELM to evaluate a language model today!"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "markdown",
16
+ "metadata": {
17
+ "id": "1XneqmcEtpRe"
18
+ },
19
+ "source": [
20
+ "As always, let's grab some dependencies! \n",
21
+ "\n",
22
+ "**PLEASE RESTART YOUR ENV AFTER INSTALLING THESE DEPENDENCIES**"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 1,
28
+ "metadata": {
29
+ "id": "vAbfmcNjtsPr"
30
+ },
31
+ "outputs": [],
32
+ "source": [
33
+ "!pip install -q crfm-helm\n",
34
+ "!pip install -q typing_extensions==4.5.0"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 2,
40
+ "metadata": {
41
+ "id": "HgAjBesZyZJx"
42
+ },
43
+ "outputs": [
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Looking in indexes: https://download.pytorch.org/whl/cu118\n",
49
+ "Requirement already satisfied: torch in /home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages (1.12.1)\n",
50
+ "Collecting torch\n",
51
+ " Downloading https://download.pytorch.org/whl/cu118/torch-2.0.1%2Bcu118-cp310-cp310-linux_x86_64.whl (2267.3 MB)\n",
52
+ "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/2.3 GB\u001b[0m \u001b[31m318.5 MB/s\u001b[0m eta \u001b[36m0:00:03\u001b[0m^C\n",
53
+ "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/2.3 GB\u001b[0m \u001b[31m302.0 MB/s\u001b[0m eta \u001b[36m0:00:04\u001b[0m\n",
54
+ "\u001b[?25h\u001b[31mERROR: Operation cancelled by user\u001b[0m\u001b[31m\n",
55
+ "\u001b[0m"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "!pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "markdown",
65
+ "metadata": {
66
+ "id": "dckk0SXyweMm"
67
+ },
68
+ "source": [
69
+ "Now, let's dive into running a simple evaluation!"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 3,
75
+ "metadata": {
76
+ "id": "3Li97dKSttn-"
77
+ },
78
+ "outputs": [],
79
+ "source": [
80
+ "!echo 'entries: [{description: \"mmlu:subject=philosophy,model=huggingface/gpt2\", priority: 1}]' > run_specs.conf"
81
+ ]
82
+ },
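+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick note on the entry we just wrote (a hedged sketch of the format, not the full grammar; the HELM docs list every scenario and argument): each entry in `run_specs.conf` is a `description` of the form `scenario:arg=value,...,model=<organization>/<model>` plus a `priority`. For instance, a conf that evaluates `huggingface/gpt2` on two MMLU subjects could look like:\n",
+ "\n",
+ "```\n",
+ "entries: [\n",
+ "  {description: \"mmlu:subject=anatomy,model=huggingface/gpt2\", priority: 1},\n",
+ "  {description: \"mmlu:subject=philosophy,model=huggingface/gpt2\", priority: 1}\n",
+ "]\n",
+ "```"
+ ]
+ },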
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": null,
86
+ "metadata": {
87
+ "colab": {
88
+ "base_uri": "https://localhost:8080/"
89
+ },
90
+ "id": "BFK0KChJwlI-",
91
+ "outputId": "8385bfce-0e7b-409f-c436-0f0de4b02348"
92
+ },
93
+ "outputs": [
94
+ {
95
+ "name": "stdout",
96
+ "output_type": "stream",
97
+ "text": [
98
+ "main {\n",
99
+ " Read 1 run entries from run_specs.conf\n",
100
+ " 1 entries produced 1 run specs\n",
101
+ " run_specs {\n",
102
+ " RunSpec(name='mmlu:subject=philosophy,method=multiple_choice_joint,model=huggingface_gpt2', scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), adapter_spec=AdapterSpec(method='multiple_choice_joint', global_prefix='', instructions='The following are multiple choice questions (with answers) about philosophy.\\n', input_prefix='Question: ', input_suffix='\\n', reference_prefix='A. ', reference_suffix='\\n', output_prefix='Answer: ', output_suffix='\\n', instance_prefix='\\n', substitutions=[], max_train_instances=5, max_eval_instances=1, num_outputs=5, num_train_trials=1, sample_train=True, model='huggingface/gpt2', temperature=0.0, max_tokens=1, stop_sequences=['\\n'], random=None), metric_specs=[MetricSpec(class_name='helm.benchmark.basic_metrics.BasicMetric', args={'names': ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']})], data_augmenter_spec=DataAugmenterSpec(perturbation_specs=[], should_augment_train_instances=False, should_include_original_train=False, should_skip_unchanged_train=False, should_augment_eval_instances=False, should_include_original_eval=False, should_skip_unchanged_eval=False, seeds_per_instance=1), groups=['mmlu'])\n",
103
+ " } [0.0s]\n",
104
+ " Running in local mode with base path: prod_env\n",
105
+ "Looking in path: prod_env\n",
106
+ " AutoClient: cache_path = prod_env/cache\n",
107
+ " AutoClient: mongo_uri = \n",
108
+ " Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
109
+ " Found 1 account(s).\n",
110
+ " 0%| | 0/1 [00:00<?, ?it/s] Running mmlu:subject=philosophy,method=multiple_choice_joint,model=huggingface_gpt2 {\n",
111
+ " scenario.get_instances {\n",
112
+ " ensure_file_downloaded {\n",
113
+ " Not downloading https://people.eecs.berkeley.edu/~hendrycks/data.tar because benchmark_output/scenarios/mmlu/data already exists\n",
114
+ " } [0.0s]\n",
115
+ " benchmark_output/scenarios/mmlu/data/auxiliary_train/philosophy_auxiliary_train.csv doesn't exist, skipping\n",
116
+ " Reading benchmark_output/scenarios/mmlu/data/dev/philosophy_dev.csv\n",
117
+ " Reading benchmark_output/scenarios/mmlu/data/val/philosophy_val.csv\n",
118
+ " Reading benchmark_output/scenarios/mmlu/data/test/philosophy_test.csv\n",
119
+ " } [0.004s]\n",
120
+ " 350 instances, 5 train instances, 1/345 eval instances\n",
121
+ " DataPreprocessor.preprocess {\n",
122
+ " } [0.0s]\n",
123
+ " MultipleChoiceJointAdapter.adapt {\n",
124
+ " 6 instances, choosing 5/5 train instances, 1 eval instances\n",
125
+ " Adapting with train_trial_index=0 {\n",
126
+ " Sampled 5 examples for trial #0.\n",
127
+ " Parallelizing computation on 1 items over 4 threads {\n",
128
+ " Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
129
+ "\n",
130
+ " Loading huggingface/gpt2 with Hugging Face Transformers {\n",
131
+ " 0%| | 0/1 [00:00<?, ?it/s]\u001b[A } [0.053s]\n",
132
+ "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 18.02it/s]\n",
133
+ " } [0.056s]\n",
134
+ " Sample prompts {\n",
135
+ " reference index = None, request_mode = None {\n",
136
+ " The following are multiple choice questions (with answers) about philosophy.\n",
137
+ " \n",
138
+ " Question: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n",
139
+ " A. metaphysics\n",
140
+ " B. epistemology\n",
141
+ " C. quantum physics\n",
142
+ " D. axiology\n",
143
+ " Answer: A\n",
144
+ " \n",
145
+ " Question: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\n",
146
+ " A. pleasure.\n",
147
+ " B. happiness.\n",
148
+ " C. good.\n",
149
+ " D. virtue.\n",
150
+ " Answer: C\n",
151
+ " \n",
152
+ " Question: Psychological egoism is:\n",
153
+ " A. an ethical theory about how we ought to behave.\n",
154
+ " B. a generalization concerning the way people tend to behave.\n",
155
+ " C. a claim about human nature and the ways people are capable of behaving.\n",
156
+ " D. none of the above.\n",
157
+ " Answer: C\n",
158
+ " \n",
159
+ " Question: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n",
160
+ " A. optimist\n",
161
+ " B. satisfied\n",
162
+ " C. nominally religious\n",
163
+ " D. pessimist\n",
164
+ " Answer: D\n",
165
+ " \n",
166
+ " Question: According to d'Holbach, people always act according to _____.\n",
167
+ " A. free choices\n",
168
+ " B. dictates of the soul\n",
169
+ " C. necessary natural laws\n",
170
+ " D. undetermined will\n",
171
+ " Answer: C\n",
172
+ " \n",
173
+ " Question: What does the notion of “meaning in life” refer to?\n",
174
+ " A. external meaning\n",
175
+ " B. god's plan\n",
176
+ " C. internalmeaning\n",
177
+ " D. meaning in an afterlife\n",
178
+ " Answer:\n",
179
+ " } [0.0s]\n",
180
+ " } [0.0s]\n",
181
+ " } [0.056s]\n",
182
+ " 1 requests\n",
183
+ " } [0.056s]\n",
184
+ " Executor.execute {\n",
185
+ " Parallelizing computation on 1 items over 4 threads {\n",
186
+ " Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
187
+ "\n",
188
+ " CUDA is available, initializing with a GPU...\n",
189
+ " 0%| | 0/1 [00:00<?, ?it/s] Loading Hugging Face model for config gpt2 {\n",
190
+ "\u001b[A/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/cuda/__init__.py:146: UserWarning: \n",
191
+ "NVIDIA A10G with CUDA capability sm_86 is not compatible with the current PyTorch installation.\n",
192
+ "The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.\n",
193
+ "If you want to use the NVIDIA A10G GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/\n",
194
+ "\n",
195
+ " warnings.warn(incompatible_device_warn.format(device_name, capability, \" \".join(arch_list), device_name))\n",
196
+ " } [1.554s]\n",
197
+ " Loading Hugging Face tokenizer model for config gpt2 {\n",
198
+ " } [0.128s]\n",
199
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
200
+ " HuggingFace error: CUDA error: no kernel image is available for execution on the device\n",
201
+ "CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\n",
202
+ "For debugging consider passing CUDA_LAUNCH_BLOCKING=1.\n",
203
+ " Request failed. Retrying (attempt #2) in 1 seconds... (See above for error details)\n",
204
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
205
+ " HuggingFace error: CUDA error: no kernel image is available for execution on the device\n",
206
+ "CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\n",
207
+ "For debugging consider passing CUDA_LAUNCH_BLOCKING=1.\n",
208
+ " Request failed. Retrying (attempt #3) in 11 seconds... (See above for error details)\n",
209
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
210
+ " HuggingFace error: CUDA error: no kernel image is available for execution on the device\n",
211
+ "CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\n",
212
+ "For debugging consider passing CUDA_LAUNCH_BLOCKING=1.\n",
213
+ " Request failed. Retrying (attempt #4) in 31 seconds... (See above for error details)\n",
214
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
215
+ " HuggingFace error: CUDA error: no kernel image is available for execution on the device\n",
216
+ "CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\n",
217
+ "For debugging consider passing CUDA_LAUNCH_BLOCKING=1.\n",
218
+ " Request failed. Retrying (attempt #5) in 71 seconds... (See above for error details)\n"
219
+ ]
220
+ }
221
+ ],
222
+ "source": [
223
+ "!helm-run --conf-paths run_specs.conf --local --max-eval-instances 1 --suite v1"
224
+ ]
225
+ },
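+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your output looks like the above, with repeated `CUDA error: no kernel image is available for execution on the device` retries, the installed PyTorch build lacks kernels for your GPU architecture (the warning here calls out an A10G, which needs `sm_86`). Letting the earlier `pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118` cell run to completion (it was cancelled mid-download above) and then restarting the kernel should resolve it, per the instructions the warning links to."
+ ]
+ },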
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 5,
229
+ "metadata": {
230
+ "colab": {
231
+ "base_uri": "https://localhost:8080/"
232
+ },
233
+ "id": "2nMIkB4Sz-4W",
234
+ "outputId": "e0821e91-4af2-42e4-8b6e-13ec560841e5"
235
+ },
236
+ "outputs": [
237
+ {
238
+ "name": "stdout",
239
+ "output_type": "stream",
240
+ "text": [
241
+ "main {\n",
242
+ " Reading schema from schema.yaml...\n",
243
+ " Reading contamination information from contamination.yaml...\n",
244
+ " validate_contamination {\n",
245
+ " } [0.0s]\n",
246
+ " 0%| | 0/7 [00:00<?, ?it/s] WARNING: costs.json doesn't have run_spec.json or stats.json, skipping\n",
247
+ " WARNING: groups.json doesn't have run_spec.json or stats.json, skipping\n",
248
+ " WARNING: groups_metadata.json doesn't have run_spec.json or stats.json, skipping\n",
249
+ " WARNING: mmlu:subject=philosophy,method=multiple_choice_joint,model=huggingface_gpt2 doesn't have run_spec.json or stats.json, skipping\n",
250
+ " WARNING: run_specs.json doesn't have run_spec.json or stats.json, skipping\n",
251
+ " WARNING: runs.json doesn't have run_spec.json or stats.json, skipping\n",
252
+ " WARNING: summary.json doesn't have run_spec.json or stats.json, skipping\n",
253
+ "100%|██████████████████████████████████████████| 7/7 [00:00<00:00, 36024.70it/s]\n",
254
+ " Summarizer.check_metrics_defined {\n",
255
+ " } [0.0s]\n",
256
+ " Summarizer.write_executive_summary {\n",
257
+ " Writing 43 characters to benchmark_output/runs/v1/summary.json\n",
258
+ " } [0.0s]\n",
259
+ " Writing 2 characters to benchmark_output/runs/v1/runs.json\n",
260
+ " Writing 2 characters to benchmark_output/runs/v1/run_specs.json\n",
261
+ " Writing 5062 characters to benchmark_output/runs/v1/groups.json\n",
262
+ " Writing 29556 characters to benchmark_output/runs/v1/groups_metadata.json\n",
263
+ " Summarizer.write_cost_report {\n",
264
+ " Writing 2 characters to benchmark_output/runs/v1/costs.json\n",
265
+ " } [0.0s]\n",
266
+ " Parallelizing computation on 0 items over 8 threads {\n",
267
+ "0it [00:00, ?it/s]\n",
268
+ " } [0.0s]\n",
269
+ " Symlinking benchmark_output/runs/v1 to latest.\n",
270
+ " Done.\n",
271
+ "} [0.36s]\n"
272
+ ]
273
+ }
274
+ ],
275
+ "source": [
276
+ "!helm-summarize --suite v1"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "markdown",
281
+ "metadata": {
282
+ "id": "-vu7kWwK5aMm"
283
+ },
284
+ "source": [
285
+ "You can now check out the results in the `benchmark_output` folder!"
286
+ ]
287
+ },
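+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For example (a minimal sketch, assuming the default output layout used by the run above), you can peek at the executive summary that `helm-summarize` just reported writing:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!cat benchmark_output/runs/v1/summary.json"
+ ]
+ },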
288
+ {
289
+ "cell_type": "markdown",
290
+ "metadata": {
291
+ "id": "23kU_nPO5pRi"
292
+ },
293
+ "source": [
294
+ "### Assignment Part 3:\n",
295
+ "\n",
296
+ "Try this process out on any Hugging Face model (`BLOOMz`, for example) and use an alternate suite or metric to examine the results!\n",
297
+ "\n",
298
+ "Refer to the [docs](https://crfm-helm.readthedocs.io/en/latest/) for a comprehensive overview of the available options!\n",
299
+ "\n"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 6,
305
+ "metadata": {
306
+ "id": "a4C7shhp6Q8u"
307
+ },
308
+ "outputs": [],
309
+ "source": [
310
+ "### YOUR CODE HERE"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 5,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "!echo 'entries: [{description: \"mmlu:subject=philosophy,model=gooseai/gpt-j-6b\", priority: 1}]' > run_specs.conf"
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": 6,
325
+ "metadata": {},
326
+ "outputs": [
327
+ {
328
+ "name": "stdout",
329
+ "output_type": "stream",
330
+ "text": [
331
+ "main {\n",
332
+ " Read 1 run entries from run_specs.conf\n",
333
+ " 1 entries produced 1 run specs\n",
334
+ " run_specs {\n",
335
+ " RunSpec(name='mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b', scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), adapter_spec=AdapterSpec(method='multiple_choice_joint', global_prefix='', instructions='The following are multiple choice questions (with answers) about philosophy.\\n', input_prefix='Question: ', input_suffix='\\n', reference_prefix='A. ', reference_suffix='\\n', output_prefix='Answer: ', output_suffix='\\n', instance_prefix='\\n', substitutions=[], max_train_instances=5, max_eval_instances=1, num_outputs=5, num_train_trials=1, sample_train=True, model='gooseai/gpt-j-6b', temperature=0.0, max_tokens=1, stop_sequences=['\\n'], random=None), metric_specs=[MetricSpec(class_name='helm.benchmark.basic_metrics.BasicMetric', args={'names': ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']})], data_augmenter_spec=DataAugmenterSpec(perturbation_specs=[], should_augment_train_instances=False, should_include_original_train=False, should_skip_unchanged_train=False, should_augment_eval_instances=False, should_include_original_eval=False, should_skip_unchanged_eval=False, seeds_per_instance=1), groups=['mmlu'])\n",
336
+ " } [0.0s]\n",
337
+ " Running in local mode with base path: prod_env\n",
338
+ "Looking in path: prod_env\n",
339
+ " AutoClient: cache_path = prod_env/cache\n",
340
+ " AutoClient: mongo_uri = \n",
341
+ " Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
342
+ " Found 1 account(s).\n",
343
+ " 0%| | 0/1 [00:00<?, ?it/s] Running mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b {\n",
344
+ " scenario.get_instances {\n",
345
+ " ensure_file_downloaded {\n",
346
+ " Not downloading https://people.eecs.berkeley.edu/~hendrycks/data.tar because benchmark_output/scenarios/mmlu/data already exists\n",
347
+ " } [0.0s]\n",
348
+ " benchmark_output/scenarios/mmlu/data/auxiliary_train/philosophy_auxiliary_train.csv doesn't exist, skipping\n",
349
+ " Reading benchmark_output/scenarios/mmlu/data/dev/philosophy_dev.csv\n",
350
+ " Reading benchmark_output/scenarios/mmlu/data/val/philosophy_val.csv\n",
351
+ " Reading benchmark_output/scenarios/mmlu/data/test/philosophy_test.csv\n",
352
+ " } [0.004s]\n",
353
+ " 350 instances, 5 train instances, 1/345 eval instances\n",
354
+ " DataPreprocessor.preprocess {\n",
355
+ " } [0.0s]\n",
356
+ " MultipleChoiceJointAdapter.adapt {\n",
357
+ " 6 instances, choosing 5/5 train instances, 1 eval instances\n",
358
+ " Adapting with train_trial_index=0 {\n",
359
+ " Sampled 5 examples for trial #0.\n",
360
+ " Parallelizing computation on 1 items over 4 threads {\n",
361
+ " Created cache with config: SqliteCacheConfig(path='prod_env/cache/EleutherAI.sqlite')\n",
362
+ "\n",
363
+ " Loading EleutherAI/gpt-j-6B with Hugging Face Transformers {\n",
364
+ " 0%| | 0/1 [00:00<?, ?it/s]\u001b[A Local files do not exist for HuggingFace tokenizer: EleutherAI/gpt-j-6B. Downloading...\n",
365
+ "\n",
366
+ "\n",
367
+ "Downloading (…)okenizer_config.json: 100%|█████| 619/619 [00:00<00:00, 7.77MB/s]\u001b[A\u001b[A\n",
368
+ "\n",
369
+ "\n",
370
+ "Downloading (…)olve/main/vocab.json: 100%|███| 798k/798k [00:00<00:00, 48.5MB/s]\u001b[A\u001b[A\n",
371
+ "\n",
372
+ "\n",
373
+ "Downloading (…)olve/main/merges.txt: 100%|███| 456k/456k [00:00<00:00, 84.1MB/s]\u001b[A\u001b[A\n",
374
+ "\n",
375
+ "\n",
376
+ "Downloading (…)/main/tokenizer.json: 100%|██| 1.37M/1.37M [00:00<00:00, 114MB/s]\u001b[A\u001b[A\n",
377
+ "\n",
378
+ "\n",
379
+ "Downloading (…)in/added_tokens.json: 100%|█| 4.04k/4.04k [00:00<00:00, 58.4MB/s]\u001b[A\u001b[A\n",
380
+ "\n",
381
+ "\n",
382
+ "Downloading (…)cial_tokens_map.json: 100%|█████| 357/357 [00:00<00:00, 5.15MB/s]\u001b[A\u001b[A\n",
383
+ " } [0.804s]\n",
384
+ "\n",
385
+ "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 1.24it/s]\u001b[A\n",
386
+ " } [0.809s]\n",
387
+ " Sample prompts {\n",
388
+ " reference index = None, request_mode = None {\n",
389
+ " The following are multiple choice questions (with answers) about philosophy.\n",
390
+ " \n",
391
+ " Question: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n",
392
+ " A. metaphysics\n",
393
+ " B. epistemology\n",
394
+ " C. quantum physics\n",
395
+ " D. axiology\n",
396
+ " Answer: A\n",
397
+ " \n",
398
+ " Question: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\n",
399
+ " A. pleasure.\n",
400
+ " B. happiness.\n",
401
+ " C. good.\n",
402
+ " D. virtue.\n",
403
+ " Answer: C\n",
404
+ " \n",
405
+ " Question: Psychological egoism is:\n",
406
+ " A. an ethical theory about how we ought to behave.\n",
407
+ " B. a generalization concerning the way people tend to behave.\n",
408
+ " C. a claim about human nature and the ways people are capable of behaving.\n",
409
+ " D. none of the above.\n",
410
+ " Answer: C\n",
411
+ " \n",
412
+ " Question: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n",
413
+ " A. optimist\n",
414
+ " B. satisfied\n",
415
+ " C. nominally religious\n",
416
+ " D. pessimist\n",
417
+ " Answer: D\n",
418
+ " \n",
419
+ " Question: According to d'Holbach, people always act according to _____.\n",
420
+ " A. free choices\n",
421
+ " B. dictates of the soul\n",
422
+ " C. necessary natural laws\n",
423
+ " D. undetermined will\n",
424
+ " Answer: C\n",
425
+ " \n",
426
+ " Question: What does the notion of “meaning in life” refer to?\n",
427
+ " A. external meaning\n",
428
+ " B. god's plan\n",
429
+ " C. internalmeaning\n",
430
+ " D. meaning in an afterlife\n",
431
+ " Answer:\n",
432
+ " } [0.0s]\n",
433
+ " } [0.0s]\n",
434
+ " } [0.81s]\n",
435
+ " 1 requests\n",
436
+ " } [0.81s]\n",
437
+ " Executor.execute {\n",
438
+ " Parallelizing computation on 1 items over 4 threads {\n",
439
+ "\n",
440
+ " 0%| | 0/1 [00:00<?, ?it/s]\u001b[A\n",
441
+ " } [0.007s]\n",
442
+ " } [0.008s]\n",
443
+ " } [0.825s]\n",
444
+ " Error when running mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b:\n",
445
+ "Traceback (most recent call last):\n",
446
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 84, in process\n",
447
+ " result: RequestResult = self.service.make_request(self.execution_spec.auth, state.request)\n",
448
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/services/server_service.py\", line 96, in make_request\n",
449
+ " request_result: RequestResult = self.client.make_request(request)\n",
450
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/clients/auto_client.py\", line 172, in make_request\n",
451
+ " client: Client = self._get_client(request.model)\n",
452
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/clients/auto_client.py\", line 108, in _get_client\n",
453
+ " api_key=self.credentials[\"gooseaiApiKey\"], cache_config=cache_config, org_id=org_id\n",
454
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 393, in __getitem__\n",
455
+ " val = self.get(item)\n",
456
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 236, in get\n",
457
+ " return self._get(ConfigTree.parse_key(key), 0, default)\n",
458
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 176, in _get\n",
459
+ " raise ConfigMissingException(\n",
460
+ "pyhocon.exceptions.ConfigMissingException: 'No configuration setting found for key gooseaiApiKey'\n",
461
+ "\n",
462
+ "The above exception was the direct cause of the following exception:\n",
463
+ "\n",
464
+ "Traceback (most recent call last):\n",
465
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 140, in run_all\n",
466
+ " self.run_one(run_spec)\n",
467
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 214, in run_one\n",
468
+ " scenario_state = self.executor.execute(scenario_state)\n",
469
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/hierarchical_logger.py\", line 104, in wrapper\n",
470
+ " return fn(*args, **kwargs)\n",
471
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 73, in execute\n",
472
+ " request_states = parallel_map(\n",
473
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/general.py\", line 227, in parallel_map\n",
474
+ " results = list(tqdm(executor.map(process, items), total=len(items), disable=None))\n",
475
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tqdm/std.py\", line 1195, in __iter__\n",
476
+ " for obj in iterable:\n",
477
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 621, in result_iterator\n",
478
+ " yield _result_or_cancel(fs.pop())\n",
479
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 319, in _result_or_cancel\n",
480
+ " return fut.result(timeout)\n",
481
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 458, in result\n",
482
+ " return self.__get_result()\n",
483
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 403, in __get_result\n",
484
+ " raise self._exception\n",
485
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/thread.py\", line 58, in run\n",
486
+ " result = self.fn(*self.args, **self.kwargs)\n",
487
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 86, in process\n",
488
+ " raise ExecutorError(f\"{str(e)} Request: {state.request}\") from e\n",
489
+ "helm.benchmark.executor.ExecutorError: 'No configuration setting found for key gooseaiApiKey' Request: Request(model='gooseai/gpt-j-6b', embedding=False, prompt=\"The following are multiple choice questions (with answers) about philosophy.\\n\\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\\nA. metaphysics\\nB. epistemology\\nC. quantum physics\\nD. axiology\\nAnswer: A\\n\\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\\nA. pleasure.\\nB. happiness.\\nC. good.\\nD. virtue.\\nAnswer: C\\n\\nQuestion: Psychological egoism is:\\nA. an ethical theory about how we ought to behave.\\nB. a generalization concerning the way people tend to behave.\\nC. a claim about human nature and the ways people are capable of behaving.\\nD. none of the above.\\nAnswer: C\\n\\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\\nA. optimist\\nB. satisfied\\nC. nominally religious\\nD. pessimist\\nAnswer: D\\n\\nQuestion: According to d'Holbach, people always act according to _____.\\nA. free choices\\nB. dictates of the soul\\nC. necessary natural laws\\nD. undetermined will\\nAnswer: C\\n\\nQuestion: What does the notion of “meaning in life” refer to?\\nA. external meaning\\nB. god's plan\\nC. internalmeaning\\nD. meaning in an afterlife\\nAnswer:\", temperature=0.0, num_completions=1, top_k_per_token=5, max_tokens=1, stop_sequences=[], echo_prompt=False, top_p=1, presence_penalty=0, frequency_penalty=0, random=None, messages=None)\n",
490
+ "\n",
491
+ "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 1.21it/s]\n",
492
+ "} [0.843s]\n",
493
+ "Traceback (most recent call last):\n",
494
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/bin/helm-run\", line 8, in <module>\n",
495
+ " sys.exit(main())\n",
496
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/hierarchical_logger.py\", line 104, in wrapper\n",
497
+ " return fn(*args, **kwargs)\n",
498
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/run.py\", line 289, in main\n",
499
+ " run_benchmarking(\n",
500
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/run.py\", line 107, in run_benchmarking\n",
501
+ " runner.run_all(run_specs)\n",
502
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 149, in run_all\n",
503
+ " raise RunnerError(f\"Failed runs: [{failed_runs_str}]\")\n",
504
+ "helm.benchmark.runner.RunnerError: Failed runs: [\"mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b\"]\n"
505
+ ]
506
+ }
507
+ ],
508
+ "source": [
509
+ "!helm-run --conf-paths run_specs.conf --local --max-eval-instances 1 --suite v1"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": 7,
515
+ "metadata": {},
516
+ "outputs": [
517
+ {
518
+ "name": "stdout",
519
+ "output_type": "stream",
520
+ "text": [
521
+ "main {\n",
522
+ " Read 1 run entries from run_specs.conf\n",
523
+ " 1 entries produced 1 run specs\n",
524
+ " run_specs {\n",
525
+ " RunSpec(name='mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b', scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), adapter_spec=AdapterSpec(method='multiple_choice_joint', global_prefix='', instructions='The following are multiple choice questions (with answers) about philosophy.\\n', input_prefix='Question: ', input_suffix='\\n', reference_prefix='A. ', reference_suffix='\\n', output_prefix='Answer: ', output_suffix='\\n', instance_prefix='\\n', substitutions=[], max_train_instances=5, max_eval_instances=1, num_outputs=5, num_train_trials=1, sample_train=True, model='gooseai/gpt-j-6b', temperature=0.0, max_tokens=1, stop_sequences=['\\n'], random=None), metric_specs=[MetricSpec(class_name='helm.benchmark.basic_metrics.BasicMetric', args={'names': ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']})], data_augmenter_spec=DataAugmenterSpec(perturbation_specs=[], should_augment_train_instances=False, should_include_original_train=False, should_skip_unchanged_train=False, should_augment_eval_instances=False, should_include_original_eval=False, should_skip_unchanged_eval=False, seeds_per_instance=1), groups=['mmlu'])\n",
526
+ " } [0.0s]\n",
527
+ " Running in local mode with base path: prod_env\n",
528
+ "Looking in path: prod_env\n",
529
+ " AutoClient: cache_path = prod_env/cache\n",
530
+ " AutoClient: mongo_uri = \n",
531
+ " Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
532
+ " Found 1 account(s).\n",
533
+ " 0%| | 0/1 [00:00<?, ?it/s] Running mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b {\n",
534
+ " scenario.get_instances {\n",
535
+ " ensure_file_downloaded {\n",
536
+ " Not downloading https://people.eecs.berkeley.edu/~hendrycks/data.tar because benchmark_output/scenarios/mmlu/data already exists\n",
537
+ " } [0.0s]\n",
538
+ " benchmark_output/scenarios/mmlu/data/auxiliary_train/philosophy_auxiliary_train.csv doesn't exist, skipping\n",
539
+ " Reading benchmark_output/scenarios/mmlu/data/dev/philosophy_dev.csv\n",
540
+ " Reading benchmark_output/scenarios/mmlu/data/val/philosophy_val.csv\n",
541
+ " Reading benchmark_output/scenarios/mmlu/data/test/philosophy_test.csv\n",
542
+ " } [0.004s]\n",
543
+ " 350 instances, 5 train instances, 1/345 eval instances\n",
544
+ " DataPreprocessor.preprocess {\n",
545
+ " } [0.0s]\n",
546
+ " MultipleChoiceJointAdapter.adapt {\n",
547
+ " 6 instances, choosing 5/5 train instances, 1 eval instances\n",
548
+ " Adapting with train_trial_index=0 {\n",
549
+ " Sampled 5 examples for trial #0.\n",
550
+ " Parallelizing computation on 1 items over 4 threads {\n",
551
+ " Created cache with config: SqliteCacheConfig(path='prod_env/cache/EleutherAI.sqlite')\n",
552
+ "\n",
553
+ " Loading EleutherAI/gpt-j-6B with Hugging Face Transformers {\n",
554
+ " 0%| | 0/1 [00:00<?, ?it/s]\u001b[A } [0.095s]\n",
555
+ "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.32it/s]\n",
556
+ " } [0.097s]\n",
557
+ " Sample prompts {\n",
558
+ " reference index = None, request_mode = None {\n",
559
+ " The following are multiple choice questions (with answers) about philosophy.\n",
560
+ " \n",
561
+ " Question: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n",
562
+ " A. metaphysics\n",
563
+ " B. epistemology\n",
564
+ " C. quantum physics\n",
565
+ " D. axiology\n",
566
+ " Answer: A\n",
567
+ " \n",
568
+ " Question: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\n",
569
+ " A. pleasure.\n",
570
+ " B. happiness.\n",
571
+ " C. good.\n",
572
+ " D. virtue.\n",
573
+ " Answer: C\n",
574
+ " \n",
575
+ " Question: Psychological egoism is:\n",
576
+ " A. an ethical theory about how we ought to behave.\n",
577
+ " B. a generalization concerning the way people tend to behave.\n",
578
+ " C. a claim about human nature and the ways people are capable of behaving.\n",
579
+ " D. none of the above.\n",
580
+ " Answer: C\n",
581
+ " \n",
582
+ " Question: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n",
583
+ " A. optimist\n",
584
+ " B. satisfied\n",
585
+ " C. nominally religious\n",
586
+ " D. pessimist\n",
587
+ " Answer: D\n",
588
+ " \n",
589
+ " Question: According to d'Holbach, people always act according to _____.\n",
590
+ " A. free choices\n",
591
+ " B. dictates of the soul\n",
592
+ " C. necessary natural laws\n",
593
+ " D. undetermined will\n",
594
+ " Answer: C\n",
595
+ " \n",
596
+ " Question: What does the notion of “meaning in life” refer to?\n",
597
+ " A. external meaning\n",
598
+ " B. god's plan\n",
599
+ " C. internalmeaning\n",
600
+ " D. meaning in an afterlife\n",
601
+ " Answer:\n",
602
+ " } [0.0s]\n",
603
+ " } [0.0s]\n",
604
+ " } [0.098s]\n",
605
+ " 1 requests\n",
606
+ " } [0.098s]\n",
607
+ " Executor.execute {\n",
608
+ " Parallelizing computation on 1 items over 4 threads {\n",
609
+ "\n",
610
+ " 0%| | 0/1 [00:00<?, ?it/s]\u001b[A\n",
611
+ " } [0.008s]\n",
612
+ " } [0.008s]\n",
613
+ " } [0.113s]\n",
614
+ " Error when running mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b:\n",
615
+ "Traceback (most recent call last):\n",
616
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 84, in process\n",
617
+ " result: RequestResult = self.service.make_request(self.execution_spec.auth, state.request)\n",
618
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/services/server_service.py\", line 96, in make_request\n",
619
+ " request_result: RequestResult = self.client.make_request(request)\n",
620
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/clients/auto_client.py\", line 172, in make_request\n",
621
+ " client: Client = self._get_client(request.model)\n",
622
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/clients/auto_client.py\", line 108, in _get_client\n",
623
+ " api_key=self.credentials[\"gooseaiApiKey\"], cache_config=cache_config, org_id=org_id\n",
624
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 393, in __getitem__\n",
625
+ " val = self.get(item)\n",
626
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 236, in get\n",
627
+ " return self._get(ConfigTree.parse_key(key), 0, default)\n",
628
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 176, in _get\n",
629
+ " raise ConfigMissingException(\n",
630
+ "pyhocon.exceptions.ConfigMissingException: 'No configuration setting found for key gooseaiApiKey'\n",
631
+ "\n",
632
+ "The above exception was the direct cause of the following exception:\n",
633
+ "\n",
634
+ "Traceback (most recent call last):\n",
635
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 140, in run_all\n",
636
+ " self.run_one(run_spec)\n",
637
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 214, in run_one\n",
638
+ " scenario_state = self.executor.execute(scenario_state)\n",
639
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/hierarchical_logger.py\", line 104, in wrapper\n",
640
+ " return fn(*args, **kwargs)\n",
641
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 73, in execute\n",
642
+ " request_states = parallel_map(\n",
643
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/general.py\", line 227, in parallel_map\n",
644
+ " results = list(tqdm(executor.map(process, items), total=len(items), disable=None))\n",
645
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tqdm/std.py\", line 1195, in __iter__\n",
646
+ " for obj in iterable:\n",
647
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 621, in result_iterator\n",
648
+ " yield _result_or_cancel(fs.pop())\n",
649
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 319, in _result_or_cancel\n",
650
+ " return fut.result(timeout)\n",
651
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 458, in result\n",
652
+ " return self.__get_result()\n",
653
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 403, in __get_result\n",
654
+ " raise self._exception\n",
655
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/thread.py\", line 58, in run\n",
656
+ " result = self.fn(*self.args, **self.kwargs)\n",
657
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 86, in process\n",
658
+ " raise ExecutorError(f\"{str(e)} Request: {state.request}\") from e\n",
659
+ "helm.benchmark.executor.ExecutorError: 'No configuration setting found for key gooseaiApiKey' Request: Request(model='gooseai/gpt-j-6b', embedding=False, prompt=\"The following are multiple choice questions (with answers) about philosophy.\\n\\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\\nA. metaphysics\\nB. epistemology\\nC. quantum physics\\nD. axiology\\nAnswer: A\\n\\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\\nA. pleasure.\\nB. happiness.\\nC. good.\\nD. virtue.\\nAnswer: C\\n\\nQuestion: Psychological egoism is:\\nA. an ethical theory about how we ought to behave.\\nB. a generalization concerning the way people tend to behave.\\nC. a claim about human nature and the ways people are capable of behaving.\\nD. none of the above.\\nAnswer: C\\n\\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\\nA. optimist\\nB. satisfied\\nC. nominally religious\\nD. pessimist\\nAnswer: D\\n\\nQuestion: According to d'Holbach, people always act according to _____.\\nA. free choices\\nB. dictates of the soul\\nC. necessary natural laws\\nD. undetermined will\\nAnswer: C\\n\\nQuestion: What does the notion of “meaning in life” refer to?\\nA. external meaning\\nB. god's plan\\nC. internalmeaning\\nD. meaning in an afterlife\\nAnswer:\", temperature=0.0, num_completions=1, top_k_per_token=5, max_tokens=1, stop_sequences=[], echo_prompt=False, top_p=1, presence_penalty=0, frequency_penalty=0, random=None, messages=None)\n",
660
+ "\n",
661
+ "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 8.74it/s]\n",
662
+ "} [0.131s]\n",
663
+ "Traceback (most recent call last):\n",
664
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/bin/helm-run\", line 8, in <module>\n",
665
+ " sys.exit(main())\n",
666
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/hierarchical_logger.py\", line 104, in wrapper\n",
667
+ " return fn(*args, **kwargs)\n",
668
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/run.py\", line 289, in main\n",
669
+ " run_benchmarking(\n",
670
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/run.py\", line 107, in run_benchmarking\n",
671
+ " runner.run_all(run_specs)\n",
672
+ " File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 149, in run_all\n",
673
+ " raise RunnerError(f\"Failed runs: [{failed_runs_str}]\")\n",
674
+ "helm.benchmark.runner.RunnerError: Failed runs: [\"mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b\"]\n"
675
+ ]
676
+ }
677
+ ],
678
+ "source": [
679
+ "!helm-run --conf-paths run_specs.conf --local --max-eval-instances 1 --suite v1"
680
+ ]
681
+ },
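+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Both runs above fail for the same reason: `gooseai/gpt-j-6b` is served through the GooseAI API, and the traceback shows HELM looking up a `gooseaiApiKey` credential that was never provided. As a hedged sketch (the file path follows the HELM docs for local mode, and `YOUR_GOOSEAI_API_KEY` is a placeholder), writing the key into `prod_env/credentials.conf` before re-running `helm-run` should get past this error:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This overwrites any existing credentials file; append with >> instead if you have other keys.\n",
+ "!echo 'gooseaiApiKey: YOUR_GOOSEAI_API_KEY' > prod_env/credentials.conf"
+ ]
+ },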
682
+ {
683
+ "cell_type": "code",
684
+ "execution_count": null,
685
+ "metadata": {},
686
+ "outputs": [],
687
+ "source": []
688
+ }
689
+ ],
690
+ "metadata": {
691
+ "accelerator": "GPU",
692
+ "colab": {
693
+ "gpuType": "A100",
694
+ "machine_shape": "hm",
695
+ "provenance": []
696
+ },
697
+ "kernelspec": {
698
+ "display_name": "conda_pytorch_p310",
699
+ "language": "python",
700
+ "name": "conda_pytorch_p310"
701
+ },
702
+ "language_info": {
703
+ "codemirror_mode": {
704
+ "name": "ipython",
705
+ "version": 3
706
+ },
707
+ "file_extension": ".py",
708
+ "mimetype": "text/x-python",
709
+ "name": "python",
710
+ "nbconvert_exporter": "python",
711
+ "pygments_lexer": "ipython3",
712
+ "version": "3.10.12"
713
+ }
714
+ },
715
+ "nbformat": 4,
716
+ "nbformat_minor": 1
717
+ }