Tristan committed on
Commit
a332017
1 Parent(s): e82b954

Training in progress, epoch 1

Browse files
eval_job_output.txt CHANGED
@@ -1,4 +1,4 @@
1
- slurm submission log: 2024-05-22 17:07:25.162212
2
  created following sbatch script:
3
 
4
  ###############################
@@ -7,13 +7,13 @@ created following sbatch script:
7
 
8
  #SBATCH --account=nlp
9
  #SBATCH --cpus-per-task=16
10
- #SBATCH --dependency=afterok:7642740
11
  #SBATCH --gres=gpu:1
12
- #SBATCH --job-name=tthrush-job-2791360
13
  #SBATCH --mem=60G
14
  #SBATCH --nodelist=sphinx1
15
  #SBATCH --open-mode=append
16
- #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/eval_job_output.txt
17
  #SBATCH --partition=sphinx
18
  #SBATCH --time=14-0
19
 
@@ -24,7 +24,7 @@ created following sbatch script:
24
  cd .
25
 
26
  # launch commands
27
- srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/perf'
28
 
29
  ###############################
30
 
@@ -34,249 +34,7 @@ submission to slurm complete!
34
  ###############################
35
  slurm submission output
36
 
37
- Submitted batch job 7642741
38
-
39
-
40
-
41
- ###############################
42
-
43
- slurm submission log: 2024-05-22 17:23:51.579657
44
- created following sbatch script:
45
-
46
- ###############################
47
-
48
- #!/bin/bash
49
-
50
- #SBATCH --account=nlp
51
- #SBATCH --cpus-per-task=16
52
- #SBATCH --dependency=afterok:7642780
53
- #SBATCH --gres=gpu:1
54
- #SBATCH --job-name=tthrush-job-1978619
55
- #SBATCH --mem=60G
56
- #SBATCH --nodelist=sphinx1
57
- #SBATCH --open-mode=append
58
- #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/eval_job_output.txt
59
- #SBATCH --partition=sphinx
60
- #SBATCH --time=14-0
61
-
62
- # activate your desired anaconda environment
63
- . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
-
65
- # cd to working directory
66
- cd .
67
-
68
- # launch commands
69
- srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/perf'
70
-
71
- ###############################
72
-
73
- submission to slurm complete!
74
-
75
-
76
- ###############################
77
- slurm submission output
78
-
79
- Submitted batch job 7642781
80
-
81
-
82
-
83
- ###############################
84
-
85
- slurm submission log: 2024-05-22 17:29:15.965569
86
- created following sbatch script:
87
-
88
- ###############################
89
-
90
- #!/bin/bash
91
-
92
- #SBATCH --account=nlp
93
- #SBATCH --cpus-per-task=16
94
- #SBATCH --dependency=afterok:7642805
95
- #SBATCH --gres=gpu:1
96
- #SBATCH --job-name=tthrush-job-2597090
97
- #SBATCH --mem=60G
98
- #SBATCH --nodelist=sphinx1
99
- #SBATCH --open-mode=append
100
- #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/eval_job_output.txt
101
- #SBATCH --partition=sphinx
102
- #SBATCH --time=14-0
103
-
104
- # activate your desired anaconda environment
105
- . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
106
-
107
- # cd to working directory
108
- cd .
109
-
110
- # launch commands
111
- srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/perf'
112
-
113
- ###############################
114
-
115
- submission to slurm complete!
116
-
117
-
118
- ###############################
119
- slurm submission output
120
-
121
- Submitted batch job 7642806
122
-
123
-
124
-
125
- ###############################
126
-
127
- /var/lib/slurm/slurmd/job7642806/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory
128
-
129
- CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'.
130
- To initialize your shell, run
131
-
132
- $ conda init <SHELL_NAME>
133
-
134
- Currently supported shells are:
135
- - bash
136
- - fish
137
- - tcsh
138
- - xonsh
139
- - zsh
140
- - powershell
141
-
142
- See 'conda init --help' for more information and options.
143
-
144
- IMPORTANT: You may need to close and restart your shell after running 'conda init'.
145
-
146
-
147
- ###############################
148
- start time: 2024-05-22 17:31:53.790973
149
- machine: sphinx1
150
- conda env: pretraining-coreset-selection
151
- ###############################
152
- running following processes
153
-
154
- lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/perf
155
-
156
-
157
- ###############################
158
- command outputs:
159
-
160
-
161
- 2024-05-22:17:31:59,220 INFO [utils.py:145] Note: detected 255 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
162
- 2024-05-22:17:31:59,220 INFO [utils.py:148] Note: NumExpr detected 255 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
163
- 2024-05-22:17:31:59,220 INFO [utils.py:160] NumExpr defaulting to 8 threads.
164
- 2024-05-22:17:31:59,893 INFO [config.py:58] PyTorch version 2.2.2 available.
165
- 2024-05-22:17:32:04,203 INFO [__main__.py:156] Verbosity set to INFO
166
- 2024-05-22:17:32:14,116 WARNING [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
167
- /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py:1429: FutureWarning: The repository for hails/mmlu_no_train contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hails/mmlu_no_train
168
- You can avoid this message in future by passing the argument `trust_remote_code=True`.
169
- Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
170
- warnings.warn(
171
- 2024-05-22:17:33:33,299 WARNING [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
172
- 2024-05-22:17:33:33,305 INFO [__main__.py:229] Selected Tasks: ['arc_easy', 'lambada', 'piqa', 'sciq', 'xnli_en', 'xnli_fr']
173
- 2024-05-22:17:33:33,714 INFO [huggingface.py:148] Using device 'cuda'
174
- Traceback (most recent call last):
175
- File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/lm_eval", line 8, in <module>
176
- sys.exit(cli_evaluate())
177
- File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/__main__.py", line 231, in cli_evaluate
178
- results = evaluator.simple_evaluate(
179
- File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/utils.py", line 415, in _wrapper
180
- return fn(*args, **kwargs)
181
- File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/evaluator.py", line 98, in simple_evaluate
182
- lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
183
- File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/api/model.py", line 134, in create_from_arg_string
184
- return cls(**args, **args2)
185
- File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/models/huggingface.py", line 174, in __init__
186
- self._get_config(
187
- File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/models/huggingface.py", line 420, in _get_config
188
- self._config = transformers.AutoConfig.from_pretrained(
189
- File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 928, in from_pretrained
190
- config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
191
- File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/configuration_utils.py", line 631, in get_config_dict
192
- config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
193
- File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/configuration_utils.py", line 686, in _get_config_dict
194
- resolved_config_file = cached_file(
195
- File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/utils/hub.py", line 369, in cached_file
196
- raise EnvironmentError(
197
- OSError: /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 does not appear to have a file named config.json. Checkout 'https://huggingface.co//juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/tree/main' for available files.
198
- ###############################
199
- end time: 2024-05-22 17:33:43.911118
200
- elapsed time: 0:01:50.120145
201
- slurm submission log: 2024-05-22 17:41:39.457215
202
- created following sbatch script:
203
-
204
- ###############################
205
-
206
- #!/bin/bash
207
-
208
- #SBATCH --account=nlp
209
- #SBATCH --cpus-per-task=16
210
- #SBATCH --dependency=afterok:7642834
211
- #SBATCH --gres=gpu:1
212
- #SBATCH --job-name=tthrush-job-24240
213
- #SBATCH --mem=60G
214
- #SBATCH --nodelist=sphinx1
215
- #SBATCH --open-mode=append
216
- #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/eval_job_output.txt
217
- #SBATCH --partition=sphinx
218
- #SBATCH --time=14-0
219
-
220
- # activate your desired anaconda environment
221
- . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
222
-
223
- # cd to working directory
224
- cd .
225
-
226
- # launch commands
227
- srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/perf'
228
-
229
- ###############################
230
-
231
- submission to slurm complete!
232
-
233
-
234
- ###############################
235
- slurm submission output
236
-
237
- Submitted batch job 7642835
238
-
239
-
240
-
241
- ###############################
242
-
243
- slurm submission log: 2024-05-22 19:52:23.060911
244
- created following sbatch script:
245
-
246
- ###############################
247
-
248
- #!/bin/bash
249
-
250
- #SBATCH --account=nlp
251
- #SBATCH --cpus-per-task=16
252
- #SBATCH --dependency=afterok:7643057
253
- #SBATCH --gres=gpu:1
254
- #SBATCH --job-name=tthrush-job-4137796
255
- #SBATCH --mem=60G
256
- #SBATCH --nodelist=sphinx1
257
- #SBATCH --open-mode=append
258
- #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/eval_job_output.txt
259
- #SBATCH --partition=sphinx
260
- #SBATCH --time=14-0
261
-
262
- # activate your desired anaconda environment
263
- . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
264
-
265
- # cd to working directory
266
- cd .
267
-
268
- # launch commands
269
- srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/perf'
270
-
271
- ###############################
272
-
273
- submission to slurm complete!
274
-
275
-
276
- ###############################
277
- slurm submission output
278
-
279
- Submitted batch job 7643058
280
 
281
 
282
 
 
1
+ slurm submission log: 2024-05-23 14:58:53.803713
2
  created following sbatch script:
3
 
4
  ###############################
 
7
 
8
  #SBATCH --account=nlp
9
  #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:7645740
11
  #SBATCH --gres=gpu:1
12
+ #SBATCH --job-name=tthrush-job-1104501
13
  #SBATCH --mem=60G
14
  #SBATCH --nodelist=sphinx1
15
  #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init/llms/pythia-70m_sciq_1/eval_job_output.txt
17
  #SBATCH --partition=sphinx
18
  #SBATCH --time=14-0
19
 
 
24
  cd .
25
 
26
  # launch commands
27
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init/llms/pythia-70m_sciq_1,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init/llms/pythia-70m_sciq_1/perf'
28
 
29
  ###############################
30
 
 
34
  ###############################
35
  slurm submission output
36
 
37
+ Submitted batch job 7645741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
 
logs/events.out.tfevents.1716527049.sphinx2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cd3605c757a4df33c8a571d76311882aafb9dd1234da5cf769c80b69c8c3b68
3
+ size 95678
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d1d0b5f6fe2fd1b3e778eb06905f9ff0b46cd859dd0d2696a8fdd8dfb3af932
3
  size 281715176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72d975c1a09895fb677a80c6976a87b7c0c808aff25c8bd8eea46a1f10e6607c
3
  size 281715176
train_job_output.txt CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42604c6e8628be2643fe1460d9bd416ac8e71af2f5f0e7182a340f5cb4e9907f
3
- size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e5ec4dcc9a1a5c561e3555297a62670552352ebb9dca8bbc21575d63cf52a8c
3
+ size 5240