Tristan committed on
Commit
4a21319
1 Parent(s): 737bf22

Training in progress, epoch 1

Browse files
eval_job_output.txt CHANGED
@@ -1,4 +1,4 @@
1
- slurm submission log: 2024-05-11 08:25:31.074168
2
  created following sbatch script:
3
 
4
  ###############################
@@ -7,13 +7,13 @@ created following sbatch script:
7
 
8
  #SBATCH --account=nlp
9
  #SBATCH --cpus-per-task=16
10
- #SBATCH --dependency=afterok:7597680
11
  #SBATCH --gres=gpu:1
12
- #SBATCH --job-name=tthrush-job-2692470
13
  #SBATCH --mem=60G
14
  #SBATCH --nodelist=sphinx2
15
  #SBATCH --open-mode=append
16
- #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_4/pythia-70m_sciq/eval_job_output.txt
17
  #SBATCH --partition=sphinx
18
  #SBATCH --time=14-0
19
 
@@ -24,7 +24,7 @@ created following sbatch script:
24
  cd .
25
 
26
  # launch commands
27
- srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_4/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_4/pythia-70m_sciq/perf'
28
 
29
  ###############################
30
 
@@ -34,7 +34,115 @@ submission to slurm complete!
34
  ###############################
35
  slurm submission output
36
 
37
- Submitted batch job 7597681
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
 
 
1
+ slurm submission log: 2024-05-11 17:54:08.850609
2
  created following sbatch script:
3
 
4
  ###############################
 
7
 
8
  #SBATCH --account=nlp
9
  #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:
11
  #SBATCH --gres=gpu:1
12
+ #SBATCH --job-name=tthrush-job-2902311
13
  #SBATCH --mem=60G
14
  #SBATCH --nodelist=sphinx2
15
  #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/eval_job_output.txt
17
  #SBATCH --partition=sphinx
18
  #SBATCH --time=14-0
19
 
 
24
  cd .
25
 
26
  # launch commands
27
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/perf'
28
 
29
  ###############################
30
 
 
34
  ###############################
35
  slurm submission output
36
 
37
+
38
+
39
+ sbatch: error: Batch job submission failed: Job dependency problem
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-11 17:55:07.106159
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7598873
53
+ #SBATCH --gres=gpu:1
54
+ #SBATCH --job-name=tthrush-job-552824
55
+ #SBATCH --mem=60G
56
+ #SBATCH --nodelist=sphinx2
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/eval_job_output.txt
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/perf'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7598874
80
+
81
+
82
+
83
+ ###############################
84
+
85
+ ###############################
86
+ start time: 2024-05-11 17:58:20.674188
87
+ machine: sphinx2
88
+ conda env: pretraining-coreset-selection
89
+ ###############################
90
+ running following processes
91
+
92
+ lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/perf
93
+
94
+
95
+ ###############################
96
+ command outputs:
97
+
98
+
99
+ 2024-05-11:17:58:30,428 INFO [utils.py:145] Note: detected 255 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
100
+ 2024-05-11:17:58:30,429 INFO [utils.py:148] Note: NumExpr detected 255 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
101
+ 2024-05-11:17:58:30,429 INFO [utils.py:160] NumExpr defaulting to 8 threads.
102
+ 2024-05-11:17:58:31,929 INFO [config.py:58] PyTorch version 2.2.2 available.
103
+ 2024-05-11:17:58:45,100 INFO [__main__.py:156] Verbosity set to INFO
104
+ 2024-05-11:17:59:06,539 WARNING [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
105
+ srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
106
+ slurmstepd: error: *** JOB 7598874 ON sphinx2 CANCELLED AT 2024-05-11T17:59:25 ***
107
+ slurmstepd: error: *** STEP 7598874.0 ON sphinx2 CANCELLED AT 2024-05-11T17:59:25 ***
108
+ Received SIGTERM, job terminating, terminating 1 processes...
109
+ slurm submission log: 2024-05-11 18:01:39.856307
110
+ created following sbatch script:
111
+
112
+ ###############################
113
+
114
+ #!/bin/bash
115
+
116
+ #SBATCH --account=nlp
117
+ #SBATCH --cpus-per-task=16
118
+ #SBATCH --dependency=afterok:7598912
119
+ #SBATCH --gres=gpu:1
120
+ #SBATCH --job-name=tthrush-job-2986377
121
+ #SBATCH --mem=60G
122
+ #SBATCH --nodelist=sphinx2
123
+ #SBATCH --open-mode=append
124
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/eval_job_output.txt
125
+ #SBATCH --partition=sphinx
126
+ #SBATCH --time=14-0
127
+
128
+ # activate your desired anaconda environment
129
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
130
+
131
+ # cd to working directory
132
+ cd .
133
+
134
+ # launch commands
135
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/perf'
136
+
137
+ ###############################
138
+
139
+ submission to slurm complete!
140
+
141
+
142
+ ###############################
143
+ slurm submission output
144
+
145
+ Submitted batch job 7598913
146
 
147
 
148
 
logs/events.out.tfevents.1715485378.sphinx2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15263a7225da1f4f0806bc2fba683d6809899abf9a452ff616540262cc4a3253
3
+ size 10945
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f51424cb0f092588de032db63dcef771464354ab09c74d632a708c36fa8acd6
3
  size 281715176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:612536822ee693a2397fef160cdf02f61f9e2b66af69438301805f21341c83f0
3
  size 281715176
train_job_output.txt CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e04bfdb92ab1879030b0d4669e0346cd3f25731fb0a293addd36be27daf958d
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daaa4e440a16d03580dcaf76961ff26dde4440ee9c04759894a86a546a4710e0
3
  size 5048