Tristan committed on
Commit
913ba44
1 Parent(s): cf7cf0d

Training in progress, epoch 1

Browse files
eval_job_output.txt CHANGED
@@ -1,4 +1,4 @@
1
- slurm submission log: 2024-05-09 15:03:34.997660
2
  created following sbatch script:
3
 
4
  ###############################
@@ -7,13 +7,13 @@ created following sbatch script:
7
 
8
  #SBATCH --account=nlp
9
  #SBATCH --cpus-per-task=16
10
- #SBATCH --dependency=afterok:7592327
11
  #SBATCH --gres=gpu:1
12
- #SBATCH --job-name=tthrush-job-3419847
13
  #SBATCH --mem=60G
14
  #SBATCH --nodelist=sphinx2
15
  #SBATCH --open-mode=append
16
- #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_arc_easy/eval_job_output.txt
17
  #SBATCH --partition=sphinx
18
  #SBATCH --time=14-0
19
 
@@ -24,7 +24,7 @@ created following sbatch script:
24
  cd .
25
 
26
  # launch commands
27
- srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_arc_easy,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_arc_easy/perf'
28
 
29
  ###############################
30
 
@@ -34,7 +34,133 @@ submission to slurm complete!
34
  ###############################
35
  slurm submission output
36
 
37
- Submitted batch job 7592328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
 
 
1
+ slurm submission log: 2024-05-09 22:15:13.606492
2
  created following sbatch script:
3
 
4
  ###############################
 
7
 
8
  #SBATCH --account=nlp
9
  #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:7593081
11
  #SBATCH --gres=gpu:1
12
+ #SBATCH --job-name=tthrush-job-1314089
13
  #SBATCH --mem=60G
14
  #SBATCH --nodelist=sphinx2
15
  #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/eval_job_output.txt
17
  #SBATCH --partition=sphinx
18
  #SBATCH --time=14-0
19
 
 
24
  cd .
25
 
26
  # launch commands
27
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/perf'
28
 
29
  ###############################
30
 
 
34
  ###############################
35
  slurm submission output
36
 
37
+ Submitted batch job 7593082
38
+
39
+
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-09 23:03:16.593571
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7593159
53
+ #SBATCH --gres=gpu:1
54
+ #SBATCH --job-name=tthrush-job-1934302
55
+ #SBATCH --mem=60G
56
+ #SBATCH --nodelist=sphinx2
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/eval_job_output.txt
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/perf'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7593160
80
+
81
+
82
+
83
+ ###############################
84
+
85
+ slurm submission log: 2024-05-10 08:21:55.097144
86
+ created following sbatch script:
87
+
88
+ ###############################
89
+
90
+ #!/bin/bash
91
+
92
+ #SBATCH --account=nlp
93
+ #SBATCH --cpus-per-task=16
94
+ #SBATCH --dependency=afterok:7593615
95
+ #SBATCH --gres=gpu:1
96
+ #SBATCH --job-name=tthrush-job-1652949
97
+ #SBATCH --mem=60G
98
+ #SBATCH --nodelist=sphinx1
99
+ #SBATCH --open-mode=append
100
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/eval_job_output.txt
101
+ #SBATCH --partition=sphinx
102
+ #SBATCH --time=14-0
103
+
104
+ # activate your desired anaconda environment
105
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
106
+
107
+ # cd to working directory
108
+ cd .
109
+
110
+ # launch commands
111
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/perf'
112
+
113
+ ###############################
114
+
115
+ submission to slurm complete!
116
+
117
+
118
+ ###############################
119
+ slurm submission output
120
+
121
+ Submitted batch job 7593616
122
+
123
+
124
+
125
+ ###############################
126
+
127
+ slurm submission log: 2024-05-10 08:23:21.265984
128
+ created following sbatch script:
129
+
130
+ ###############################
131
+
132
+ #!/bin/bash
133
+
134
+ #SBATCH --account=nlp
135
+ #SBATCH --cpus-per-task=16
136
+ #SBATCH --dependency=afterok:7593628
137
+ #SBATCH --gres=gpu:1
138
+ #SBATCH --job-name=tthrush-job-4696529
139
+ #SBATCH --mem=60G
140
+ #SBATCH --nodelist=sphinx2
141
+ #SBATCH --open-mode=append
142
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/eval_job_output.txt
143
+ #SBATCH --partition=sphinx
144
+ #SBATCH --time=14-0
145
+
146
+ # activate your desired anaconda environment
147
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
148
+
149
+ # cd to working directory
150
+ cd .
151
+
152
+ # launch commands
153
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/perf'
154
+
155
+ ###############################
156
+
157
+ submission to slurm complete!
158
+
159
+
160
+ ###############################
161
+ slurm submission output
162
+
163
+ Submitted batch job 7593629
164
 
165
 
166
 
logs/events.out.tfevents.1715379130.sphinx2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64ac36094afdbbec94ef0ccebe19d0ce2c20bed25caee8e4cc3da64caefd3af6
3
+ size 11525
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7983c8343af0c152854a5e7705a951c19f4b0f8e8a8a6387b6abbbca8d3d9e33
3
  size 281715176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62bc6d540d51bb852727ad007bcbb5e6570a057dd49e748dc87a85855976b878
3
  size 281715176
train_job_o ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ slurm submission log: 2024-05-09 22:15:13.373799
2
+ created following sbatch script:
3
+
4
+ ###############################
5
+
6
+ #!/bin/bash
7
+
8
+ #SBATCH --account=nlp
9
+ #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:7593080
11
+ #SBATCH --gres=gpu:2
12
+ #SBATCH --job-name=tthrush-job-1909730
13
+ #SBATCH --mem=400G
14
+ #SBATCH --nodelist=sphinx2
15
+ #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/train_job_o
17
+ #SBATCH --partition=sphinx
18
+ #SBATCH --time=14-0
19
+
20
+ # activate your desired anaconda environment
21
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
22
+
23
+ # cd to working directory
24
+ cd .
25
+
26
+ # launch commands
27
+ srun --unbuffered run_as_child_processes 'utput.txt' 'torchrun --master_port 29505 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy --o utput_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
28
+
29
+ ###############################
30
+
31
+ submission to slurm complete!
32
+
33
+
34
+ ###############################
35
+ slurm submission output
36
+
37
+ Submitted batch job 7593081
38
+
39
+
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-09 23:03:16.360972
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7593158
53
+ #SBATCH --gres=gpu:2
54
+ #SBATCH --job-name=tthrush-job-1041728
55
+ #SBATCH --mem=400G
56
+ #SBATCH --nodelist=sphinx2
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/train_job_o
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'utput.txt' 'torchrun --master_port 29505 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy --o utput_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7593159
80
+
81
+
82
+
83
+ ###############################
84
+
train_job_output.txt CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a3e2a60cd14b3dbbaf3d866569ed94ac9b6a79055688c2eb14eeb2de944679d
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa76f20c5f442a876f1203253c41b87df813ddfe4e21883c4f29a95c52428c94
3
  size 5112