Training in progress, epoch 1
Browse files- config.json +30 -0
- eval_job_output.txt +126 -0
- logs/events.out.tfevents.1715496757.sphinx2 +3 -0
- model.safetensors +3 -0
- train_job_output.txt +180 -0
- training_args.bin +3 -0
config.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "EleutherAI/pythia-70m",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 0,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 512,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 2048,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"max_position_embeddings": 2048,
|
18 |
+
"model_type": "gpt_neox",
|
19 |
+
"num_attention_heads": 8,
|
20 |
+
"num_hidden_layers": 6,
|
21 |
+
"rope_scaling": null,
|
22 |
+
"rotary_emb_base": 10000,
|
23 |
+
"rotary_pct": 0.25,
|
24 |
+
"tie_word_embeddings": false,
|
25 |
+
"torch_dtype": "float32",
|
26 |
+
"transformers_version": "4.39.3",
|
27 |
+
"use_cache": true,
|
28 |
+
"use_parallel_residual": true,
|
29 |
+
"vocab_size": 50304
|
30 |
+
}
|
eval_job_output.txt
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
slurm submission log: 2024-05-11 22:52:02.402564
|
2 |
+
created following sbatch script:
|
3 |
+
|
4 |
+
###############################
|
5 |
+
|
6 |
+
#!/bin/bash
|
7 |
+
|
8 |
+
#SBATCH --account=nlp
|
9 |
+
#SBATCH --cpus-per-task=16
|
10 |
+
#SBATCH --dependency=afterok:
|
11 |
+
#SBATCH --gres=gpu:1
|
12 |
+
#SBATCH --job-name=tthrush-job-4888498
|
13 |
+
#SBATCH --mem=60G
|
14 |
+
#SBATCH --nodelist=sphinx2
|
15 |
+
#SBATCH --open-mode=append
|
16 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/eval_job_output.txt
|
17 |
+
#SBATCH --partition=sphinx
|
18 |
+
#SBATCH --time=14-0
|
19 |
+
|
20 |
+
# activate your desired anaconda environment
|
21 |
+
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
22 |
+
|
23 |
+
# cd to working directory
|
24 |
+
cd .
|
25 |
+
|
26 |
+
# launch commands
|
27 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/perf'
|
28 |
+
|
29 |
+
###############################
|
30 |
+
|
31 |
+
submission to slurm complete!
|
32 |
+
|
33 |
+
|
34 |
+
###############################
|
35 |
+
slurm submission output
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
sbatch: error: Batch job submission failed: Job dependency problem
|
40 |
+
|
41 |
+
###############################
|
42 |
+
|
43 |
+
slurm submission log: 2024-05-11 22:53:20.065335
|
44 |
+
created following sbatch script:
|
45 |
+
|
46 |
+
###############################
|
47 |
+
|
48 |
+
#!/bin/bash
|
49 |
+
|
50 |
+
#SBATCH --account=nlp
|
51 |
+
#SBATCH --cpus-per-task=16
|
52 |
+
#SBATCH --dependency=afterok:7599822
|
53 |
+
#SBATCH --gres=gpu:1
|
54 |
+
#SBATCH --job-name=tthrush-job-2562954
|
55 |
+
#SBATCH --mem=60G
|
56 |
+
#SBATCH --nodelist=sphinx2
|
57 |
+
#SBATCH --open-mode=append
|
58 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/eval_job_output.txt
|
59 |
+
#SBATCH --partition=sphinx
|
60 |
+
#SBATCH --time=14-0
|
61 |
+
|
62 |
+
# activate your desired anaconda environment
|
63 |
+
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
64 |
+
|
65 |
+
# cd to working directory
|
66 |
+
cd .
|
67 |
+
|
68 |
+
# launch commands
|
69 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/perf'
|
70 |
+
|
71 |
+
###############################
|
72 |
+
|
73 |
+
submission to slurm complete!
|
74 |
+
|
75 |
+
|
76 |
+
###############################
|
77 |
+
slurm submission output
|
78 |
+
|
79 |
+
Submitted batch job 7599823
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
###############################
|
84 |
+
|
85 |
+
slurm submission log: 2024-05-11 23:09:48.287855
|
86 |
+
created following sbatch script:
|
87 |
+
|
88 |
+
###############################
|
89 |
+
|
90 |
+
#!/bin/bash
|
91 |
+
|
92 |
+
#SBATCH --account=nlp
|
93 |
+
#SBATCH --cpus-per-task=16
|
94 |
+
#SBATCH --dependency=afterok:7599868
|
95 |
+
#SBATCH --gres=gpu:1
|
96 |
+
#SBATCH --job-name=tthrush-job-4073620
|
97 |
+
#SBATCH --mem=60G
|
98 |
+
#SBATCH --nodelist=sphinx2
|
99 |
+
#SBATCH --open-mode=append
|
100 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/eval_job_output.txt
|
101 |
+
#SBATCH --partition=sphinx
|
102 |
+
#SBATCH --time=14-0
|
103 |
+
|
104 |
+
# activate your desired anaconda environment
|
105 |
+
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
106 |
+
|
107 |
+
# cd to working directory
|
108 |
+
cd .
|
109 |
+
|
110 |
+
# launch commands
|
111 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/perf'
|
112 |
+
|
113 |
+
###############################
|
114 |
+
|
115 |
+
submission to slurm complete!
|
116 |
+
|
117 |
+
|
118 |
+
###############################
|
119 |
+
slurm submission output
|
120 |
+
|
121 |
+
Submitted batch job 7599869
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
###############################
|
126 |
+
|
logs/events.out.tfevents.1715496757.sphinx2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7b95287207e6c57dad1065425823fc192b62fbafd42362af2be6f885df08a25
|
3 |
+
size 10957
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c793f8be353dd40918cbae1d61766c40cc450e1b728e05f81524d115b6117add
|
3 |
+
size 281715176
|
train_job_output.txt
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
0 |
0%| | 0/10682 [00:00<?, ?it/s][rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
|
|
|
|
1 |
0%| | 1/10682 [00:05<15:57:49, 5.38s/it]
|
2 |
0%| | 2/10682 [00:07<10:01:48, 3.38s/it]
|
3 |
0%| | 3/10682 [00:08<7:22:52, 2.49s/it]
|
4 |
0%| | 4/10682 [00:10<5:58:45, 2.02s/it]
|
5 |
0%| | 5/10682 [00:11<5:00:13, 1.69s/it]
|
6 |
0%| | 6/10682 [00:12<4:15:17, 1.43s/it]
|
7 |
0%| | 7/10682 [00:13<3:46:05, 1.27s/it]
|
8 |
0%| | 8/10682 [00:13<3:23:48, 1.15s/it]
|
9 |
0%| | 9/10682 [00:14<3:05:00, 1.04s/it]
|
10 |
0%| | 10/10682 [00:15<2:51:07, 1.04it/s]
|
11 |
0%| | 11/10682 [00:16<2:37:51, 1.13it/s]
|
12 |
0%| | 12/10682 [00:16<2:25:52, 1.22it/s]
|
13 |
0%| | 13/10682 [00:17<2:17:06, 1.30it/s]
|
14 |
0%| | 14/10682 [00:18<2:11:06, 1.36it/s]
|
15 |
0%| | 15/10682 [00:18<2:05:30, 1.42it/s]
|
16 |
0%| | 16/10682 [00:19<1:59:37, 1.49it/s]
|
17 |
0%| | 17/10682 [00:20<1:56:34, 1.52it/s]
|
18 |
0%| | 18/10682 [00:20<1:52:32, 1.58it/s]
|
19 |
0%| | 19/10682 [00:21<1:49:53, 1.62it/s]
|
20 |
0%| | 20/10682 [00:21<1:48:21, 1.64it/s]
|
21 |
0%| | 21/10682 [00:22<1:46:12, 1.67it/s]
|
22 |
0%| | 22/10682 [00:22<1:44:20, 1.70it/s]
|
23 |
0%| | 23/10682 [00:23<1:41:52, 1.74it/s]
|
24 |
0%| | 24/10682 [00:24<1:40:45, 1.76it/s]
|
25 |
0%| | 25/10682 [00:24<1:40:14, 1.77it/s]
|
26 |
|
|
|
27 |
0%| | 25/10682 [00:24<1:40:14, 1.77it/s]
|
28 |
0%| | 26/10682 [00:25<1:39:10, 1.79it/s]
|
29 |
0%| | 27/10682 [00:25<1:37:34, 1.82it/s]
|
30 |
0%| | 28/10682 [00:26<1:36:48, 1.83it/s]
|
31 |
0%| | 29/10682 [00:26<1:36:00, 1.85it/s]
|
32 |
0%| | 30/10682 [00:27<1:35:12, 1.86it/s]
|
33 |
0%| | 31/10682 [00:27<1:34:57, 1.87it/s]
|
34 |
0%| | 32/10682 [00:28<1:34:15, 1.88it/s]
|
35 |
0%| | 33/10682 [00:28<1:34:03, 1.89it/s]
|
36 |
0%| | 34/10682 [00:29<1:33:35, 1.90it/s]
|
37 |
0%| | 35/10682 [00:29<1:32:53, 1.91it/s]
|
38 |
0%| | 36/10682 [00:30<1:32:07, 1.93it/s]
|
39 |
0%| | 37/10682 [00:30<1:31:43, 1.93it/s]
|
40 |
0%| | 38/10682 [00:31<1:31:06, 1.95it/s]
|
41 |
0%| | 39/10682 [00:31<1:30:51, 1.95it/s]
|
42 |
0%| | 40/10682 [00:32<1:30:16, 1.96it/s]
|
43 |
0%| | 41/10682 [00:32<1:30:12, 1.97it/s]
|
44 |
0%| | 42/10682 [00:33<1:31:55, 1.93it/s]
|
45 |
0%| | 43/10682 [00:34<1:31:57, 1.93it/s]
|
46 |
0%| | 44/10682 [00:34<1:32:51, 1.91it/s]
|
47 |
0%| | 45/10682 [00:35<1:34:55, 1.87it/s]
|
48 |
0%| | 46/10682 [00:35<1:34:07, 1.88it/s]
|
49 |
0%| | 47/10682 [00:36<1:34:23, 1.88it/s]
|
50 |
0%| | 48/10682 [00:36<1:34:34, 1.87it/s]
|
51 |
0%| | 49/10682 [00:37<1:34:01, 1.88it/s]
|
52 |
0%| | 50/10682 [00:37<1:33:22, 1.90it/s]
|
53 |
|
|
|
54 |
0%| | 50/10682 [00:37<1:33:22, 1.90it/s]
|
55 |
0%| | 51/10682 [00:38<1:32:21, 1.92it/s]
|
56 |
0%| | 52/10682 [00:38<1:31:09, 1.94it/s]
|
57 |
0%| | 53/10682 [00:39<1:30:15, 1.96it/s]
|
58 |
1%| | 54/10682 [00:39<1:29:35, 1.98it/s]
|
59 |
1%| | 55/10682 [00:40<1:29:07, 1.99it/s]
|
60 |
1%| | 56/10682 [00:40<1:29:19, 1.98it/s]
|
61 |
1%| | 57/10682 [00:41<1:29:05, 1.99it/s]
|
62 |
1%| | 58/10682 [00:41<1:29:05, 1.99it/s]
|
63 |
1%| | 59/10682 [00:42<1:28:56, 1.99it/s]
|
64 |
1%| | 60/10682 [00:42<1:28:40, 2.00it/s]
|
65 |
1%| | 61/10682 [00:43<1:28:37, 2.00it/s]
|
66 |
1%| | 62/10682 [00:43<1:28:44, 1.99it/s]
|
67 |
1%| | 63/10682 [00:44<1:28:26, 2.00it/s]
|
68 |
1%| | 64/10682 [00:44<1:28:10, 2.01it/s]
|
69 |
1%| | 65/10682 [00:45<1:27:48, 2.02it/s]
|
70 |
1%| | 66/10682 [00:45<1:27:39, 2.02it/s]
|
71 |
1%| | 67/10682 [00:46<1:27:42, 2.02it/s]
|
72 |
1%| | 68/10682 [00:46<1:27:37, 2.02it/s]
|
73 |
1%| | 69/10682 [00:47<1:27:36, 2.02it/s]
|
74 |
1%| | 70/10682 [00:47<1:27:33, 2.02it/s]
|
75 |
1%| | 71/10682 [00:48<1:27:35, 2.02it/s]
|
76 |
1%| | 72/10682 [00:48<1:27:30, 2.02it/s]
|
77 |
1%| | 73/10682 [00:49<1:27:31, 2.02it/s]
|
78 |
1%| | 74/10682 [00:49<1:27:25, 2.02it/s]
|
79 |
1%| | 75/10682 [00:50<1:27:17, 2.03it/s]{'loss': 9.2238, 'grad_norm': 1.1420856714248657, 'learning_rate': 7.015902712815715e-05, 'epoch': 0.1}
|
|
|
80 |
|
81 |
1%| | 75/10682 [00:50<1:27:17, 2.03it/s]
|
82 |
1%| | 76/10682 [00:50<1:27:25, 2.02it/s]
|
83 |
1%| | 77/10682 [00:51<1:27:27, 2.02it/s]
|
84 |
1%| | 78/10682 [00:51<1:27:22, 2.02it/s]
|
85 |
1%| | 79/10682 [00:52<1:27:13, 2.03it/s]
|
86 |
1%| | 80/10682 [00:52<1:27:13, 2.03it/s]
|
87 |
1%| | 81/10682 [00:53<1:27:19, 2.02it/s]
|
88 |
1%| | 82/10682 [00:53<1:27:19, 2.02it/s]
|
89 |
1%| | 83/10682 [00:54<1:27:39, 2.02it/s]
|
90 |
1%| | 84/10682 [00:54<1:27:38, 2.02it/s]
|
91 |
1%| | 85/10682 [00:55<1:27:39, 2.01it/s]
|
92 |
1%| | 86/10682 [00:55<1:27:50, 2.01it/s]
|
93 |
1%| | 87/10682 [00:56<1:27:38, 2.02it/s]
|
94 |
1%| | 88/10682 [00:56<1:27:22, 2.02it/s]
|
95 |
1%| | 89/10682 [00:57<1:27:23, 2.02it/s]
|
96 |
1%| | 90/10682 [00:57<1:27:17, 2.02it/s]
|
97 |
1%| | 91/10682 [00:58<1:27:10, 2.02it/s]
|
98 |
1%| | 92/10682 [00:58<1:27:09, 2.03it/s]
|
99 |
1%| | 93/10682 [00:59<1:27:03, 2.03it/s]
|
100 |
1%| | 94/10682 [00:59<1:26:59, 2.03it/s]
|
101 |
1%| | 95/10682 [01:00<1:26:56, 2.03it/s]
|
102 |
1%| | 96/10682 [01:00<1:26:53, 2.03it/s]
|
103 |
1%| | 97/10682 [01:01<1:26:58, 2.03it/s]
|
104 |
1%| | 98/10682 [01:01<1:26:53, 2.03it/s]
|
105 |
1%| | 99/10682 [01:02<1:26:56, 2.03it/s]
|
106 |
1%| | 100/10682 [01:02<1:26:55, 2.03it/s]{'loss': 8.428, 'grad_norm': 0.7997293472290039, 'learning_rate': 9.354536950420954e-05, 'epoch': 0.13}
|
|
|
107 |
|
108 |
1%| | 100/10682 [01:02<1:26:55, 2.03it/s]
|
109 |
1%| | 101/10682 [01:03<1:27:08, 2.02it/s]
|
110 |
1%| | 102/10682 [01:03<1:27:00, 2.03it/s]
|
111 |
1%| | 103/10682 [01:04<1:26:59, 2.03it/s]
|
112 |
1%| | 104/10682 [01:04<1:26:57, 2.03it/s]
|
113 |
1%| | 105/10682 [01:04<1:26:51, 2.03it/s]
|
114 |
1%| | 106/10682 [01:05<1:26:49, 2.03it/s]
|
115 |
1%| | 107/10682 [01:05<1:26:50, 2.03it/s]
|
116 |
1%| | 108/10682 [01:06<1:26:51, 2.03it/s]
|
117 |
1%| | 109/10682 [01:06<1:26:47, 2.03it/s]
|
118 |
1%| | 110/10682 [01:07<1:26:40, 2.03it/s]
|
119 |
1%| | 111/10682 [01:07<1:26:43, 2.03it/s]
|
120 |
1%| | 112/10682 [01:08<1:26:42, 2.03it/s]
|
121 |
1%| | 113/10682 [01:08<1:26:45, 2.03it/s]
|
122 |
1%| | 114/10682 [01:09<1:26:41, 2.03it/s]
|
123 |
1%| | 115/10682 [01:09<1:26:39, 2.03it/s]
|
124 |
1%| | 116/10682 [01:10<1:26:40, 2.03it/s]
|
125 |
1%| | 117/10682 [01:10<1:26:41, 2.03it/s]
|
126 |
1%| | 118/10682 [01:11<1:26:46, 2.03it/s]
|
127 |
1%| | 119/10682 [01:11<1:26:41, 2.03it/s]
|
128 |
1%| | 120/10682 [01:12<1:26:34, 2.03it/s]
|
129 |
1%| | 121/10682 [01:12<1:26:37, 2.03it/s]
|
130 |
1%| | 122/10682 [01:13<1:26:44, 2.03it/s]
|
131 |
1%| | 123/10682 [01:13<1:26:45, 2.03it/s]
|
132 |
1%| | 124/10682 [01:14<1:26:44, 2.03it/s]
|
133 |
1%| | 125/10682 [01:14<1:26:43, 2.03it/s]
|
134 |
|
|
|
135 |
1%| | 125/10682 [01:14<1:26:43, 2.03it/s]
|
136 |
1%| | 126/10682 [01:15<1:26:50, 2.03it/s]
|
137 |
1%| | 127/10682 [01:15<1:26:45, 2.03it/s]
|
138 |
1%| | 128/10682 [01:16<1:26:35, 2.03it/s]
|
139 |
1%| | 129/10682 [01:16<1:26:35, 2.03it/s]
|
140 |
1%| | 130/10682 [01:17<1:26:35, 2.03it/s]
|
141 |
1%| | 131/10682 [01:17<1:26:31, 2.03it/s]
|
142 |
1%| | 132/10682 [01:18<1:26:32, 2.03it/s]
|
143 |
1%| | 133/10682 [01:18<1:26:33, 2.03it/s]
|
144 |
1%|β | 134/10682 [01:19<1:26:27, 2.03it/s]
|
145 |
1%|β | 135/10682 [01:19<1:26:29, 2.03it/s]
|
146 |
1%|β | 136/10682 [01:20<1:26:31, 2.03it/s]
|
147 |
1%|β | 137/10682 [01:20<1:26:31, 2.03it/s]
|
148 |
1%|β | 138/10682 [01:21<1:26:30, 2.03it/s]
|
149 |
1%|β | 139/10682 [01:21<1:26:30, 2.03it/s]
|
150 |
1%|β | 140/10682 [01:22<1:26:29, 2.03it/s]
|
151 |
1%|β | 141/10682 [01:22<1:26:35, 2.03it/s]
|
152 |
1%|β | 142/10682 [01:23<1:26:28, 2.03it/s]
|
153 |
1%|β | 143/10682 [01:23<1:26:27, 2.03it/s]
|
154 |
1%|β | 144/10682 [01:24<1:26:27, 2.03it/s]
|
155 |
1%|β | 145/10682 [01:24<1:26:23, 2.03it/s]
|
156 |
1%|β | 146/10682 [01:25<1:26:32, 2.03it/s]
|
157 |
1%|β | 147/10682 [01:25<1:26:25, 2.03it/s]
|
158 |
1%|β | 148/10682 [01:26<1:26:20, 2.03it/s]
|
159 |
1%|β | 149/10682 [01:26<1:26:23, 2.03it/s]
|
160 |
1%|β | 150/10682 [01:27<1:26:20, 2.03it/s]
|
161 |
{'loss': 7.289, 'grad_norm': 0.367520809173584, 'learning_rate': 0.0001403180542563143, 'epoch': 0.2}
|
|
|
162 |
1%|β | 150/10682 [01:27<1:26:20, 2.03it/s]
|
163 |
1%|β | 151/10682 [01:27<1:26:26, 2.03it/s]
|
164 |
1%|β | 152/10682 [01:28<1:26:25, 2.03it/s]
|
165 |
1%|β | 153/10682 [01:28<1:26:23, 2.03it/s]
|
166 |
1%|β | 154/10682 [01:29<1:26:26, 2.03it/s]
|
167 |
1%|β | 155/10682 [01:29<1:26:23, 2.03it/s]
|
168 |
1%|β | 156/10682 [01:30<1:26:20, 2.03it/s]
|
169 |
1%|β | 157/10682 [01:30<1:26:22, 2.03it/s]
|
170 |
1%|β | 158/10682 [01:31<1:26:23, 2.03it/s]
|
171 |
1%|β | 159/10682 [01:31<1:26:25, 2.03it/s]
|
172 |
1%|β | 160/10682 [01:32<1:26:15, 2.03it/s]
|
173 |
2%|β | 161/10682 [01:32<1:26:14, 2.03it/s]
|
174 |
2%|β | 162/10682 [01:33<1:26:18, 2.03it/s]
|
175 |
2%|β | 163/10682 [01:33<1:26:12, 2.03it/s]
|
176 |
2%|β | 164/10682 [01:34<1:26:10, 2.03it/s]
|
177 |
2%|β | 165/10682 [01:34<1:26:10, 2.03it/s]
|
178 |
2%|β | 166/10682 [01:35<1:26:08, 2.03it/s]
|
179 |
2%|β | 167/10682 [01:35<1:26:13, 2.03it/s]
|
180 |
2%|β | 168/10682 [01:36<1:26:12, 2.03it/s]
|
181 |
2%|β | 169/10682 [01:36<1:26:15, 2.03it/s]
|
182 |
2%|β | 170/10682 [01:36<1:26:14, 2.03it/s]
|
183 |
2%|β | 171/10682 [01:37<1:26:08, 2.03it/s]
|
184 |
2%|β | 172/10682 [01:37<1:26:18, 2.03it/s]
|
185 |
2%|β | 173/10682 [01:38<1:26:12, 2.03it/s]
|
186 |
2%|β | 174/10682 [01:38<1:26:13, 2.03it/s]
|
187 |
2%|β | 175/10682 [01:39<1:26:11, 2.03it/s]{'loss': 6.8807, 'grad_norm': 0.33734890818595886, 'learning_rate': 0.00016370439663236668, 'epoch': 0.23}
|
188 |
|
|
|
189 |
2%|β | 175/10682 [01:39<1:26:11, 2.03it/s]
|
190 |
2%|β | 176/10682 [01:39<1:26:12, 2.03it/s]
|
191 |
2%|β | 177/10682 [01:40<1:26:12, 2.03it/s]
|
192 |
2%|β | 178/10682 [01:40<1:26:09, 2.03it/s]
|
193 |
2%|β | 179/10682 [01:41<1:26:12, 2.03it/s]
|
194 |
2%|β | 180/10682 [01:41<1:26:13, 2.03it/s]
|
195 |
2%|β | 181/10682 [01:42<1:26:08, 2.03it/s]
|
196 |
2%|β | 182/10682 [01:42<1:26:14, 2.03it/s]
|
197 |
2%|β | 183/10682 [01:43<1:26:10, 2.03it/s]
|
198 |
2%|β | 184/10682 [01:43<1:26:09, 2.03it/s]
|
199 |
2%|β | 185/10682 [01:44<1:26:08, 2.03it/s]
|
200 |
2%|β | 186/10682 [01:44<1:26:02, 2.03it/s]
|
201 |
2%|β | 187/10682 [01:45<1:26:04, 2.03it/s]
|
202 |
2%|β | 188/10682 [01:45<1:26:03, 2.03it/s]
|
203 |
2%|β | 189/10682 [01:46<1:25:58, 2.03it/s]
|
204 |
2%|β | 190/10682 [01:46<1:25:58, 2.03it/s]
|
205 |
2%|β | 191/10682 [01:47<1:25:58, 2.03it/s]
|
206 |
2%|β | 192/10682 [01:47<1:26:01, 2.03it/s]
|
207 |
2%|β | 193/10682 [01:48<1:26:01, 2.03it/s]
|
208 |
2%|β | 194/10682 [01:48<1:26:01, 2.03it/s]
|
209 |
2%|β | 195/10682 [01:49<1:25:58, 2.03it/s]
|
210 |
2%|β | 196/10682 [01:49<1:26:02, 2.03it/s]
|
211 |
2%|β | 197/10682 [01:50<1:26:00, 2.03it/s]
|
212 |
2%|β | 198/10682 [01:50<1:26:04, 2.03it/s]
|
213 |
2%|β | 199/10682 [01:51<1:26:00, 2.03it/s]
|
214 |
2%|β | 200/10682 [01:51<1:25:57, 2.03it/s]{'loss': 6.5556, 'grad_norm': 0.46203550696372986, 'learning_rate': 0.00018709073900841907, 'epoch': 0.26}
|
|
|
215 |
|
216 |
2%|β | 200/10682 [01:51<1:25:57, 2.03it/s]
|
217 |
2%|β | 201/10682 [01:52<1:26:04, 2.03it/s]
|
218 |
2%|β | 202/10682 [01:52<1:26:00, 2.03it/s]
|
219 |
2%|β | 203/10682 [01:53<1:26:03, 2.03it/s]
|
220 |
2%|β | 204/10682 [01:53<1:26:00, 2.03it/s]
|
221 |
2%|β | 205/10682 [01:54<1:25:56, 2.03it/s]
|
222 |
2%|β | 206/10682 [01:54<1:25:57, 2.03it/s]
|
223 |
2%|β | 207/10682 [01:55<1:25:52, 2.03it/s]
|
224 |
2%|β | 208/10682 [01:55<1:25:51, 2.03it/s]
|
225 |
2%|β | 209/10682 [01:56<1:25:54, 2.03it/s]
|
226 |
2%|β | 210/10682 [01:56<1:25:49, 2.03it/s]
|
227 |
2%|β | 211/10682 [01:57<1:25:45, 2.04it/s]
|
228 |
2%|β | 212/10682 [01:57<1:25:49, 2.03it/s]
|
229 |
2%|β | 213/10682 [01:58<1:25:44, 2.04it/s]
|
230 |
2%|β | 214/10682 [01:58<1:25:40, 2.04it/s]
|
231 |
2%|β | 215/10682 [01:59<1:25:44, 2.03it/s]
|
232 |
2%|β | 216/10682 [01:59<1:25:41, 2.04it/s]
|
233 |
2%|β | 217/10682 [02:00<1:25:42, 2.03it/s]
|
234 |
2%|β | 218/10682 [02:00<1:25:45, 2.03it/s]
|
235 |
2%|β | 219/10682 [02:01<1:25:42, 2.03it/s]
|
236 |
2%|β | 220/10682 [02:01<1:25:40, 2.04it/s]
|
237 |
2%|β | 221/10682 [02:02<1:25:44, 2.03it/s]
|
238 |
2%|β | 222/10682 [02:02<1:25:42, 2.03it/s]
|
239 |
2%|β | 223/10682 [02:03<1:25:39, 2.04it/s]
|
240 |
2%|β | 224/10682 [02:03<1:25:41, 2.03it/s]
|
241 |
2%|β | 225/10682 [02:04<1:25:38, 2.04it/s]{'loss': 6.2908, 'grad_norm': 0.7612385153770447, 'learning_rate': 0.00021047708138447147, 'epoch': 0.29}
|
|
|
242 |
|
243 |
2%|β | 225/10682 [02:04<1:25:38, 2.04it/s]
|
244 |
2%|β | 226/10682 [02:04<1:25:45, 2.03it/s]
|
245 |
2%|β | 227/10682 [02:05<1:25:46, 2.03it/s]
|
246 |
2%|β | 228/10682 [02:05<1:25:42, 2.03it/s]
|
247 |
2%|β | 229/10682 [02:06<1:25:41, 2.03it/s]
|
248 |
2%|β | 230/10682 [02:06<1:25:43, 2.03it/s]
|
249 |
2%|β | 231/10682 [02:07<1:25:37, 2.03it/s]
|
250 |
2%|β | 232/10682 [02:07<1:25:41, 2.03it/s]
|
251 |
2%|β | 233/10682 [02:07<1:25:39, 2.03it/s]
|
252 |
2%|β | 234/10682 [02:08<1:25:33, 2.04it/s]
|
253 |
2%|β | 235/10682 [02:08<1:25:38, 2.03it/s]
|
254 |
2%|β | 236/10682 [02:09<1:25:37, 2.03it/s]
|
255 |
2%|β | 237/10682 [02:09<1:25:35, 2.03it/s]
|
256 |
2%|β | 238/10682 [02:10<1:25:40, 2.03it/s]
|
257 |
2%|β | 239/10682 [02:10<1:25:36, 2.03it/s]
|
258 |
2%|β | 240/10682 [02:11<1:25:38, 2.03it/s]
|
259 |
2%|β | 241/10682 [02:11<1:25:39, 2.03it/s]
|
260 |
2%|β | 242/10682 [02:12<1:25:38, 2.03it/s]
|
261 |
2%|β | 243/10682 [02:12<1:25:43, 2.03it/s]
|
262 |
2%|β | 244/10682 [02:13<1:25:35, 2.03it/s]
|
263 |
2%|β | 245/10682 [02:13<1:25:33, 2.03it/s]
|
264 |
2%|β | 246/10682 [02:14<1:25:38, 2.03it/s]
|
265 |
2%|β | 247/10682 [02:14<1:25:35, 2.03it/s]
|
266 |
2%|β | 248/10682 [02:15<1:25:37, 2.03it/s]
|
267 |
2%|β | 249/10682 [02:15<1:25:34, 2.03it/s]
|
268 |
2%|β | 250/10682 [02:16<1:25:30, 2.03it/s]{'loss': 6.0883, 'grad_norm': 0.3854532241821289, 'learning_rate': 0.00023386342376052386, 'epoch': 0.33}
|
269 |
|
|
|
270 |
2%|β | 250/10682 [02:16<1:25:30, 2.03it/s]
|
271 |
2%|β | 251/10682 [02:16<1:25:42, 2.03it/s]
|
272 |
2%|β | 252/10682 [02:17<1:25:35, 2.03it/s]
|
273 |
2%|β | 253/10682 [02:17<1:25:34, 2.03it/s]
|
274 |
2%|β | 254/10682 [02:18<1:25:33, 2.03it/s]
|
275 |
2%|β | 255/10682 [02:18<1:25:34, 2.03it/s]
|
276 |
2%|β | 256/10682 [02:19<1:25:33, 2.03it/s]
|
277 |
2%|β | 257/10682 [02:19<1:25:31, 2.03it/s]
|
278 |
2%|β | 258/10682 [02:20<1:25:29, 2.03it/s]
|
279 |
2%|β | 259/10682 [02:20<1:25:25, 2.03it/s]
|
280 |
2%|β | 260/10682 [02:21<1:25:28, 2.03it/s]
|
281 |
2%|β | 261/10682 [02:21<1:25:25, 2.03it/s]
|
282 |
2%|β | 262/10682 [02:22<1:25:23, 2.03it/s]
|
283 |
2%|β | 263/10682 [02:22<1:25:27, 2.03it/s]
|
284 |
2%|β | 264/10682 [02:23<1:25:22, 2.03it/s]
|
285 |
2%|β | 265/10682 [02:23<1:25:19, 2.03it/s]
|
286 |
2%|β | 266/10682 [02:24<1:25:23, 2.03it/s]
|
287 |
2%|β | 267/10682 [02:24<1:25:23, 2.03it/s]
|
288 |
3%|β | 268/10682 [02:25<1:25:23, 2.03it/s]
|
289 |
3%|β | 269/10682 [02:25<1:25:27, 2.03it/s]
|
290 |
3%|β | 270/10682 [02:26<1:25:27, 2.03it/s]
|
291 |
3%|β | 271/10682 [02:26<1:25:24, 2.03it/s]
|
292 |
3%|β | 272/10682 [02:27<1:25:22, 2.03it/s]
|
293 |
3%|β | 273/10682 [02:27<1:25:20, 2.03it/s]
|
294 |
3%|β | 274/10682 [02:28<1:25:21, 2.03it/s]
|
295 |
3%|β | 275/10682 [02:28<1:25:21, 2.03it/s]
|
296 |
{'loss': 5.9181, 'grad_norm': 0.7595835328102112, 'learning_rate': 0.00025724976613657625, 'epoch': 0.36}
|
|
|
297 |
3%|β | 275/10682 [02:28<1:25:21, 2.03it/s]
|
298 |
3%|β | 276/10682 [02:29<1:25:26, 2.03it/s]
|
299 |
3%|β | 277/10682 [02:29<1:25:26, 2.03it/s]
|
300 |
3%|β | 278/10682 [02:30<1:25:24, 2.03it/s]
|
301 |
3%|β | 279/10682 [02:30<1:25:23, 2.03it/s]
|
302 |
3%|β | 280/10682 [02:31<1:25:23, 2.03it/s]
|
303 |
3%|β | 281/10682 [02:31<1:25:21, 2.03it/s]
|
304 |
3%|β | 282/10682 [02:32<1:25:25, 2.03it/s]
|
305 |
3%|β | 283/10682 [02:32<1:25:22, 2.03it/s]
|
306 |
3%|β | 284/10682 [02:33<1:25:20, 2.03it/s]
|
307 |
3%|β | 285/10682 [02:33<1:25:21, 2.03it/s]
|
308 |
3%|β | 286/10682 [02:34<1:25:24, 2.03it/s]
|
309 |
3%|β | 287/10682 [02:34<1:25:27, 2.03it/s]
|
310 |
3%|β | 288/10682 [02:35<1:25:22, 2.03it/s]
|
311 |
3%|β | 289/10682 [02:35<1:25:16, 2.03it/s]
|
312 |
3%|β | 290/10682 [02:36<1:25:18, 2.03it/s]
|
313 |
3%|β | 291/10682 [02:36<1:25:15, 2.03it/s]
|
314 |
3%|β | 292/10682 [02:37<1:25:08, 2.03it/s]
|
315 |
3%|β | 293/10682 [02:37<1:25:12, 2.03it/s]
|
316 |
3%|β | 294/10682 [02:38<1:25:09, 2.03it/s]
|
317 |
3%|β | 295/10682 [02:38<1:25:07, 2.03it/s]
|
318 |
3%|β | 296/10682 [02:39<1:25:10, 2.03it/s]
|
319 |
3%|β | 297/10682 [02:39<1:25:02, 2.04it/s]
|
320 |
3%|β | 298/10682 [02:39<1:25:05, 2.03it/s]
|
321 |
3%|β | 299/10682 [02:40<1:25:08, 2.03it/s]
|
322 |
3%|β | 300/10682 [02:40<1:25:04, 2.03it/s]{'loss': 5.7819, 'grad_norm': 0.6112937927246094, 'learning_rate': 0.0002806361085126286, 'epoch': 0.39}
|
|
|
323 |
|
324 |
3%|β | 300/10682 [02:40<1:25:04, 2.03it/s]
|
325 |
3%|β | 301/10682 [02:41<1:25:24, 2.03it/s]
|
326 |
3%|β | 302/10682 [02:41<1:25:14, 2.03it/s]
|
327 |
3%|β | 303/10682 [02:42<1:25:13, 2.03it/s]
|
328 |
3%|β | 304/10682 [02:42<1:25:08, 2.03it/s]
|
329 |
3%|β | 305/10682 [02:43<1:25:08, 2.03it/s]
|
330 |
3%|β | 306/10682 [02:43<1:25:08, 2.03it/s]
|
331 |
3%|β | 307/10682 [02:44<1:25:02, 2.03it/s]
|
332 |
3%|β | 308/10682 [02:44<1:25:02, 2.03it/s]
|
333 |
3%|β | 309/10682 [02:45<1:25:04, 2.03it/s]
|
334 |
3%|β | 310/10682 [02:45<1:25:05, 2.03it/s]
|
335 |
3%|β | 311/10682 [02:46<1:25:07, 2.03it/s]
|
336 |
3%|β | 312/10682 [02:46<1:25:04, 2.03it/s]
|
337 |
3%|β | 313/10682 [02:47<1:25:04, 2.03it/s]
|
338 |
3%|β | 314/10682 [02:47<1:25:01, 2.03it/s]
|
339 |
3%|β | 315/10682 [02:48<1:24:58, 2.03it/s]
|
340 |
3%|β | 316/10682 [02:48<1:24:57, 2.03it/s]
|
341 |
3%|β | 317/10682 [02:49<1:24:59, 2.03it/s]
|
342 |
3%|β | 318/10682 [02:49<1:24:58, 2.03it/s]
|
343 |
3%|β | 319/10682 [02:50<1:25:00, 2.03it/s]
|
344 |
3%|β | 320/10682 [02:50<1:24:55, 2.03it/s]
|
345 |
3%|β | 321/10682 [02:51<1:25:00, 2.03it/s]
|
346 |
3%|β | 322/10682 [02:51<1:25:05, 2.03it/s]
|
347 |
3%|β | 323/10682 [02:52<1:24:59, 2.03it/s]
|
348 |
3%|β | 324/10682 [02:52<1:24:58, 2.03it/s]
|
349 |
3%|β | 325/10682 [02:53<1:24:55, 2.03it/s]
|
350 |
{'loss': 5.6576, 'grad_norm': 1.0010818243026733, 'learning_rate': 0.00030402245088868103, 'epoch': 0.43}
|
|
|
351 |
3%|β | 325/10682 [02:53<1:24:55, 2.03it/s]
|
352 |
3%|β | 326/10682 [02:53<1:25:01, 2.03it/s]
|
353 |
3%|β | 327/10682 [02:54<1:25:04, 2.03it/s]
|
354 |
3%|β | 328/10682 [02:54<1:25:01, 2.03it/s]
|
355 |
3%|β | 329/10682 [02:55<1:25:04, 2.03it/s]
|
356 |
3%|β | 330/10682 [02:55<1:25:00, 2.03it/s]
|
357 |
3%|β | 331/10682 [02:56<1:24:56, 2.03it/s]
|
358 |
3%|β | 332/10682 [02:56<1:24:56, 2.03it/s]
|
359 |
3%|β | 333/10682 [02:57<1:24:52, 2.03it/s]
|
360 |
3%|β | 334/10682 [02:57<1:24:51, 2.03it/s]
|
361 |
3%|β | 335/10682 [02:58<1:24:51, 2.03it/s]
|
362 |
3%|β | 336/10682 [02:58<1:24:48, 2.03it/s]
|
363 |
3%|β | 337/10682 [02:59<1:24:50, 2.03it/s]
|
364 |
3%|β | 338/10682 [02:59<1:24:49, 2.03it/s]
|
365 |
3%|β | 339/10682 [03:00<1:24:46, 2.03it/s]
|
366 |
3%|β | 340/10682 [03:00<1:24:53, 2.03it/s]
|
367 |
3%|β | 341/10682 [03:01<1:24:48, 2.03it/s]
|
368 |
3%|β | 342/10682 [03:01<1:24:45, 2.03it/s]
|
369 |
3%|β | 343/10682 [03:02<1:24:49, 2.03it/s]
|
370 |
3%|β | 344/10682 [03:02<1:24:46, 2.03it/s]
|
371 |
3%|β | 345/10682 [03:03<1:24:46, 2.03it/s]
|
372 |
3%|β | 346/10682 [03:03<1:24:44, 2.03it/s]
|
373 |
3%|β | 347/10682 [03:04<1:24:40, 2.03it/s]
|
374 |
3%|β | 348/10682 [03:04<1:24:45, 2.03it/s]
|
375 |
3%|β | 349/10682 [03:05<1:24:42, 2.03it/s]
|
376 |
3%|β | 350/10682 [03:05<1:24:40, 2.03it/s]{'loss': 5.5561, 'grad_norm': 0.5823507308959961, 'learning_rate': 0.00032740879326473337, 'epoch': 0.46}
|
377 |
|
|
|
378 |
3%|β | 350/10682 [03:05<1:24:40, 2.03it/s]
|
379 |
3%|β | 351/10682 [03:06<1:24:50, 2.03it/s]
|
380 |
3%|β | 352/10682 [03:06<1:24:41, 2.03it/s]
|
381 |
3%|β | 353/10682 [03:07<1:24:44, 2.03it/s]
|
382 |
3%|β | 354/10682 [03:07<1:24:42, 2.03it/s]
|
383 |
3%|β | 355/10682 [03:08<1:24:36, 2.03it/s]
|
384 |
3%|β | 356/10682 [03:08<1:24:42, 2.03it/s]
|
385 |
3%|β | 357/10682 [03:09<1:24:39, 2.03it/s]
|
386 |
3%|β | 358/10682 [03:09<1:24:36, 2.03it/s]
|
387 |
3%|β | 359/10682 [03:10<1:24:39, 2.03it/s]
|
388 |
3%|β | 360/10682 [03:10<1:24:39, 2.03it/s]
|
389 |
3%|β | 361/10682 [03:10<1:24:40, 2.03it/s]
|
390 |
3%|β | 362/10682 [03:11<1:24:39, 2.03it/s]
|
391 |
3%|β | 363/10682 [03:11<1:24:35, 2.03it/s]
|
392 |
3%|β | 364/10682 [03:12<1:24:36, 2.03it/s]
|
393 |
3%|β | 365/10682 [03:12<1:24:38, 2.03it/s]
|
394 |
3%|β | 366/10682 [03:13<1:24:36, 2.03it/s]
|
395 |
3%|β | 367/10682 [03:13<1:24:37, 2.03it/s]
|
396 |
3%|β | 368/10682 [03:14<1:24:38, 2.03it/s]
|
397 |
3%|β | 369/10682 [03:14<1:24:37, 2.03it/s]
|
398 |
3%|β | 370/10682 [03:15<1:24:39, 2.03it/s]
|
399 |
3%|β | 371/10682 [03:15<1:24:34, 2.03it/s]
|
400 |
3%|β | 372/10682 [03:16<1:24:37, 2.03it/s]
|
401 |
3%|β | 373/10682 [03:16<1:24:37, 2.03it/s]
|
402 |
4%|β | 374/10682 [03:17<1:24:36, 2.03it/s]
|
403 |
4%|β | 375/10682 [03:17<1:24:36, 2.03it/s]
|
404 |
|
|
|
405 |
4%|β | 375/10682 [03:17<1:24:36, 2.03it/s]
|
406 |
4%|β | 376/10682 [03:18<1:25:00, 2.02it/s]
|
407 |
4%|β | 377/10682 [03:18<1:24:53, 2.02it/s]
|
408 |
4%|β | 378/10682 [03:19<1:24:43, 2.03it/s]
|
409 |
4%|β | 379/10682 [03:19<1:24:39, 2.03it/s]
|
410 |
4%|β | 380/10682 [03:20<1:24:37, 2.03it/s]
|
411 |
4%|β | 381/10682 [03:20<1:24:30, 2.03it/s]
|
412 |
4%|β | 382/10682 [03:21<1:24:32, 2.03it/s]
|
413 |
4%|β | 383/10682 [03:21<1:24:28, 2.03it/s]
|
414 |
4%|β | 384/10682 [03:22<1:24:27, 2.03it/s]
|
415 |
4%|β | 385/10682 [03:22<1:24:28, 2.03it/s]
|
416 |
4%|β | 386/10682 [03:23<1:24:22, 2.03it/s]
|
417 |
4%|β | 387/10682 [03:23<1:24:24, 2.03it/s]
|
418 |
4%|β | 388/10682 [03:24<1:24:26, 2.03it/s]
|
419 |
4%|β | 389/10682 [03:24<1:24:25, 2.03it/s]
|
420 |
4%|β | 390/10682 [03:25<1:24:25, 2.03it/s]
|
421 |
4%|β | 391/10682 [03:25<1:24:25, 2.03it/s]
|
422 |
4%|β | 392/10682 [03:26<1:24:20, 2.03it/s]
|
423 |
4%|β | 393/10682 [03:26<1:24:23, 2.03it/s]
|
424 |
4%|β | 394/10682 [03:27<1:24:21, 2.03it/s]
|
425 |
4%|β | 395/10682 [03:27<1:24:21, 2.03it/s]
|
426 |
4%|β | 396/10682 [03:28<1:24:23, 2.03it/s]
|
427 |
4%|β | 397/10682 [03:28<1:24:28, 2.03it/s]
|
428 |
4%|β | 398/10682 [03:29<1:24:28, 2.03it/s]
|
429 |
4%|β | 399/10682 [03:29<1:24:28, 2.03it/s]
|
430 |
4%|β | 400/10682 [03:30<1:24:31, 2.03it/s]{'loss': 5.375, 'grad_norm': 0.6424997448921204, 'learning_rate': 0.00037418147801683815, 'epoch': 0.52}
|
|
|
431 |
|
432 |
4%|β | 400/10682 [03:30<1:24:31, 2.03it/s]
|
433 |
4%|β | 401/10682 [03:30<1:24:46, 2.02it/s]
|
434 |
4%|β | 402/10682 [03:31<1:24:39, 2.02it/s]
|
435 |
4%|β | 403/10682 [03:31<1:24:35, 2.03it/s]
|
436 |
4%|β | 404/10682 [03:32<1:24:31, 2.03it/s]
|
437 |
4%|β | 405/10682 [03:32<1:24:23, 2.03it/s]
|
438 |
4%|β | 406/10682 [03:33<1:24:22, 2.03it/s]
|
439 |
4%|β | 407/10682 [03:33<1:24:23, 2.03it/s]
|
440 |
4%|β | 408/10682 [03:34<1:24:18, 2.03it/s]
|
441 |
4%|β | 409/10682 [03:34<1:24:17, 2.03it/s]
|
442 |
4%|β | 410/10682 [03:35<1:24:17, 2.03it/s]
|
443 |
4%|β | 411/10682 [03:35<1:24:12, 2.03it/s]
|
444 |
4%|β | 412/10682 [03:36<1:24:08, 2.03it/s]
|
445 |
4%|β | 413/10682 [03:36<1:24:10, 2.03it/s]
|
446 |
4%|β | 414/10682 [03:37<1:24:06, 2.03it/s]
|
447 |
4%|β | 415/10682 [03:37<1:24:09, 2.03it/s]
|
448 |
4%|β | 416/10682 [03:38<1:24:10, 2.03it/s]
|
449 |
4%|β | 417/10682 [03:38<1:24:06, 2.03it/s]
|
450 |
4%|β | 418/10682 [03:39<1:24:06, 2.03it/s]
|
451 |
4%|β | 419/10682 [03:39<1:24:08, 2.03it/s]
|
452 |
4%|β | 420/10682 [03:40<1:24:08, 2.03it/s]
|
453 |
4%|β | 421/10682 [03:40<1:24:09, 2.03it/s]
|
454 |
4%|β | 422/10682 [03:41<1:24:08, 2.03it/s]
|
455 |
4%|β | 423/10682 [03:41<1:24:06, 2.03it/s]
|
456 |
4%|β | 424/10682 [03:42<1:24:06, 2.03it/s]
|
457 |
4%|β | 425/10682 [03:42<1:24:03, 2.03it/s]{'loss': 5.2944, 'grad_norm': 0.4700624942779541, 'learning_rate': 0.0003975678203928906, 'epoch': 0.56}
|
|
|
458 |
|
459 |
4%|β | 425/10682 [03:42<1:24:03, 2.03it/s]
|
460 |
4%|β | 426/10682 [03:43<1:24:12, 2.03it/s]
|
461 |
4%|β | 427/10682 [03:43<1:24:08, 2.03it/s]
|
462 |
4%|β | 428/10682 [03:43<1:24:06, 2.03it/s]
|
463 |
4%|β | 429/10682 [03:44<1:24:09, 2.03it/s]
|
464 |
4%|β | 430/10682 [03:44<1:24:07, 2.03it/s]
|
465 |
4%|β | 431/10682 [03:45<1:24:08, 2.03it/s]
|
466 |
4%|β | 432/10682 [03:45<1:24:04, 2.03it/s]
|
467 |
4%|β | 433/10682 [03:46<1:23:59, 2.03it/s]
|
468 |
4%|β | 434/10682 [03:46<1:24:01, 2.03it/s]
|
469 |
4%|β | 435/10682 [03:47<1:23:59, 2.03it/s]
|
470 |
4%|β | 436/10682 [03:47<1:24:02, 2.03it/s]
|
471 |
4%|β | 437/10682 [03:48<1:24:03, 2.03it/s]
|
472 |
4%|β | 438/10682 [03:48<1:23:58, 2.03it/s]
|
473 |
4%|β | 439/10682 [03:49<1:24:02, 2.03it/s]
|
474 |
4%|β | 440/10682 [03:49<1:23:57, 2.03it/s]
|
475 |
4%|β | 441/10682 [03:50<1:23:53, 2.03it/s]
|
476 |
4%|β | 442/10682 [03:50<1:23:57, 2.03it/s]
|
477 |
4%|β | 443/10682 [03:51<1:23:55, 2.03it/s]
|
478 |
4%|β | 444/10682 [03:51<1:23:51, 2.03it/s]
|
479 |
4%|β | 445/10682 [03:52<1:23:58, 2.03it/s]
|
480 |
4%|β | 446/10682 [03:52<1:23:52, 2.03it/s]
|
481 |
4%|β | 447/10682 [03:53<1:23:56, 2.03it/s]
|
482 |
4%|β | 448/10682 [03:53<1:23:55, 2.03it/s]
|
483 |
4%|β | 449/10682 [03:54<1:23:52, 2.03it/s]
|
484 |
4%|β | 450/10682 [03:54<1:23:56, 2.03it/s]
|
485 |
{'loss': 5.2223, 'grad_norm': 0.4889560043811798, 'learning_rate': 0.00042095416276894293, 'epoch': 0.59}
|
|
|
486 |
4%|β | 450/10682 [03:54<1:23:56, 2.03it/s]
|
487 |
4%|β | 451/10682 [03:55<1:23:56, 2.03it/s]
|
488 |
4%|β | 452/10682 [03:55<1:23:55, 2.03it/s]
|
489 |
4%|β | 453/10682 [03:56<1:23:53, 2.03it/s]
|
490 |
4%|β | 454/10682 [03:56<1:23:48, 2.03it/s]
|
491 |
4%|β | 455/10682 [03:57<1:23:58, 2.03it/s]
|
492 |
4%|β | 456/10682 [03:57<1:23:51, 2.03it/s]
|
493 |
4%|β | 457/10682 [03:58<1:23:51, 2.03it/s]
|
494 |
4%|β | 458/10682 [03:58<1:23:50, 2.03it/s]
|
495 |
4%|β | 459/10682 [03:59<1:23:47, 2.03it/s]
|
496 |
4%|β | 460/10682 [03:59<1:23:45, 2.03it/s]
|
497 |
4%|β | 461/10682 [04:00<1:23:48, 2.03it/s]
|
498 |
4%|β | 462/10682 [04:00<1:23:44, 2.03it/s]
|
499 |
4%|β | 463/10682 [04:01<1:23:48, 2.03it/s]
|
500 |
4%|β | 464/10682 [04:01<1:23:47, 2.03it/s]
|
501 |
4%|β | 465/10682 [04:02<1:23:48, 2.03it/s]
|
502 |
4%|β | 466/10682 [04:02<1:23:50, 2.03it/s]
|
503 |
4%|β | 467/10682 [04:03<1:23:53, 2.03it/s]
|
504 |
4%|β | 468/10682 [04:03<1:23:52, 2.03it/s]
|
505 |
4%|β | 469/10682 [04:04<1:23:51, 2.03it/s]
|
506 |
4%|β | 470/10682 [04:04<1:23:49, 2.03it/s]
|
507 |
4%|β | 471/10682 [04:05<1:23:48, 2.03it/s]
|
508 |
4%|β | 472/10682 [04:05<1:23:45, 2.03it/s]
|
509 |
4%|β | 473/10682 [04:06<1:23:50, 2.03it/s]
|
510 |
4%|β | 474/10682 [04:06<1:23:45, 2.03it/s]
|
511 |
4%|β | 475/10682 [04:07<1:23:40, 2.03it/s]{'loss': 5.1492, 'grad_norm': 0.5106998682022095, 'learning_rate': 0.0004443405051449954, 'epoch': 0.62}
|
|
|
512 |
|
513 |
4%|β | 475/10682 [04:07<1:23:40, 2.03it/s]
|
514 |
4%|β | 476/10682 [04:07<1:23:50, 2.03it/s]
|
515 |
4%|β | 477/10682 [04:08<1:23:45, 2.03it/s]
|
516 |
4%|β | 478/10682 [04:08<1:23:44, 2.03it/s]
|
517 |
4%|β | 479/10682 [04:09<1:23:43, 2.03it/s]
|
518 |
4%|β | 480/10682 [04:09<1:23:37, 2.03it/s]
|
519 |
5%|β | 481/10682 [04:10<1:23:37, 2.03it/s]
|
520 |
5%|β | 482/10682 [04:10<1:23:40, 2.03it/s]
|
521 |
5%|β | 483/10682 [04:11<1:23:35, 2.03it/s]
|
522 |
5%|β | 484/10682 [04:11<1:23:36, 2.03it/s]
|
523 |
5%|β | 485/10682 [04:12<1:23:39, 2.03it/s]
|
524 |
5%|β | 486/10682 [04:12<1:23:37, 2.03it/s]
|
525 |
5%|β | 487/10682 [04:13<1:23:40, 2.03it/s]
|
526 |
5%|β | 488/10682 [04:13<1:23:36, 2.03it/s]
|
527 |
5%|β | 489/10682 [04:14<1:23:36, 2.03it/s]
|
528 |
5%|β | 490/10682 [04:14<1:23:43, 2.03it/s]
|
529 |
5%|β | 491/10682 [04:14<1:23:37, 2.03it/s]
|
530 |
5%|β | 492/10682 [04:15<1:23:40, 2.03it/s]
|
531 |
5%|β | 493/10682 [04:15<1:23:37, 2.03it/s]
|
532 |
5%|β | 494/10682 [04:16<1:23:39, 2.03it/s]
|
533 |
5%|β | 495/10682 [04:16<1:23:40, 2.03it/s]
|
534 |
5%|β | 496/10682 [04:17<1:23:36, 2.03it/s]
|
535 |
5%|β | 497/10682 [04:17<1:23:34, 2.03it/s]
|
536 |
5%|β | 498/10682 [04:18<1:23:35, 2.03it/s]
|
537 |
5%|β | 499/10682 [04:18<1:23:34, 2.03it/s]
|
538 |
5%|β | 500/10682 [04:19<1:23:36, 2.03it/s]
|
539 |
{'loss': 5.0961, 'grad_norm': 0.5852717161178589, 'learning_rate': 0.0004677268475210477, 'epoch': 0.66}
|
|
|
540 |
5%|β | 500/10682 [04:19<1:23:36, 2.03it/s]
|
541 |
5%|β | 501/10682 [04:19<1:23:38, 2.03it/s]
|
542 |
5%|β | 502/10682 [04:20<1:23:35, 2.03it/s]
|
543 |
5%|β | 503/10682 [04:20<1:23:33, 2.03it/s]
|
544 |
5%|β | 504/10682 [04:21<1:23:27, 2.03it/s]
|
545 |
5%|β | 505/10682 [04:21<1:23:25, 2.03it/s]
|
546 |
5%|β | 506/10682 [04:22<1:23:30, 2.03it/s]
|
547 |
5%|β | 507/10682 [04:22<1:23:32, 2.03it/s]
|
548 |
5%|β | 508/10682 [04:23<1:23:34, 2.03it/s]
|
549 |
5%|β | 509/10682 [04:23<1:23:31, 2.03it/s]
|
550 |
5%|β | 510/10682 [04:24<1:23:28, 2.03it/s]
|
551 |
5%|β | 511/10682 [04:24<1:23:30, 2.03it/s]
|
552 |
5%|β | 512/10682 [04:25<1:23:30, 2.03it/s]
|
553 |
5%|β | 513/10682 [04:25<1:23:31, 2.03it/s]
|
554 |
5%|β | 514/10682 [04:26<1:23:28, 2.03it/s]
|
555 |
5%|β | 515/10682 [04:26<1:23:25, 2.03it/s]
|
556 |
5%|β | 516/10682 [04:27<1:23:24, 2.03it/s]
|
557 |
5%|β | 517/10682 [04:27<1:23:23, 2.03it/s]
|
558 |
5%|β | 518/10682 [04:28<1:23:19, 2.03it/s]
|
559 |
5%|β | 519/10682 [04:28<1:23:23, 2.03it/s]
|
560 |
5%|β | 520/10682 [04:29<1:23:21, 2.03it/s]
|
561 |
5%|β | 521/10682 [04:29<1:23:23, 2.03it/s]
|
562 |
5%|β | 522/10682 [04:30<1:23:26, 2.03it/s]
|
563 |
5%|β | 523/10682 [04:30<1:23:26, 2.03it/s]
|
564 |
5%|β | 524/10682 [04:31<1:23:26, 2.03it/s]
|
565 |
5%|β | 525/10682 [04:31<1:23:17, 2.03it/s]{'loss': 5.0379, 'grad_norm': 0.4721851348876953, 'learning_rate': 0.0004911131898971, 'epoch': 0.69}
|
|
|
566 |
|
567 |
5%|β | 525/10682 [04:31<1:23:17, 2.03it/s]
|
568 |
5%|β | 526/10682 [04:32<1:23:25, 2.03it/s]
|
569 |
5%|β | 527/10682 [04:32<1:23:22, 2.03it/s]
|
570 |
5%|β | 528/10682 [04:33<1:23:13, 2.03it/s]
|
571 |
5%|β | 529/10682 [04:33<1:23:15, 2.03it/s]
|
572 |
5%|β | 530/10682 [04:34<1:23:11, 2.03it/s]
|
573 |
5%|β | 531/10682 [04:34<1:23:05, 2.04it/s]
|
574 |
5%|β | 532/10682 [04:35<1:23:09, 2.03it/s]
|
575 |
5%|β | 533/10682 [04:35<1:23:10, 2.03it/s]
|
576 |
5%|β | 534/10682 [04:36<1:23:07, 2.03it/s]
|
577 |
5%|β | 535/10682 [04:36<1:23:09, 2.03it/s]
|
578 |
5%|β | 536/10682 [04:37<1:23:13, 2.03it/s]
|
579 |
5%|β | 537/10682 [04:37<1:23:13, 2.03it/s]
|
580 |
5%|β | 538/10682 [04:38<1:23:09, 2.03it/s]
|
581 |
5%|β | 539/10682 [04:38<1:23:12, 2.03it/s]
|
582 |
5%|β | 540/10682 [04:39<1:23:08, 2.03it/s]
|
583 |
5%|β | 541/10682 [04:39<1:23:06, 2.03it/s]
|
584 |
5%|β | 542/10682 [04:40<1:23:10, 2.03it/s]
|
585 |
5%|β | 543/10682 [04:40<1:23:03, 2.03it/s]
|
586 |
5%|β | 544/10682 [04:41<1:22:57, 2.04it/s]
|
587 |
5%|β | 545/10682 [04:41<1:22:57, 2.04it/s]
|
588 |
5%|β | 546/10682 [04:42<1:23:04, 2.03it/s]
|
589 |
5%|β | 547/10682 [04:42<1:23:01, 2.03it/s]
|
590 |
5%|β | 548/10682 [04:43<1:23:04, 2.03it/s]
|
591 |
5%|β | 549/10682 [04:43<1:23:11, 2.03it/s]
|
592 |
5%|β | 550/10682 [04:44<1:23:07, 2.03it/s]
|
593 |
{'loss': 4.9823, 'grad_norm': 0.5419530272483826, 'learning_rate': 0.0005144995322731525, 'epoch': 0.72}
|
|
|
594 |
5%|β | 550/10682 [04:44<1:23:07, 2.03it/s]
|
595 |
5%|β | 551/10682 [04:44<1:23:14, 2.03it/s]
|
596 |
5%|β | 552/10682 [04:45<1:23:08, 2.03it/s]
|
597 |
5%|β | 553/10682 [04:45<1:23:05, 2.03it/s]
|
598 |
5%|β | 554/10682 [04:46<1:23:07, 2.03it/s]
|
599 |
5%|β | 555/10682 [04:46<1:23:04, 2.03it/s]
|
600 |
5%|β | 556/10682 [04:46<1:23:02, 2.03it/s]
|
601 |
5%|β | 557/10682 [04:47<1:23:02, 2.03it/s]
|
602 |
5%|β | 558/10682 [04:47<1:22:53, 2.04it/s]
|
603 |
5%|β | 559/10682 [04:48<1:22:56, 2.03it/s]
|
604 |
5%|β | 560/10682 [04:48<1:22:58, 2.03it/s]
|
605 |
5%|β | 561/10682 [04:49<1:22:57, 2.03it/s]
|
606 |
5%|β | 562/10682 [04:49<1:23:01, 2.03it/s]
|
607 |
5%|β | 563/10682 [04:50<1:22:55, 2.03it/s]
|
608 |
5%|β | 564/10682 [04:50<1:22:50, 2.04it/s]
|
609 |
5%|β | 565/10682 [04:51<1:22:55, 2.03it/s]
|
610 |
5%|β | 566/10682 [04:51<1:22:54, 2.03it/s]
|
611 |
5%|β | 567/10682 [04:52<1:22:52, 2.03it/s]
|
612 |
5%|β | 568/10682 [04:52<1:22:55, 2.03it/s]
|
613 |
5%|β | 569/10682 [04:53<1:22:49, 2.03it/s]
|
614 |
5%|β | 570/10682 [04:53<1:22:49, 2.03it/s]
|
615 |
5%|β | 571/10682 [04:54<1:22:56, 2.03it/s]
|
616 |
5%|β | 572/10682 [04:54<1:22:55, 2.03it/s]
|
617 |
5%|β | 573/10682 [04:55<1:22:58, 2.03it/s]
|
618 |
5%|β | 574/10682 [04:55<1:22:51, 2.03it/s]
|
619 |
5%|β | 575/10682 [04:56<1:22:54, 2.03it/s]
|
620 |
{'loss': 4.9327, 'grad_norm': 0.5166158080101013, 'learning_rate': 0.0005378858746492049, 'epoch': 0.75}
|
|
|
621 |
5%|β | 575/10682 [04:56<1:22:54, 2.03it/s]
|
622 |
5%|β | 576/10682 [04:56<1:23:00, 2.03it/s]
|
623 |
5%|β | 577/10682 [04:57<1:22:52, 2.03it/s]
|
624 |
5%|β | 578/10682 [04:57<1:22:54, 2.03it/s]
|
625 |
5%|β | 579/10682 [04:58<1:22:50, 2.03it/s]
|
626 |
5%|β | 580/10682 [04:58<1:22:46, 2.03it/s]
|
627 |
5%|β | 581/10682 [04:59<1:22:49, 2.03it/s]
|
628 |
5%|β | 582/10682 [04:59<1:22:44, 2.03it/s]
|
629 |
5%|β | 583/10682 [05:00<1:22:42, 2.04it/s]
|
630 |
5%|β | 584/10682 [05:00<1:22:46, 2.03it/s]
|
631 |
5%|β | 585/10682 [05:01<1:22:42, 2.03it/s]
|
632 |
5%|β | 586/10682 [05:01<1:22:45, 2.03it/s]
|
633 |
5%|β | 587/10682 [05:02<1:22:45, 2.03it/s]
|
634 |
6%|β | 588/10682 [05:02<1:22:42, 2.03it/s]
|
635 |
6%|β | 589/10682 [05:03<1:22:43, 2.03it/s]
|
636 |
6%|β | 590/10682 [05:03<1:22:42, 2.03it/s]
|
637 |
6%|β | 591/10682 [05:04<1:22:41, 2.03it/s]
|
638 |
6%|β | 592/10682 [05:04<1:22:46, 2.03it/s]
|
639 |
6%|β | 593/10682 [05:05<1:22:41, 2.03it/s]
|
640 |
6%|β | 594/10682 [05:05<1:22:42, 2.03it/s]
|
641 |
6%|β | 595/10682 [05:06<1:22:45, 2.03it/s]
|
642 |
6%|β | 596/10682 [05:06<1:22:45, 2.03it/s]
|
643 |
6%|β | 597/10682 [05:07<1:22:45, 2.03it/s]
|
644 |
6%|β | 598/10682 [05:07<1:22:40, 2.03it/s]
|
645 |
6%|β | 599/10682 [05:08<1:22:39, 2.03it/s]
|
646 |
6%|β | 600/10682 [05:08<1:22:42, 2.03it/s]
|
647 |
{'loss': 4.8904, 'grad_norm': 0.47772836685180664, 'learning_rate': 0.0005612722170252572, 'epoch': 0.79}
|
|
|
648 |
6%|β | 600/10682 [05:08<1:22:42, 2.03it/s]
|
649 |
6%|β | 601/10682 [05:09<1:22:56, 2.03it/s]
|
650 |
6%|β | 602/10682 [05:09<1:22:56, 2.03it/s]
|
651 |
6%|β | 603/10682 [05:10<1:22:49, 2.03it/s]
|
652 |
6%|β | 604/10682 [05:10<1:22:51, 2.03it/s]
|
653 |
6%|β | 605/10682 [05:11<1:22:47, 2.03it/s]
|
654 |
6%|β | 606/10682 [05:11<1:22:42, 2.03it/s]
|
655 |
6%|β | 607/10682 [05:12<1:22:46, 2.03it/s]
|
656 |
6%|β | 608/10682 [05:12<1:22:41, 2.03it/s]
|
657 |
6%|β | 609/10682 [05:13<1:29:38, 1.87it/s]
|
658 |
6%|β | 610/10682 [05:13<1:27:31, 1.92it/s]
|
659 |
6%|β | 611/10682 [05:14<1:25:59, 1.95it/s]
|
660 |
6%|β | 612/10682 [05:14<1:24:58, 1.98it/s]
|
661 |
6%|β | 613/10682 [05:15<1:24:20, 1.99it/s]
|
662 |
6%|β | 614/10682 [05:15<1:23:43, 2.00it/s]
|
663 |
6%|β | 615/10682 [05:16<1:23:26, 2.01it/s]
|
664 |
6%|β | 616/10682 [05:16<1:30:13, 1.86it/s]
|
665 |
6%|β | 617/10682 [05:17<1:27:49, 1.91it/s]
|
666 |
6%|β | 618/10682 [05:17<1:26:16, 1.94it/s]
|
667 |
6%|β | 619/10682 [05:18<1:25:07, 1.97it/s]
|
668 |
6%|β | 620/10682 [05:18<1:24:15, 1.99it/s]
|
669 |
6%|β | 621/10682 [05:19<1:23:44, 2.00it/s]
|
670 |
6%|β | 622/10682 [05:19<1:23:18, 2.01it/s]
|
671 |
6%|β | 623/10682 [05:20<1:23:01, 2.02it/s]
|
672 |
6%|β | 624/10682 [05:20<1:22:56, 2.02it/s]
|
673 |
6%|β | 625/10682 [05:21<1:22:46, 2.03it/s]
|
674 |
{'loss': 4.84, 'grad_norm': 0.46007564663887024, 'learning_rate': 0.0005846585594013096, 'epoch': 0.82}
|
|
|
675 |
6%|β | 625/10682 [05:21<1:22:46, 2.03it/s]
|
676 |
6%|β | 626/10682 [05:21<1:22:47, 2.02it/s]
|
677 |
6%|β | 627/10682 [05:22<1:22:42, 2.03it/s]
|
678 |
6%|β | 628/10682 [05:22<1:22:37, 2.03it/s]
|
679 |
6%|β | 629/10682 [05:23<1:22:35, 2.03it/s]
|
680 |
6%|β | 630/10682 [05:23<1:22:34, 2.03it/s]
|
681 |
6%|β | 631/10682 [05:24<1:22:36, 2.03it/s]
|
682 |
6%|β | 632/10682 [05:24<1:22:34, 2.03it/s]
|
683 |
6%|β | 633/10682 [05:25<1:22:31, 2.03it/s]
|
684 |
6%|β | 634/10682 [05:25<1:22:29, 2.03it/s]
|
685 |
6%|β | 635/10682 [05:26<1:22:27, 2.03it/s]
|
686 |
6%|β | 636/10682 [05:26<1:22:26, 2.03it/s]
|
687 |
6%|β | 637/10682 [05:27<1:22:25, 2.03it/s]
|
688 |
6%|β | 638/10682 [05:27<1:22:25, 2.03it/s]
|
689 |
6%|β | 639/10682 [05:28<1:22:22, 2.03it/s]
|
690 |
6%|β | 640/10682 [05:28<1:22:25, 2.03it/s]
|
691 |
6%|β | 641/10682 [05:29<1:22:22, 2.03it/s]
|
692 |
6%|β | 642/10682 [05:29<1:22:34, 2.03it/s]
|
693 |
6%|β | 643/10682 [05:30<1:22:29, 2.03it/s]
|
694 |
6%|β | 644/10682 [05:30<1:22:26, 2.03it/s]
|
695 |
6%|β | 645/10682 [05:31<1:22:23, 2.03it/s]
|
696 |
6%|β | 646/10682 [05:31<1:22:21, 2.03it/s]
|
697 |
6%|β | 647/10682 [05:32<1:22:23, 2.03it/s]
|
698 |
6%|β | 648/10682 [05:32<1:22:19, 2.03it/s]
|
699 |
6%|β | 649/10682 [05:33<1:22:13, 2.03it/s]
|
700 |
6%|β | 650/10682 [05:33<1:22:13, 2.03it/s]{'loss': 4.8005, 'grad_norm': 0.5678160190582275, 'learning_rate': 0.0006080449017773621, 'epoch': 0.85}
|
|
|
701 |
|
702 |
6%|β | 650/10682 [05:33<1:22:13, 2.03it/s]
|
703 |
6%|β | 651/10682 [05:34<1:22:16, 2.03it/s]
|
704 |
6%|β | 652/10682 [05:34<1:22:10, 2.03it/s]
|
705 |
6%|β | 653/10682 [05:35<1:22:16, 2.03it/s]
|
706 |
6%|β | 654/10682 [05:35<1:22:15, 2.03it/s]
|
707 |
6%|β | 655/10682 [05:35<1:22:15, 2.03it/s]
|
708 |
6%|β | 656/10682 [05:36<1:22:15, 2.03it/s]
|
709 |
6%|β | 657/10682 [05:36<1:22:10, 2.03it/s]
|
710 |
6%|β | 658/10682 [05:37<1:22:12, 2.03it/s]
|
711 |
6%|β | 659/10682 [05:37<1:22:11, 2.03it/s]
|
712 |
6%|β | 660/10682 [05:38<1:22:09, 2.03it/s]
|
713 |
6%|β | 661/10682 [05:38<1:22:13, 2.03it/s]
|
714 |
6%|β | 662/10682 [05:39<1:22:08, 2.03it/s]
|
715 |
6%|β | 663/10682 [05:39<1:22:09, 2.03it/s]
|
716 |
6%|β | 664/10682 [05:40<1:22:09, 2.03it/s]
|
717 |
6%|β | 665/10682 [05:40<1:22:09, 2.03it/s]
|
718 |
6%|β | 666/10682 [05:41<1:22:07, 2.03it/s]
|
719 |
6%|β | 667/10682 [05:41<1:22:09, 2.03it/s]
|
720 |
6%|β | 668/10682 [05:42<1:22:09, 2.03it/s]
|
721 |
6%|β | 669/10682 [05:42<1:22:11, 2.03it/s]
|
722 |
6%|β | 670/10682 [05:43<1:22:10, 2.03it/s]
|
723 |
6%|β | 671/10682 [05:43<1:22:05, 2.03it/s]
|
724 |
6%|β | 672/10682 [05:44<1:22:01, 2.03it/s]
|
725 |
6%|β | 673/10682 [05:44<1:22:07, 2.03it/s]
|
726 |
6%|β | 674/10682 [05:45<1:22:03, 2.03it/s]
|
727 |
6%|β | 675/10682 [05:45<1:22:01, 2.03it/s]
|
728 |
{'loss': 4.7671, 'grad_norm': 0.4880385100841522, 'learning_rate': 0.0006314312441534145, 'epoch': 0.88}
|
|
|
729 |
6%|β | 675/10682 [05:45<1:22:01, 2.03it/s]
|
730 |
6%|β | 676/10682 [05:46<1:22:10, 2.03it/s]
|
731 |
6%|β | 677/10682 [05:46<1:22:05, 2.03it/s]
|
732 |
6%|β | 678/10682 [05:47<1:22:01, 2.03it/s]
|
733 |
6%|β | 679/10682 [05:47<1:22:00, 2.03it/s]
|
734 |
6%|β | 680/10682 [05:48<1:22:00, 2.03it/s]
|
735 |
6%|β | 681/10682 [05:48<1:21:58, 2.03it/s]
|
736 |
6%|β | 682/10682 [05:49<1:22:00, 2.03it/s]
|
737 |
6%|β | 683/10682 [05:49<1:22:00, 2.03it/s]
|
738 |
6%|β | 684/10682 [05:50<1:21:53, 2.03it/s]
|
739 |
6%|β | 685/10682 [05:50<1:21:53, 2.03it/s]
|
740 |
6%|β | 686/10682 [05:51<1:21:58, 2.03it/s]
|
741 |
6%|β | 687/10682 [05:51<1:21:59, 2.03it/s]
|
742 |
6%|β | 688/10682 [05:52<1:21:58, 2.03it/s]
|
743 |
6%|β | 689/10682 [05:52<1:21:58, 2.03it/s]
|
744 |
6%|β | 690/10682 [05:53<1:21:54, 2.03it/s]
|
745 |
6%|β | 691/10682 [05:53<1:21:55, 2.03it/s]
|
746 |
6%|β | 692/10682 [05:54<1:21:55, 2.03it/s]
|
747 |
6%|β | 693/10682 [05:54<1:21:52, 2.03it/s]
|
748 |
6%|β | 694/10682 [05:55<1:21:53, 2.03it/s]
|
749 |
7%|β | 695/10682 [05:55<1:21:52, 2.03it/s]
|
750 |
7%|β | 696/10682 [05:56<1:21:49, 2.03it/s]
|
751 |
7%|β | 697/10682 [05:56<1:21:49, 2.03it/s]
|
752 |
7%|β | 698/10682 [05:57<1:21:50, 2.03it/s]
|
753 |
7%|β | 699/10682 [05:57<1:21:52, 2.03it/s]
|
754 |
7%|β | 700/10682 [05:58<1:21:56, 2.03it/s]{'loss': 4.7325, 'grad_norm': 0.42659100890159607, 'learning_rate': 0.0006548175865294667, 'epoch': 0.92}
|
|
|
755 |
|
756 |
7%|β | 700/10682 [05:58<1:21:56, 2.03it/s]
|
757 |
7%|β | 701/10682 [05:58<1:22:04, 2.03it/s]
|
758 |
7%|β | 702/10682 [05:59<1:22:01, 2.03it/s]
|
759 |
7%|β | 703/10682 [05:59<1:21:58, 2.03it/s]
|
760 |
7%|β | 704/10682 [06:00<1:21:55, 2.03it/s]
|
761 |
7%|β | 705/10682 [06:00<1:21:51, 2.03it/s]
|
762 |
7%|β | 706/10682 [06:01<1:21:51, 2.03it/s]
|
763 |
7%|β | 707/10682 [06:01<1:21:50, 2.03it/s]
|
764 |
7%|β | 708/10682 [06:02<1:21:47, 2.03it/s]
|
765 |
7%|β | 709/10682 [06:02<1:21:49, 2.03it/s]
|
766 |
7%|β | 710/10682 [06:03<1:21:50, 2.03it/s]
|
767 |
7%|β | 711/10682 [06:03<1:21:46, 2.03it/s]
|
768 |
7%|β | 712/10682 [06:04<1:21:52, 2.03it/s]
|
769 |
7%|β | 713/10682 [06:04<1:21:42, 2.03it/s]
|
770 |
7%|β | 714/10682 [06:05<1:21:42, 2.03it/s]
|
771 |
7%|β | 715/10682 [06:05<1:21:42, 2.03it/s]
|
772 |
7%|β | 716/10682 [06:06<1:21:38, 2.03it/s]
|
773 |
7%|β | 717/10682 [06:06<1:21:44, 2.03it/s]
|
774 |
7%|β | 718/10682 [06:06<1:21:44, 2.03it/s]
|
775 |
7%|β | 719/10682 [06:07<1:21:40, 2.03it/s]
|
776 |
7%|β | 720/10682 [06:07<1:21:42, 2.03it/s]
|
777 |
7%|β | 721/10682 [06:08<1:21:39, 2.03it/s]
|
778 |
7%|β | 722/10682 [06:08<1:21:44, 2.03it/s]
|
779 |
7%|β | 723/10682 [06:09<1:21:43, 2.03it/s]
|
780 |
7%|β | 724/10682 [06:09<1:21:41, 2.03it/s]
|
781 |
7%|β | 725/10682 [06:10<1:21:41, 2.03it/s]{'loss': 4.7051, 'grad_norm': 0.42874085903167725, 'learning_rate': 0.0006782039289055192, 'epoch': 0.95}
|
782 |
|
|
|
783 |
7%|β | 725/10682 [06:10<1:21:41, 2.03it/s]
|
784 |
7%|β | 726/10682 [06:10<1:21:43, 2.03it/s]
|
785 |
7%|β | 727/10682 [06:11<1:21:45, 2.03it/s]
|
786 |
7%|β | 728/10682 [06:11<1:21:43, 2.03it/s]
|
787 |
7%|β | 729/10682 [06:12<1:21:43, 2.03it/s]
|
788 |
7%|β | 730/10682 [06:12<1:21:43, 2.03it/s]
|
789 |
7%|β | 731/10682 [06:13<1:21:38, 2.03it/s]
|
790 |
7%|β | 732/10682 [06:13<1:21:37, 2.03it/s]
|
791 |
7%|β | 733/10682 [06:14<1:21:36, 2.03it/s]
|
792 |
7%|β | 734/10682 [06:14<1:21:36, 2.03it/s]
|
793 |
7%|β | 735/10682 [06:15<1:21:38, 2.03it/s]
|
794 |
7%|β | 736/10682 [06:15<1:21:37, 2.03it/s]
|
795 |
7%|β | 737/10682 [06:16<1:21:37, 2.03it/s]
|
796 |
7%|β | 738/10682 [06:16<1:21:36, 2.03it/s]
|
797 |
7%|β | 739/10682 [06:17<1:21:32, 2.03it/s]
|
798 |
7%|β | 740/10682 [06:17<1:21:32, 2.03it/s]
|
799 |
7%|β | 741/10682 [06:18<1:21:29, 2.03it/s]
|
800 |
7%|β | 742/10682 [06:18<1:21:31, 2.03it/s]
|
801 |
7%|β | 743/10682 [06:19<1:21:30, 2.03it/s]
|
802 |
7%|β | 744/10682 [06:19<1:21:32, 2.03it/s]
|
803 |
7%|β | 745/10682 [06:20<1:21:28, 2.03it/s]
|
804 |
7%|β | 746/10682 [06:20<1:21:29, 2.03it/s]
|
805 |
7%|β | 747/10682 [06:21<1:21:29, 2.03it/s]
|
806 |
7%|β | 748/10682 [06:21<1:21:34, 2.03it/s]
|
807 |
7%|β | 749/10682 [06:22<1:21:30, 2.03it/s]
|
808 |
7%|β | 750/10682 [06:22<1:21:26, 2.03it/s]
|
809 |
{'loss': 4.6758, 'grad_norm': 0.3773520588874817, 'learning_rate': 0.0007015902712815716, 'epoch': 0.98}
|
|
|
810 |
7%|β | 750/10682 [06:22<1:21:26, 2.03it/s]
|
811 |
7%|β | 751/10682 [06:23<1:21:31, 2.03it/s]
|
812 |
7%|β | 752/10682 [06:23<1:21:26, 2.03it/s]
|
813 |
7%|β | 753/10682 [06:24<1:21:25, 2.03it/s]
|
814 |
7%|β | 754/10682 [06:24<1:21:27, 2.03it/s]
|
815 |
7%|β | 755/10682 [06:25<1:21:21, 2.03it/s]
|
816 |
7%|β | 756/10682 [06:25<1:21:19, 2.03it/s]
|
817 |
7%|β | 757/10682 [06:26<1:21:21, 2.03it/s]
|
818 |
7%|β | 758/10682 [06:26<1:21:23, 2.03it/s]
|
819 |
7%|β | 759/10682 [06:27<1:21:27, 2.03it/s]
|
820 |
7%|β | 760/10682 [06:27<1:21:26, 2.03it/s]
|
821 |
7%|β | 761/10682 [06:28<1:21:26, 2.03it/s]
|
822 |
7%|β | 762/10682 [06:28<1:21:29, 2.03it/s]
|
823 |
7%|β | 763/10682 [06:29<1:20:48, 2.05it/s]
|
824 |
7%|β | 764/10682 [06:41<10:54:45, 3.96s/it]
|
825 |
7%|β | 765/10682 [06:41<8:02:42, 2.92s/it]
|
826 |
7%|β | 766/10682 [06:42<6:02:29, 2.19s/it]
|
827 |
7%|β | 767/10682 [06:42<4:38:05, 1.68s/it]
|
828 |
7%|β | 768/10682 [06:43<3:39:03, 1.33s/it]
|
829 |
7%|β | 769/10682 [06:43<2:57:38, 1.08s/it]
|
830 |
7%|β | 770/10682 [06:44<2:28:48, 1.11it/s]
|
831 |
7%|β | 771/10682 [06:44<2:09:50, 1.27it/s]
|
832 |
7%|β | 772/10682 [06:45<1:55:25, 1.43it/s]
|
833 |
7%|β | 773/10682 [06:45<1:45:07, 1.57it/s]
|
834 |
7%|β | 774/10682 [06:46<1:38:06, 1.68it/s]
|
835 |
7%|β | 775/10682 [06:46<1:33:05, 1.77it/s]{'loss': 4.6275, 'grad_norm': 0.46725699305534363, 'learning_rate': 0.0007249766136576241, 'epoch': 1.02}
|
836 |
|
|
|
837 |
7%|β | 775/10682 [06:46<1:33:05, 1.77it/s]
|
838 |
7%|β | 776/10682 [06:47<1:29:45, 1.84it/s]
|
839 |
7%|β | 777/10682 [06:47<1:27:14, 1.89it/s]
|
840 |
7%|β | 778/10682 [06:48<1:25:30, 1.93it/s]
|
841 |
7%|β | 779/10682 [06:48<1:24:15, 1.96it/s]
|
842 |
7%|β | 780/10682 [06:49<1:23:16, 1.98it/s]
|
843 |
7%|β | 781/10682 [06:49<1:22:32, 2.00it/s]
|
844 |
7%|β | 782/10682 [06:50<1:22:18, 2.00it/s]
|
845 |
7%|β | 783/10682 [06:50<1:21:57, 2.01it/s]
|
846 |
7%|β | 784/10682 [06:51<1:21:51, 2.02it/s]
|
847 |
7%|β | 785/10682 [06:51<1:21:38, 2.02it/s]
|
|
|
1 |
+
slurm submission log: 2024-05-11 22:52:02.103456
|
2 |
+
created following sbatch script:
|
3 |
+
|
4 |
+
###############################
|
5 |
+
|
6 |
+
#!/bin/bash
|
7 |
+
|
8 |
+
#SBATCH --account=nlp
|
9 |
+
#SBATCH --cpus-per-task=16
|
10 |
+
#SBATCH --dependency=afterok:
|
11 |
+
#SBATCH --gres=gpu:2
|
12 |
+
#SBATCH --job-name=tthrush-job-2343873
|
13 |
+
#SBATCH --mem=400G
|
14 |
+
#SBATCH --nodelist=sphinx2
|
15 |
+
#SBATCH --open-mode=append
|
16 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/train_job_output.txt
|
17 |
+
#SBATCH --partition=sphinx
|
18 |
+
#SBATCH --time=14-0
|
19 |
+
|
20 |
+
# activate your desired anaconda environment
|
21 |
+
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
22 |
+
|
23 |
+
# cd to working directory
|
24 |
+
cd .
|
25 |
+
|
26 |
+
# launch commands
|
27 |
+
srun --unbuffered run_as_child_processes 'torchrun --master_port 29499 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default --output_hub_id pythia-70m_default --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
|
28 |
+
|
29 |
+
###############################
|
30 |
+
|
31 |
+
submission to slurm complete!
|
32 |
+
|
33 |
+
|
34 |
+
###############################
|
35 |
+
slurm submission output
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
sbatch: error: Batch job submission failed: Job dependency problem
|
40 |
+
|
41 |
+
###############################
|
42 |
+
|
43 |
+
slurm submission log: 2024-05-11 22:53:19.792950
|
44 |
+
created following sbatch script:
|
45 |
+
|
46 |
+
###############################
|
47 |
+
|
48 |
+
#!/bin/bash
|
49 |
+
|
50 |
+
#SBATCH --account=nlp
|
51 |
+
#SBATCH --cpus-per-task=16
|
52 |
+
#SBATCH --dependency=afterok:7599821
|
53 |
+
#SBATCH --gres=gpu:2
|
54 |
+
#SBATCH --job-name=tthrush-job-4621093
|
55 |
+
#SBATCH --mem=400G
|
56 |
+
#SBATCH --nodelist=sphinx2
|
57 |
+
#SBATCH --open-mode=append
|
58 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/train_job_output.txt
|
59 |
+
#SBATCH --partition=sphinx
|
60 |
+
#SBATCH --time=14-0
|
61 |
+
|
62 |
+
# activate your desired anaconda environment
|
63 |
+
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
64 |
+
|
65 |
+
# cd to working directory
|
66 |
+
cd .
|
67 |
+
|
68 |
+
# launch commands
|
69 |
+
srun --unbuffered run_as_child_processes 'torchrun --master_port 29499 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default --output_hub_id pythia-70m_default --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
|
70 |
+
|
71 |
+
###############################
|
72 |
+
|
73 |
+
submission to slurm complete!
|
74 |
+
|
75 |
+
|
76 |
+
###############################
|
77 |
+
slurm submission output
|
78 |
+
|
79 |
+
Submitted batch job 7599822
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
###############################
|
84 |
+
|
85 |
+
slurm submission log: 2024-05-11 23:09:47.984388
|
86 |
+
created following sbatch script:
|
87 |
+
|
88 |
+
###############################
|
89 |
+
|
90 |
+
#!/bin/bash
|
91 |
+
|
92 |
+
#SBATCH --account=nlp
|
93 |
+
#SBATCH --cpus-per-task=16
|
94 |
+
#SBATCH --dependency=afterok:7599867
|
95 |
+
#SBATCH --gres=gpu:2
|
96 |
+
#SBATCH --job-name=tthrush-job-4866328
|
97 |
+
#SBATCH --mem=400G
|
98 |
+
#SBATCH --nodelist=sphinx2
|
99 |
+
#SBATCH --open-mode=append
|
100 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/train_job_output.txt
|
101 |
+
#SBATCH --partition=sphinx
|
102 |
+
#SBATCH --time=14-0
|
103 |
+
|
104 |
+
# activate your desired anaconda environment
|
105 |
+
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
106 |
+
|
107 |
+
# cd to working directory
|
108 |
+
cd .
|
109 |
+
|
110 |
+
# launch commands
|
111 |
+
srun --unbuffered run_as_child_processes 'torchrun --master_port 29499 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default --output_hub_id pythia-70m_default --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
|
112 |
+
|
113 |
+
###############################
|
114 |
+
|
115 |
+
submission to slurm complete!
|
116 |
+
|
117 |
+
|
118 |
+
###############################
|
119 |
+
slurm submission output
|
120 |
+
|
121 |
+
Submitted batch job 7599868
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
###############################
|
126 |
+
|
127 |
+
###############################
|
128 |
+
start time: 2024-05-11 23:52:22.564861
|
129 |
+
machine: sphinx2
|
130 |
+
conda env: pretraining-coreset-selection
|
131 |
+
###############################
|
132 |
+
running following processes
|
133 |
+
|
134 |
+
torchrun --master_port 29499 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default --output_hub_id pythia-70m_default --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2
|
135 |
+
|
136 |
+
|
137 |
+
###############################
|
138 |
+
command outputs:
|
139 |
+
|
140 |
+
|
141 |
+
[2024-05-11 23:52:24,535] torch.distributed.run: [WARNING]
|
142 |
+
[2024-05-11 23:52:24,535] torch.distributed.run: [WARNING] *****************************************
|
143 |
+
[2024-05-11 23:52:24,535] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
144 |
+
[2024-05-11 23:52:24,535] torch.distributed.run: [WARNING] *****************************************
|
145 |
+
05/11/2024 23:52:30 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default', output_hub_id='pythia-70m_default', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
|
146 |
+
05/11/2024 23:52:30 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default', output_hub_id='pythia-70m_default', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
|
147 |
+
|
148 |
0%| | 0/10682 [00:00<?, ?it/s][rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
149 |
+
[rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
150 |
+
|
151 |
0%| | 1/10682 [00:05<15:57:49, 5.38s/it]
|
152 |
0%| | 2/10682 [00:07<10:01:48, 3.38s/it]
|
153 |
0%| | 3/10682 [00:08<7:22:52, 2.49s/it]
|
154 |
0%| | 4/10682 [00:10<5:58:45, 2.02s/it]
|
155 |
0%| | 5/10682 [00:11<5:00:13, 1.69s/it]
|
156 |
0%| | 6/10682 [00:12<4:15:17, 1.43s/it]
|
157 |
0%| | 7/10682 [00:13<3:46:05, 1.27s/it]
|
158 |
0%| | 8/10682 [00:13<3:23:48, 1.15s/it]
|
159 |
0%| | 9/10682 [00:14<3:05:00, 1.04s/it]
|
160 |
0%| | 10/10682 [00:15<2:51:07, 1.04it/s]
|
161 |
0%| | 11/10682 [00:16<2:37:51, 1.13it/s]
|
162 |
0%| | 12/10682 [00:16<2:25:52, 1.22it/s]
|
163 |
0%| | 13/10682 [00:17<2:17:06, 1.30it/s]
|
164 |
0%| | 14/10682 [00:18<2:11:06, 1.36it/s]
|
165 |
0%| | 15/10682 [00:18<2:05:30, 1.42it/s]
|
166 |
0%| | 16/10682 [00:19<1:59:37, 1.49it/s]
|
167 |
0%| | 17/10682 [00:20<1:56:34, 1.52it/s]
|
168 |
0%| | 18/10682 [00:20<1:52:32, 1.58it/s]
|
169 |
0%| | 19/10682 [00:21<1:49:53, 1.62it/s]
|
170 |
0%| | 20/10682 [00:21<1:48:21, 1.64it/s]
|
171 |
0%| | 21/10682 [00:22<1:46:12, 1.67it/s]
|
172 |
0%| | 22/10682 [00:22<1:44:20, 1.70it/s]
|
173 |
0%| | 23/10682 [00:23<1:41:52, 1.74it/s]
|
174 |
0%| | 24/10682 [00:24<1:40:45, 1.76it/s]
|
175 |
0%| | 25/10682 [00:24<1:40:14, 1.77it/s]
|
176 |
|
177 |
+
|
178 |
0%| | 25/10682 [00:24<1:40:14, 1.77it/s]
|
179 |
0%| | 26/10682 [00:25<1:39:10, 1.79it/s]
|
180 |
0%| | 27/10682 [00:25<1:37:34, 1.82it/s]
|
181 |
0%| | 28/10682 [00:26<1:36:48, 1.83it/s]
|
182 |
0%| | 29/10682 [00:26<1:36:00, 1.85it/s]
|
183 |
0%| | 30/10682 [00:27<1:35:12, 1.86it/s]
|
184 |
0%| | 31/10682 [00:27<1:34:57, 1.87it/s]
|
185 |
0%| | 32/10682 [00:28<1:34:15, 1.88it/s]
|
186 |
0%| | 33/10682 [00:28<1:34:03, 1.89it/s]
|
187 |
0%| | 34/10682 [00:29<1:33:35, 1.90it/s]
|
188 |
0%| | 35/10682 [00:29<1:32:53, 1.91it/s]
|
189 |
0%| | 36/10682 [00:30<1:32:07, 1.93it/s]
|
190 |
0%| | 37/10682 [00:30<1:31:43, 1.93it/s]
|
191 |
0%| | 38/10682 [00:31<1:31:06, 1.95it/s]
|
192 |
0%| | 39/10682 [00:31<1:30:51, 1.95it/s]
|
193 |
0%| | 40/10682 [00:32<1:30:16, 1.96it/s]
|
194 |
0%| | 41/10682 [00:32<1:30:12, 1.97it/s]
|
195 |
0%| | 42/10682 [00:33<1:31:55, 1.93it/s]
|
196 |
0%| | 43/10682 [00:34<1:31:57, 1.93it/s]
|
197 |
0%| | 44/10682 [00:34<1:32:51, 1.91it/s]
|
198 |
0%| | 45/10682 [00:35<1:34:55, 1.87it/s]
|
199 |
0%| | 46/10682 [00:35<1:34:07, 1.88it/s]
|
200 |
0%| | 47/10682 [00:36<1:34:23, 1.88it/s]
|
201 |
0%| | 48/10682 [00:36<1:34:34, 1.87it/s]
|
202 |
0%| | 49/10682 [00:37<1:34:01, 1.88it/s]
|
203 |
0%| | 50/10682 [00:37<1:33:22, 1.90it/s]
|
204 |
|
205 |
+
|
206 |
0%| | 50/10682 [00:37<1:33:22, 1.90it/s]
|
207 |
0%| | 51/10682 [00:38<1:32:21, 1.92it/s]
|
208 |
0%| | 52/10682 [00:38<1:31:09, 1.94it/s]
|
209 |
0%| | 53/10682 [00:39<1:30:15, 1.96it/s]
|
210 |
1%| | 54/10682 [00:39<1:29:35, 1.98it/s]
|
211 |
1%| | 55/10682 [00:40<1:29:07, 1.99it/s]
|
212 |
1%| | 56/10682 [00:40<1:29:19, 1.98it/s]
|
213 |
1%| | 57/10682 [00:41<1:29:05, 1.99it/s]
|
214 |
1%| | 58/10682 [00:41<1:29:05, 1.99it/s]
|
215 |
1%| | 59/10682 [00:42<1:28:56, 1.99it/s]
|
216 |
1%| | 60/10682 [00:42<1:28:40, 2.00it/s]
|
217 |
1%| | 61/10682 [00:43<1:28:37, 2.00it/s]
|
218 |
1%| | 62/10682 [00:43<1:28:44, 1.99it/s]
|
219 |
1%| | 63/10682 [00:44<1:28:26, 2.00it/s]
|
220 |
1%| | 64/10682 [00:44<1:28:10, 2.01it/s]
|
221 |
1%| | 65/10682 [00:45<1:27:48, 2.02it/s]
|
222 |
1%| | 66/10682 [00:45<1:27:39, 2.02it/s]
|
223 |
1%| | 67/10682 [00:46<1:27:42, 2.02it/s]
|
224 |
1%| | 68/10682 [00:46<1:27:37, 2.02it/s]
|
225 |
1%| | 69/10682 [00:47<1:27:36, 2.02it/s]
|
226 |
1%| | 70/10682 [00:47<1:27:33, 2.02it/s]
|
227 |
1%| | 71/10682 [00:48<1:27:35, 2.02it/s]
|
228 |
1%| | 72/10682 [00:48<1:27:30, 2.02it/s]
|
229 |
1%| | 73/10682 [00:49<1:27:31, 2.02it/s]
|
230 |
1%| | 74/10682 [00:49<1:27:25, 2.02it/s]
|
231 |
1%| | 75/10682 [00:50<1:27:17, 2.03it/s]{'loss': 9.2238, 'grad_norm': 1.1420856714248657, 'learning_rate': 7.015902712815715e-05, 'epoch': 0.1}
|
232 |
+
|
233 |
|
234 |
1%| | 75/10682 [00:50<1:27:17, 2.03it/s]
|
235 |
1%| | 76/10682 [00:50<1:27:25, 2.02it/s]
|
236 |
1%| | 77/10682 [00:51<1:27:27, 2.02it/s]
|
237 |
1%| | 78/10682 [00:51<1:27:22, 2.02it/s]
|
238 |
1%| | 79/10682 [00:52<1:27:13, 2.03it/s]
|
239 |
1%| | 80/10682 [00:52<1:27:13, 2.03it/s]
|
240 |
1%| | 81/10682 [00:53<1:27:19, 2.02it/s]
|
241 |
1%| | 82/10682 [00:53<1:27:19, 2.02it/s]
|
242 |
1%| | 83/10682 [00:54<1:27:39, 2.02it/s]
|
243 |
1%| | 84/10682 [00:54<1:27:38, 2.02it/s]
|
244 |
1%| | 85/10682 [00:55<1:27:39, 2.01it/s]
|
245 |
1%| | 86/10682 [00:55<1:27:50, 2.01it/s]
|
246 |
1%| | 87/10682 [00:56<1:27:38, 2.02it/s]
|
247 |
1%| | 88/10682 [00:56<1:27:22, 2.02it/s]
|
248 |
1%| | 89/10682 [00:57<1:27:23, 2.02it/s]
|
249 |
1%| | 90/10682 [00:57<1:27:17, 2.02it/s]
|
250 |
1%| | 91/10682 [00:58<1:27:10, 2.02it/s]
|
251 |
1%| | 92/10682 [00:58<1:27:09, 2.03it/s]
|
252 |
1%| | 93/10682 [00:59<1:27:03, 2.03it/s]
|
253 |
1%| | 94/10682 [00:59<1:26:59, 2.03it/s]
|
254 |
1%| | 95/10682 [01:00<1:26:56, 2.03it/s]
|
255 |
1%| | 96/10682 [01:00<1:26:53, 2.03it/s]
|
256 |
1%| | 97/10682 [01:01<1:26:58, 2.03it/s]
|
257 |
1%| | 98/10682 [01:01<1:26:53, 2.03it/s]
|
258 |
1%| | 99/10682 [01:02<1:26:56, 2.03it/s]
|
259 |
1%| | 100/10682 [01:02<1:26:55, 2.03it/s]{'loss': 8.428, 'grad_norm': 0.7997293472290039, 'learning_rate': 9.354536950420954e-05, 'epoch': 0.13}
|
260 |
+
|
261 |
|
262 |
1%| | 100/10682 [01:02<1:26:55, 2.03it/s]
|
263 |
1%| | 101/10682 [01:03<1:27:08, 2.02it/s]
|
264 |
1%| | 102/10682 [01:03<1:27:00, 2.03it/s]
|
265 |
1%| | 103/10682 [01:04<1:26:59, 2.03it/s]
|
266 |
1%| | 104/10682 [01:04<1:26:57, 2.03it/s]
|
267 |
1%| | 105/10682 [01:04<1:26:51, 2.03it/s]
|
268 |
1%| | 106/10682 [01:05<1:26:49, 2.03it/s]
|
269 |
1%| | 107/10682 [01:05<1:26:50, 2.03it/s]
|
270 |
1%| | 108/10682 [01:06<1:26:51, 2.03it/s]
|
271 |
1%| | 109/10682 [01:06<1:26:47, 2.03it/s]
|
272 |
1%| | 110/10682 [01:07<1:26:40, 2.03it/s]
|
273 |
1%| | 111/10682 [01:07<1:26:43, 2.03it/s]
|
274 |
1%| | 112/10682 [01:08<1:26:42, 2.03it/s]
|
275 |
1%| | 113/10682 [01:08<1:26:45, 2.03it/s]
|
276 |
1%| | 114/10682 [01:09<1:26:41, 2.03it/s]
|
277 |
1%| | 115/10682 [01:09<1:26:39, 2.03it/s]
|
278 |
1%| | 116/10682 [01:10<1:26:40, 2.03it/s]
|
279 |
1%| | 117/10682 [01:10<1:26:41, 2.03it/s]
|
280 |
1%| | 118/10682 [01:11<1:26:46, 2.03it/s]
|
281 |
1%| | 119/10682 [01:11<1:26:41, 2.03it/s]
|
282 |
1%| | 120/10682 [01:12<1:26:34, 2.03it/s]
|
283 |
1%| | 121/10682 [01:12<1:26:37, 2.03it/s]
|
284 |
1%| | 122/10682 [01:13<1:26:44, 2.03it/s]
|
285 |
1%| | 123/10682 [01:13<1:26:45, 2.03it/s]
|
286 |
1%| | 124/10682 [01:14<1:26:44, 2.03it/s]
|
287 |
1%| | 125/10682 [01:14<1:26:43, 2.03it/s]
|
288 |
|
289 |
+
|
290 |
1%| | 125/10682 [01:14<1:26:43, 2.03it/s]
|
291 |
1%| | 126/10682 [01:15<1:26:50, 2.03it/s]
|
292 |
1%| | 127/10682 [01:15<1:26:45, 2.03it/s]
|
293 |
1%| | 128/10682 [01:16<1:26:35, 2.03it/s]
|
294 |
1%| | 129/10682 [01:16<1:26:35, 2.03it/s]
|
295 |
1%| | 130/10682 [01:17<1:26:35, 2.03it/s]
|
296 |
1%| | 131/10682 [01:17<1:26:31, 2.03it/s]
|
297 |
1%| | 132/10682 [01:18<1:26:32, 2.03it/s]
|
298 |
1%| | 133/10682 [01:18<1:26:33, 2.03it/s]
|
299 |
1%|β | 134/10682 [01:19<1:26:27, 2.03it/s]
|
300 |
1%|β | 135/10682 [01:19<1:26:29, 2.03it/s]
|
301 |
1%|β | 136/10682 [01:20<1:26:31, 2.03it/s]
|
302 |
1%|β | 137/10682 [01:20<1:26:31, 2.03it/s]
|
303 |
1%|β | 138/10682 [01:21<1:26:30, 2.03it/s]
|
304 |
1%|β | 139/10682 [01:21<1:26:30, 2.03it/s]
|
305 |
1%|β | 140/10682 [01:22<1:26:29, 2.03it/s]
|
306 |
1%|β | 141/10682 [01:22<1:26:35, 2.03it/s]
|
307 |
1%|β | 142/10682 [01:23<1:26:28, 2.03it/s]
|
308 |
1%|β | 143/10682 [01:23<1:26:27, 2.03it/s]
|
309 |
1%|β | 144/10682 [01:24<1:26:27, 2.03it/s]
|
310 |
1%|β | 145/10682 [01:24<1:26:23, 2.03it/s]
|
311 |
1%|β | 146/10682 [01:25<1:26:32, 2.03it/s]
|
312 |
1%|β | 147/10682 [01:25<1:26:25, 2.03it/s]
|
313 |
1%|β | 148/10682 [01:26<1:26:20, 2.03it/s]
|
314 |
1%|β | 149/10682 [01:26<1:26:23, 2.03it/s]
|
315 |
1%|β | 150/10682 [01:27<1:26:20, 2.03it/s]
|
316 |
{'loss': 7.289, 'grad_norm': 0.367520809173584, 'learning_rate': 0.0001403180542563143, 'epoch': 0.2}
|
317 |
+
|
318 |
1%|β | 150/10682 [01:27<1:26:20, 2.03it/s]
|
319 |
1%|β | 151/10682 [01:27<1:26:26, 2.03it/s]
|
320 |
1%|β | 152/10682 [01:28<1:26:25, 2.03it/s]
|
321 |
1%|β | 153/10682 [01:28<1:26:23, 2.03it/s]
|
322 |
1%|β | 154/10682 [01:29<1:26:26, 2.03it/s]
|
323 |
1%|β | 155/10682 [01:29<1:26:23, 2.03it/s]
|
324 |
1%|β | 156/10682 [01:30<1:26:20, 2.03it/s]
|
325 |
1%|β | 157/10682 [01:30<1:26:22, 2.03it/s]
|
326 |
1%|β | 158/10682 [01:31<1:26:23, 2.03it/s]
|
327 |
1%|β | 159/10682 [01:31<1:26:25, 2.03it/s]
|
328 |
1%|β | 160/10682 [01:32<1:26:15, 2.03it/s]
|
329 |
2%|β | 161/10682 [01:32<1:26:14, 2.03it/s]
|
330 |
2%|β | 162/10682 [01:33<1:26:18, 2.03it/s]
|
331 |
2%|β | 163/10682 [01:33<1:26:12, 2.03it/s]
|
332 |
2%|β | 164/10682 [01:34<1:26:10, 2.03it/s]
|
333 |
2%|β | 165/10682 [01:34<1:26:10, 2.03it/s]
|
334 |
2%|β | 166/10682 [01:35<1:26:08, 2.03it/s]
|
335 |
2%|β | 167/10682 [01:35<1:26:13, 2.03it/s]
|
336 |
2%|β | 168/10682 [01:36<1:26:12, 2.03it/s]
|
337 |
2%|β | 169/10682 [01:36<1:26:15, 2.03it/s]
|
338 |
2%|β | 170/10682 [01:36<1:26:14, 2.03it/s]
|
339 |
2%|β | 171/10682 [01:37<1:26:08, 2.03it/s]
|
340 |
2%|β | 172/10682 [01:37<1:26:18, 2.03it/s]
|
341 |
2%|β | 173/10682 [01:38<1:26:12, 2.03it/s]
|
342 |
2%|β | 174/10682 [01:38<1:26:13, 2.03it/s]
|
343 |
2%|β | 175/10682 [01:39<1:26:11, 2.03it/s]{'loss': 6.8807, 'grad_norm': 0.33734890818595886, 'learning_rate': 0.00016370439663236668, 'epoch': 0.23}
|
344 |
|
345 |
+
|
346 |
2%|β | 175/10682 [01:39<1:26:11, 2.03it/s]
|
347 |
2%|β | 176/10682 [01:39<1:26:12, 2.03it/s]
|
348 |
2%|β | 177/10682 [01:40<1:26:12, 2.03it/s]
|
349 |
2%|β | 178/10682 [01:40<1:26:09, 2.03it/s]
|
350 |
2%|β | 179/10682 [01:41<1:26:12, 2.03it/s]
|
351 |
2%|β | 180/10682 [01:41<1:26:13, 2.03it/s]
|
352 |
2%|β | 181/10682 [01:42<1:26:08, 2.03it/s]
|
353 |
2%|β | 182/10682 [01:42<1:26:14, 2.03it/s]
|
354 |
2%|β | 183/10682 [01:43<1:26:10, 2.03it/s]
|
355 |
2%|β | 184/10682 [01:43<1:26:09, 2.03it/s]
|
356 |
2%|β | 185/10682 [01:44<1:26:08, 2.03it/s]
|
357 |
2%|β | 186/10682 [01:44<1:26:02, 2.03it/s]
|
358 |
2%|β | 187/10682 [01:45<1:26:04, 2.03it/s]
|
359 |
2%|β | 188/10682 [01:45<1:26:03, 2.03it/s]
|
360 |
2%|β | 189/10682 [01:46<1:25:58, 2.03it/s]
|
361 |
2%|β | 190/10682 [01:46<1:25:58, 2.03it/s]
|
362 |
2%|β | 191/10682 [01:47<1:25:58, 2.03it/s]
|
363 |
2%|β | 192/10682 [01:47<1:26:01, 2.03it/s]
|
364 |
2%|β | 193/10682 [01:48<1:26:01, 2.03it/s]
|
365 |
2%|β | 194/10682 [01:48<1:26:01, 2.03it/s]
|
366 |
2%|β | 195/10682 [01:49<1:25:58, 2.03it/s]
|
367 |
2%|β | 196/10682 [01:49<1:26:02, 2.03it/s]
|
368 |
2%|β | 197/10682 [01:50<1:26:00, 2.03it/s]
|
369 |
2%|β | 198/10682 [01:50<1:26:04, 2.03it/s]
|
370 |
2%|β | 199/10682 [01:51<1:26:00, 2.03it/s]
|
371 |
2%|β | 200/10682 [01:51<1:25:57, 2.03it/s]{'loss': 6.5556, 'grad_norm': 0.46203550696372986, 'learning_rate': 0.00018709073900841907, 'epoch': 0.26}
|
372 |
+
|
373 |
|
374 |
2%|β | 200/10682 [01:51<1:25:57, 2.03it/s]
|
375 |
2%|β | 201/10682 [01:52<1:26:04, 2.03it/s]
|
376 |
2%|β | 202/10682 [01:52<1:26:00, 2.03it/s]
|
377 |
2%|β | 203/10682 [01:53<1:26:03, 2.03it/s]
|
378 |
2%|β | 204/10682 [01:53<1:26:00, 2.03it/s]
|
379 |
2%|β | 205/10682 [01:54<1:25:56, 2.03it/s]
|
380 |
2%|β | 206/10682 [01:54<1:25:57, 2.03it/s]
|
381 |
2%|β | 207/10682 [01:55<1:25:52, 2.03it/s]
|
382 |
2%|β | 208/10682 [01:55<1:25:51, 2.03it/s]
|
383 |
2%|β | 209/10682 [01:56<1:25:54, 2.03it/s]
|
384 |
2%|β | 210/10682 [01:56<1:25:49, 2.03it/s]
|
385 |
2%|β | 211/10682 [01:57<1:25:45, 2.04it/s]
|
386 |
2%|β | 212/10682 [01:57<1:25:49, 2.03it/s]
|
387 |
2%|β | 213/10682 [01:58<1:25:44, 2.04it/s]
|
388 |
2%|β | 214/10682 [01:58<1:25:40, 2.04it/s]
|
389 |
2%|β | 215/10682 [01:59<1:25:44, 2.03it/s]
|
390 |
2%|β | 216/10682 [01:59<1:25:41, 2.04it/s]
|
391 |
2%|β | 217/10682 [02:00<1:25:42, 2.03it/s]
|
392 |
2%|β | 218/10682 [02:00<1:25:45, 2.03it/s]
|
393 |
2%|β | 219/10682 [02:01<1:25:42, 2.03it/s]
|
394 |
2%|β | 220/10682 [02:01<1:25:40, 2.04it/s]
|
395 |
2%|β | 221/10682 [02:02<1:25:44, 2.03it/s]
|
396 |
2%|β | 222/10682 [02:02<1:25:42, 2.03it/s]
|
397 |
2%|β | 223/10682 [02:03<1:25:39, 2.04it/s]
|
398 |
2%|β | 224/10682 [02:03<1:25:41, 2.03it/s]
|
399 |
2%|β | 225/10682 [02:04<1:25:38, 2.04it/s]{'loss': 6.2908, 'grad_norm': 0.7612385153770447, 'learning_rate': 0.00021047708138447147, 'epoch': 0.29}
|
400 |
+
|
401 |
|
402 |
2%|β | 225/10682 [02:04<1:25:38, 2.04it/s]
|
403 |
2%|β | 226/10682 [02:04<1:25:45, 2.03it/s]
|
404 |
2%|β | 227/10682 [02:05<1:25:46, 2.03it/s]
|
405 |
2%|β | 228/10682 [02:05<1:25:42, 2.03it/s]
|
406 |
2%|β | 229/10682 [02:06<1:25:41, 2.03it/s]
|
407 |
2%|β | 230/10682 [02:06<1:25:43, 2.03it/s]
|
408 |
2%|β | 231/10682 [02:07<1:25:37, 2.03it/s]
|
409 |
2%|β | 232/10682 [02:07<1:25:41, 2.03it/s]
|
410 |
2%|β | 233/10682 [02:07<1:25:39, 2.03it/s]
|
411 |
2%|β | 234/10682 [02:08<1:25:33, 2.04it/s]
|
412 |
2%|β | 235/10682 [02:08<1:25:38, 2.03it/s]
|
413 |
2%|β | 236/10682 [02:09<1:25:37, 2.03it/s]
|
414 |
2%|β | 237/10682 [02:09<1:25:35, 2.03it/s]
|
415 |
2%|β | 238/10682 [02:10<1:25:40, 2.03it/s]
|
416 |
2%|β | 239/10682 [02:10<1:25:36, 2.03it/s]
|
417 |
2%|β | 240/10682 [02:11<1:25:38, 2.03it/s]
|
418 |
2%|β | 241/10682 [02:11<1:25:39, 2.03it/s]
|
419 |
2%|β | 242/10682 [02:12<1:25:38, 2.03it/s]
|
420 |
2%|β | 243/10682 [02:12<1:25:43, 2.03it/s]
|
421 |
2%|β | 244/10682 [02:13<1:25:35, 2.03it/s]
|
422 |
2%|β | 245/10682 [02:13<1:25:33, 2.03it/s]
|
423 |
2%|β | 246/10682 [02:14<1:25:38, 2.03it/s]
|
424 |
2%|β | 247/10682 [02:14<1:25:35, 2.03it/s]
|
425 |
2%|β | 248/10682 [02:15<1:25:37, 2.03it/s]
|
426 |
2%|β | 249/10682 [02:15<1:25:34, 2.03it/s]
|
427 |
2%|β | 250/10682 [02:16<1:25:30, 2.03it/s]{'loss': 6.0883, 'grad_norm': 0.3854532241821289, 'learning_rate': 0.00023386342376052386, 'epoch': 0.33}
|
428 |
|
429 |
+
|
430 |
2%|β | 250/10682 [02:16<1:25:30, 2.03it/s]
|
431 |
2%|β | 251/10682 [02:16<1:25:42, 2.03it/s]
|
432 |
2%|β | 252/10682 [02:17<1:25:35, 2.03it/s]
|
433 |
2%|β | 253/10682 [02:17<1:25:34, 2.03it/s]
|
434 |
2%|β | 254/10682 [02:18<1:25:33, 2.03it/s]
|
435 |
2%|β | 255/10682 [02:18<1:25:34, 2.03it/s]
|
436 |
2%|β | 256/10682 [02:19<1:25:33, 2.03it/s]
|
437 |
2%|β | 257/10682 [02:19<1:25:31, 2.03it/s]
|
438 |
2%|β | 258/10682 [02:20<1:25:29, 2.03it/s]
|
439 |
2%|β | 259/10682 [02:20<1:25:25, 2.03it/s]
|
440 |
2%|β | 260/10682 [02:21<1:25:28, 2.03it/s]
|
441 |
2%|β | 261/10682 [02:21<1:25:25, 2.03it/s]
|
442 |
2%|β | 262/10682 [02:22<1:25:23, 2.03it/s]
|
443 |
2%|β | 263/10682 [02:22<1:25:27, 2.03it/s]
|
444 |
2%|β | 264/10682 [02:23<1:25:22, 2.03it/s]
|
445 |
2%|β | 265/10682 [02:23<1:25:19, 2.03it/s]
|
446 |
2%|β | 266/10682 [02:24<1:25:23, 2.03it/s]
|
447 |
2%|β | 267/10682 [02:24<1:25:23, 2.03it/s]
|
448 |
3%|β | 268/10682 [02:25<1:25:23, 2.03it/s]
|
449 |
3%|β | 269/10682 [02:25<1:25:27, 2.03it/s]
|
450 |
3%|β | 270/10682 [02:26<1:25:27, 2.03it/s]
|
451 |
3%|β | 271/10682 [02:26<1:25:24, 2.03it/s]
|
452 |
3%|β | 272/10682 [02:27<1:25:22, 2.03it/s]
|
453 |
3%|β | 273/10682 [02:27<1:25:20, 2.03it/s]
|
454 |
3%|β | 274/10682 [02:28<1:25:21, 2.03it/s]
|
455 |
3%|β | 275/10682 [02:28<1:25:21, 2.03it/s]
|
456 |
{'loss': 5.9181, 'grad_norm': 0.7595835328102112, 'learning_rate': 0.00025724976613657625, 'epoch': 0.36}
|
457 |
+
|
458 |
3%|β | 275/10682 [02:28<1:25:21, 2.03it/s]
|
459 |
3%|β | 276/10682 [02:29<1:25:26, 2.03it/s]
|
460 |
3%|β | 277/10682 [02:29<1:25:26, 2.03it/s]
|
461 |
3%|β | 278/10682 [02:30<1:25:24, 2.03it/s]
|
462 |
3%|β | 279/10682 [02:30<1:25:23, 2.03it/s]
|
463 |
3%|β | 280/10682 [02:31<1:25:23, 2.03it/s]
|
464 |
3%|β | 281/10682 [02:31<1:25:21, 2.03it/s]
|
465 |
3%|β | 282/10682 [02:32<1:25:25, 2.03it/s]
|
466 |
3%|β | 283/10682 [02:32<1:25:22, 2.03it/s]
|
467 |
3%|β | 284/10682 [02:33<1:25:20, 2.03it/s]
|
468 |
3%|β | 285/10682 [02:33<1:25:21, 2.03it/s]
|
469 |
3%|β | 286/10682 [02:34<1:25:24, 2.03it/s]
|
470 |
3%|β | 287/10682 [02:34<1:25:27, 2.03it/s]
|
471 |
3%|β | 288/10682 [02:35<1:25:22, 2.03it/s]
|
472 |
3%|β | 289/10682 [02:35<1:25:16, 2.03it/s]
|
473 |
3%|β | 290/10682 [02:36<1:25:18, 2.03it/s]
|
474 |
3%|β | 291/10682 [02:36<1:25:15, 2.03it/s]
|
475 |
3%|β | 292/10682 [02:37<1:25:08, 2.03it/s]
|
476 |
3%|β | 293/10682 [02:37<1:25:12, 2.03it/s]
|
477 |
3%|β | 294/10682 [02:38<1:25:09, 2.03it/s]
|
478 |
3%|β | 295/10682 [02:38<1:25:07, 2.03it/s]
|
479 |
3%|β | 296/10682 [02:39<1:25:10, 2.03it/s]
|
480 |
3%|β | 297/10682 [02:39<1:25:02, 2.04it/s]
|
481 |
3%|β | 298/10682 [02:39<1:25:05, 2.03it/s]
|
482 |
3%|β | 299/10682 [02:40<1:25:08, 2.03it/s]
|
483 |
3%|β | 300/10682 [02:40<1:25:04, 2.03it/s]{'loss': 5.7819, 'grad_norm': 0.6112937927246094, 'learning_rate': 0.0002806361085126286, 'epoch': 0.39}
|
484 |
+
|
485 |
|
486 |
3%|β | 300/10682 [02:40<1:25:04, 2.03it/s]
|
487 |
3%|β | 301/10682 [02:41<1:25:24, 2.03it/s]
|
488 |
3%|β | 302/10682 [02:41<1:25:14, 2.03it/s]
|
489 |
3%|β | 303/10682 [02:42<1:25:13, 2.03it/s]
|
490 |
3%|β | 304/10682 [02:42<1:25:08, 2.03it/s]
|
491 |
3%|β | 305/10682 [02:43<1:25:08, 2.03it/s]
|
492 |
3%|β | 306/10682 [02:43<1:25:08, 2.03it/s]
|
493 |
3%|β | 307/10682 [02:44<1:25:02, 2.03it/s]
|
494 |
3%|β | 308/10682 [02:44<1:25:02, 2.03it/s]
|
495 |
3%|β | 309/10682 [02:45<1:25:04, 2.03it/s]
|
496 |
3%|β | 310/10682 [02:45<1:25:05, 2.03it/s]
|
497 |
3%|β | 311/10682 [02:46<1:25:07, 2.03it/s]
|
498 |
3%|β | 312/10682 [02:46<1:25:04, 2.03it/s]
|
499 |
3%|β | 313/10682 [02:47<1:25:04, 2.03it/s]
|
500 |
3%|β | 314/10682 [02:47<1:25:01, 2.03it/s]
|
501 |
3%|β | 315/10682 [02:48<1:24:58, 2.03it/s]
|
502 |
3%|β | 316/10682 [02:48<1:24:57, 2.03it/s]
|
503 |
3%|β | 317/10682 [02:49<1:24:59, 2.03it/s]
|
504 |
3%|β | 318/10682 [02:49<1:24:58, 2.03it/s]
|
505 |
3%|β | 319/10682 [02:50<1:25:00, 2.03it/s]
|
506 |
3%|β | 320/10682 [02:50<1:24:55, 2.03it/s]
|
507 |
3%|β | 321/10682 [02:51<1:25:00, 2.03it/s]
|
508 |
3%|β | 322/10682 [02:51<1:25:05, 2.03it/s]
|
509 |
3%|β | 323/10682 [02:52<1:24:59, 2.03it/s]
|
510 |
3%|β | 324/10682 [02:52<1:24:58, 2.03it/s]
|
511 |
3%|β | 325/10682 [02:53<1:24:55, 2.03it/s]
|
512 |
{'loss': 5.6576, 'grad_norm': 1.0010818243026733, 'learning_rate': 0.00030402245088868103, 'epoch': 0.43}
|
513 |
+
|
514 |
3%|β | 325/10682 [02:53<1:24:55, 2.03it/s]
|
515 |
3%|β | 326/10682 [02:53<1:25:01, 2.03it/s]
|
516 |
3%|β | 327/10682 [02:54<1:25:04, 2.03it/s]
|
517 |
3%|β | 328/10682 [02:54<1:25:01, 2.03it/s]
|
518 |
3%|β | 329/10682 [02:55<1:25:04, 2.03it/s]
|
519 |
3%|β | 330/10682 [02:55<1:25:00, 2.03it/s]
|
520 |
3%|β | 331/10682 [02:56<1:24:56, 2.03it/s]
|
521 |
3%|β | 332/10682 [02:56<1:24:56, 2.03it/s]
|
522 |
3%|β | 333/10682 [02:57<1:24:52, 2.03it/s]
|
523 |
3%|β | 334/10682 [02:57<1:24:51, 2.03it/s]
|
524 |
3%|β | 335/10682 [02:58<1:24:51, 2.03it/s]
|
525 |
3%|β | 336/10682 [02:58<1:24:48, 2.03it/s]
|
526 |
3%|β | 337/10682 [02:59<1:24:50, 2.03it/s]
|
527 |
3%|β | 338/10682 [02:59<1:24:49, 2.03it/s]
|
528 |
3%|β | 339/10682 [03:00<1:24:46, 2.03it/s]
|
529 |
3%|β | 340/10682 [03:00<1:24:53, 2.03it/s]
|
530 |
3%|β | 341/10682 [03:01<1:24:48, 2.03it/s]
|
531 |
3%|β | 342/10682 [03:01<1:24:45, 2.03it/s]
|
532 |
3%|β | 343/10682 [03:02<1:24:49, 2.03it/s]
|
533 |
3%|β | 344/10682 [03:02<1:24:46, 2.03it/s]
|
534 |
3%|β | 345/10682 [03:03<1:24:46, 2.03it/s]
|
535 |
3%|β | 346/10682 [03:03<1:24:44, 2.03it/s]
|
536 |
3%|β | 347/10682 [03:04<1:24:40, 2.03it/s]
|
537 |
3%|β | 348/10682 [03:04<1:24:45, 2.03it/s]
|
538 |
3%|β | 349/10682 [03:05<1:24:42, 2.03it/s]
|
539 |
3%|β | 350/10682 [03:05<1:24:40, 2.03it/s]{'loss': 5.5561, 'grad_norm': 0.5823507308959961, 'learning_rate': 0.00032740879326473337, 'epoch': 0.46}
|
540 |
|
541 |
+
|
542 |
3%|β | 350/10682 [03:05<1:24:40, 2.03it/s]
|
543 |
3%|β | 351/10682 [03:06<1:24:50, 2.03it/s]
|
544 |
3%|β | 352/10682 [03:06<1:24:41, 2.03it/s]
|
545 |
3%|β | 353/10682 [03:07<1:24:44, 2.03it/s]
|
546 |
3%|β | 354/10682 [03:07<1:24:42, 2.03it/s]
|
547 |
3%|β | 355/10682 [03:08<1:24:36, 2.03it/s]
|
548 |
3%|β | 356/10682 [03:08<1:24:42, 2.03it/s]
|
549 |
3%|β | 357/10682 [03:09<1:24:39, 2.03it/s]
|
550 |
3%|β | 358/10682 [03:09<1:24:36, 2.03it/s]
|
551 |
3%|β | 359/10682 [03:10<1:24:39, 2.03it/s]
|
552 |
3%|β | 360/10682 [03:10<1:24:39, 2.03it/s]
|
553 |
3%|β | 361/10682 [03:10<1:24:40, 2.03it/s]
|
554 |
3%|β | 362/10682 [03:11<1:24:39, 2.03it/s]
|
555 |
3%|β | 363/10682 [03:11<1:24:35, 2.03it/s]
|
556 |
3%|β | 364/10682 [03:12<1:24:36, 2.03it/s]
|
557 |
3%|β | 365/10682 [03:12<1:24:38, 2.03it/s]
|
558 |
3%|β | 366/10682 [03:13<1:24:36, 2.03it/s]
|
559 |
3%|β | 367/10682 [03:13<1:24:37, 2.03it/s]
|
560 |
3%|β | 368/10682 [03:14<1:24:38, 2.03it/s]
|
561 |
3%|β | 369/10682 [03:14<1:24:37, 2.03it/s]
|
562 |
3%|β | 370/10682 [03:15<1:24:39, 2.03it/s]
|
563 |
3%|β | 371/10682 [03:15<1:24:34, 2.03it/s]
|
564 |
3%|β | 372/10682 [03:16<1:24:37, 2.03it/s]
|
565 |
3%|β | 373/10682 [03:16<1:24:37, 2.03it/s]
|
566 |
4%|β | 374/10682 [03:17<1:24:36, 2.03it/s]
|
567 |
4%|β | 375/10682 [03:17<1:24:36, 2.03it/s]
|
568 |
|
569 |
+
|
570 |
4%|β | 375/10682 [03:17<1:24:36, 2.03it/s]
|
571 |
4%|β | 376/10682 [03:18<1:25:00, 2.02it/s]
|
572 |
4%|β | 377/10682 [03:18<1:24:53, 2.02it/s]
|
573 |
4%|β | 378/10682 [03:19<1:24:43, 2.03it/s]
|
574 |
4%|β | 379/10682 [03:19<1:24:39, 2.03it/s]
|
575 |
4%|β | 380/10682 [03:20<1:24:37, 2.03it/s]
|
576 |
4%|β | 381/10682 [03:20<1:24:30, 2.03it/s]
|
577 |
4%|β | 382/10682 [03:21<1:24:32, 2.03it/s]
|
578 |
4%|β | 383/10682 [03:21<1:24:28, 2.03it/s]
|
579 |
4%|β | 384/10682 [03:22<1:24:27, 2.03it/s]
|
580 |
4%|β | 385/10682 [03:22<1:24:28, 2.03it/s]
|
581 |
4%|β | 386/10682 [03:23<1:24:22, 2.03it/s]
|
582 |
4%|β | 387/10682 [03:23<1:24:24, 2.03it/s]
|
583 |
4%|β | 388/10682 [03:24<1:24:26, 2.03it/s]
|
584 |
4%|β | 389/10682 [03:24<1:24:25, 2.03it/s]
|
585 |
4%|β | 390/10682 [03:25<1:24:25, 2.03it/s]
|
586 |
4%|β | 391/10682 [03:25<1:24:25, 2.03it/s]
|
587 |
4%|β | 392/10682 [03:26<1:24:20, 2.03it/s]
|
588 |
4%|β | 393/10682 [03:26<1:24:23, 2.03it/s]
|
589 |
4%|β | 394/10682 [03:27<1:24:21, 2.03it/s]
|
590 |
4%|β | 395/10682 [03:27<1:24:21, 2.03it/s]
|
591 |
4%|β | 396/10682 [03:28<1:24:23, 2.03it/s]
|
592 |
4%|β | 397/10682 [03:28<1:24:28, 2.03it/s]
|
593 |
4%|β | 398/10682 [03:29<1:24:28, 2.03it/s]
|
594 |
4%|β | 399/10682 [03:29<1:24:28, 2.03it/s]
|
595 |
4%|β | 400/10682 [03:30<1:24:31, 2.03it/s]{'loss': 5.375, 'grad_norm': 0.6424997448921204, 'learning_rate': 0.00037418147801683815, 'epoch': 0.52}
|
596 |
+
|
597 |
|
598 |
4%|β | 400/10682 [03:30<1:24:31, 2.03it/s]
|
599 |
4%|β | 401/10682 [03:30<1:24:46, 2.02it/s]
|
600 |
4%|β | 402/10682 [03:31<1:24:39, 2.02it/s]
|
601 |
4%|β | 403/10682 [03:31<1:24:35, 2.03it/s]
|
602 |
4%|β | 404/10682 [03:32<1:24:31, 2.03it/s]
|
603 |
4%|β | 405/10682 [03:32<1:24:23, 2.03it/s]
|
604 |
4%|β | 406/10682 [03:33<1:24:22, 2.03it/s]
|
605 |
4%|β | 407/10682 [03:33<1:24:23, 2.03it/s]
|
606 |
4%|β | 408/10682 [03:34<1:24:18, 2.03it/s]
|
607 |
4%|β | 409/10682 [03:34<1:24:17, 2.03it/s]
|
608 |
4%|β | 410/10682 [03:35<1:24:17, 2.03it/s]
|
609 |
4%|β | 411/10682 [03:35<1:24:12, 2.03it/s]
|
610 |
4%|β | 412/10682 [03:36<1:24:08, 2.03it/s]
|
611 |
4%|β | 413/10682 [03:36<1:24:10, 2.03it/s]
|
612 |
4%|β | 414/10682 [03:37<1:24:06, 2.03it/s]
|
613 |
4%|β | 415/10682 [03:37<1:24:09, 2.03it/s]
|
614 |
4%|β | 416/10682 [03:38<1:24:10, 2.03it/s]
|
615 |
4%|β | 417/10682 [03:38<1:24:06, 2.03it/s]
|
616 |
4%|β | 418/10682 [03:39<1:24:06, 2.03it/s]
|
617 |
4%|β | 419/10682 [03:39<1:24:08, 2.03it/s]
|
618 |
4%|β | 420/10682 [03:40<1:24:08, 2.03it/s]
|
619 |
4%|β | 421/10682 [03:40<1:24:09, 2.03it/s]
|
620 |
4%|β | 422/10682 [03:41<1:24:08, 2.03it/s]
|
621 |
4%|β | 423/10682 [03:41<1:24:06, 2.03it/s]
|
622 |
4%|β | 424/10682 [03:42<1:24:06, 2.03it/s]
|
623 |
4%|β | 425/10682 [03:42<1:24:03, 2.03it/s]{'loss': 5.2944, 'grad_norm': 0.4700624942779541, 'learning_rate': 0.0003975678203928906, 'epoch': 0.56}
|
624 |
+
|
625 |
|
626 |
4%|β | 425/10682 [03:42<1:24:03, 2.03it/s]
|
627 |
4%|β | 426/10682 [03:43<1:24:12, 2.03it/s]
|
628 |
4%|β | 427/10682 [03:43<1:24:08, 2.03it/s]
|
629 |
4%|β | 428/10682 [03:43<1:24:06, 2.03it/s]
|
630 |
4%|β | 429/10682 [03:44<1:24:09, 2.03it/s]
|
631 |
4%|β | 430/10682 [03:44<1:24:07, 2.03it/s]
|
632 |
4%|β | 431/10682 [03:45<1:24:08, 2.03it/s]
|
633 |
4%|β | 432/10682 [03:45<1:24:04, 2.03it/s]
|
634 |
4%|β | 433/10682 [03:46<1:23:59, 2.03it/s]
|
635 |
4%|β | 434/10682 [03:46<1:24:01, 2.03it/s]
|
636 |
4%|β | 435/10682 [03:47<1:23:59, 2.03it/s]
|
637 |
4%|β | 436/10682 [03:47<1:24:02, 2.03it/s]
|
638 |
4%|β | 437/10682 [03:48<1:24:03, 2.03it/s]
|
639 |
4%|β | 438/10682 [03:48<1:23:58, 2.03it/s]
|
640 |
4%|β | 439/10682 [03:49<1:24:02, 2.03it/s]
|
641 |
4%|β | 440/10682 [03:49<1:23:57, 2.03it/s]
|
642 |
4%|β | 441/10682 [03:50<1:23:53, 2.03it/s]
|
643 |
4%|β | 442/10682 [03:50<1:23:57, 2.03it/s]
|
644 |
4%|β | 443/10682 [03:51<1:23:55, 2.03it/s]
|
645 |
4%|β | 444/10682 [03:51<1:23:51, 2.03it/s]
|
646 |
4%|β | 445/10682 [03:52<1:23:58, 2.03it/s]
|
647 |
4%|β | 446/10682 [03:52<1:23:52, 2.03it/s]
|
648 |
4%|β | 447/10682 [03:53<1:23:56, 2.03it/s]
|
649 |
4%|β | 448/10682 [03:53<1:23:55, 2.03it/s]
|
650 |
4%|β | 449/10682 [03:54<1:23:52, 2.03it/s]
|
651 |
4%|β | 450/10682 [03:54<1:23:56, 2.03it/s]
|
652 |
{'loss': 5.2223, 'grad_norm': 0.4889560043811798, 'learning_rate': 0.00042095416276894293, 'epoch': 0.59}
|
653 |
+
|
654 |
4%|β | 450/10682 [03:54<1:23:56, 2.03it/s]
|
655 |
4%|β | 451/10682 [03:55<1:23:56, 2.03it/s]
|
656 |
4%|β | 452/10682 [03:55<1:23:55, 2.03it/s]
|
657 |
4%|β | 453/10682 [03:56<1:23:53, 2.03it/s]
|
658 |
4%|β | 454/10682 [03:56<1:23:48, 2.03it/s]
|
659 |
4%|β | 455/10682 [03:57<1:23:58, 2.03it/s]
|
660 |
4%|β | 456/10682 [03:57<1:23:51, 2.03it/s]
|
661 |
4%|β | 457/10682 [03:58<1:23:51, 2.03it/s]
|
662 |
4%|β | 458/10682 [03:58<1:23:50, 2.03it/s]
|
663 |
4%|β | 459/10682 [03:59<1:23:47, 2.03it/s]
|
664 |
4%|β | 460/10682 [03:59<1:23:45, 2.03it/s]
|
665 |
4%|β | 461/10682 [04:00<1:23:48, 2.03it/s]
|
666 |
4%|β | 462/10682 [04:00<1:23:44, 2.03it/s]
|
667 |
4%|β | 463/10682 [04:01<1:23:48, 2.03it/s]
|
668 |
4%|β | 464/10682 [04:01<1:23:47, 2.03it/s]
|
669 |
4%|β | 465/10682 [04:02<1:23:48, 2.03it/s]
|
670 |
4%|β | 466/10682 [04:02<1:23:50, 2.03it/s]
|
671 |
4%|β | 467/10682 [04:03<1:23:53, 2.03it/s]
|
672 |
4%|β | 468/10682 [04:03<1:23:52, 2.03it/s]
|
673 |
4%|β | 469/10682 [04:04<1:23:51, 2.03it/s]
|
674 |
4%|β | 470/10682 [04:04<1:23:49, 2.03it/s]
|
675 |
4%|β | 471/10682 [04:05<1:23:48, 2.03it/s]
|
676 |
4%|β | 472/10682 [04:05<1:23:45, 2.03it/s]
|
677 |
4%|β | 473/10682 [04:06<1:23:50, 2.03it/s]
|
678 |
4%|β | 474/10682 [04:06<1:23:45, 2.03it/s]
|
679 |
4%|β | 475/10682 [04:07<1:23:40, 2.03it/s]{'loss': 5.1492, 'grad_norm': 0.5106998682022095, 'learning_rate': 0.0004443405051449954, 'epoch': 0.62}
|
680 |
+
|
681 |
|
682 |
4%|β | 475/10682 [04:07<1:23:40, 2.03it/s]
|
683 |
4%|β | 476/10682 [04:07<1:23:50, 2.03it/s]
|
684 |
4%|β | 477/10682 [04:08<1:23:45, 2.03it/s]
|
685 |
4%|β | 478/10682 [04:08<1:23:44, 2.03it/s]
|
686 |
4%|β | 479/10682 [04:09<1:23:43, 2.03it/s]
|
687 |
4%|β | 480/10682 [04:09<1:23:37, 2.03it/s]
|
688 |
5%|β | 481/10682 [04:10<1:23:37, 2.03it/s]
|
689 |
5%|β | 482/10682 [04:10<1:23:40, 2.03it/s]
|
690 |
5%|β | 483/10682 [04:11<1:23:35, 2.03it/s]
|
691 |
5%|β | 484/10682 [04:11<1:23:36, 2.03it/s]
|
692 |
5%|β | 485/10682 [04:12<1:23:39, 2.03it/s]
|
693 |
5%|β | 486/10682 [04:12<1:23:37, 2.03it/s]
|
694 |
5%|β | 487/10682 [04:13<1:23:40, 2.03it/s]
|
695 |
5%|β | 488/10682 [04:13<1:23:36, 2.03it/s]
|
696 |
5%|β | 489/10682 [04:14<1:23:36, 2.03it/s]
|
697 |
5%|β | 490/10682 [04:14<1:23:43, 2.03it/s]
|
698 |
5%|β | 491/10682 [04:14<1:23:37, 2.03it/s]
|
699 |
5%|β | 492/10682 [04:15<1:23:40, 2.03it/s]
|
700 |
5%|β | 493/10682 [04:15<1:23:37, 2.03it/s]
|
701 |
5%|β | 494/10682 [04:16<1:23:39, 2.03it/s]
|
702 |
5%|β | 495/10682 [04:16<1:23:40, 2.03it/s]
|
703 |
5%|β | 496/10682 [04:17<1:23:36, 2.03it/s]
|
704 |
5%|β | 497/10682 [04:17<1:23:34, 2.03it/s]
|
705 |
5%|β | 498/10682 [04:18<1:23:35, 2.03it/s]
|
706 |
5%|β | 499/10682 [04:18<1:23:34, 2.03it/s]
|
707 |
5%|β | 500/10682 [04:19<1:23:36, 2.03it/s]
|
708 |
{'loss': 5.0961, 'grad_norm': 0.5852717161178589, 'learning_rate': 0.0004677268475210477, 'epoch': 0.66}
|
709 |
+
|
710 |
5%|β | 500/10682 [04:19<1:23:36, 2.03it/s]
|
711 |
5%|β | 501/10682 [04:19<1:23:38, 2.03it/s]
|
712 |
5%|β | 502/10682 [04:20<1:23:35, 2.03it/s]
|
713 |
5%|β | 503/10682 [04:20<1:23:33, 2.03it/s]
|
714 |
5%|β | 504/10682 [04:21<1:23:27, 2.03it/s]
|
715 |
5%|β | 505/10682 [04:21<1:23:25, 2.03it/s]
|
716 |
5%|β | 506/10682 [04:22<1:23:30, 2.03it/s]
|
717 |
5%|β | 507/10682 [04:22<1:23:32, 2.03it/s]
|
718 |
5%|β | 508/10682 [04:23<1:23:34, 2.03it/s]
|
719 |
5%|β | 509/10682 [04:23<1:23:31, 2.03it/s]
|
720 |
5%|β | 510/10682 [04:24<1:23:28, 2.03it/s]
|
721 |
5%|β | 511/10682 [04:24<1:23:30, 2.03it/s]
|
722 |
5%|β | 512/10682 [04:25<1:23:30, 2.03it/s]
|
723 |
5%|β | 513/10682 [04:25<1:23:31, 2.03it/s]
|
724 |
5%|β | 514/10682 [04:26<1:23:28, 2.03it/s]
|
725 |
5%|β | 515/10682 [04:26<1:23:25, 2.03it/s]
|
726 |
5%|β | 516/10682 [04:27<1:23:24, 2.03it/s]
|
727 |
5%|β | 517/10682 [04:27<1:23:23, 2.03it/s]
|
728 |
5%|β | 518/10682 [04:28<1:23:19, 2.03it/s]
|
729 |
5%|β | 519/10682 [04:28<1:23:23, 2.03it/s]
|
730 |
5%|β | 520/10682 [04:29<1:23:21, 2.03it/s]
|
731 |
5%|β | 521/10682 [04:29<1:23:23, 2.03it/s]
|
732 |
5%|β | 522/10682 [04:30<1:23:26, 2.03it/s]
|
733 |
5%|β | 523/10682 [04:30<1:23:26, 2.03it/s]
|
734 |
5%|β | 524/10682 [04:31<1:23:26, 2.03it/s]
|
735 |
5%|β | 525/10682 [04:31<1:23:17, 2.03it/s]{'loss': 5.0379, 'grad_norm': 0.4721851348876953, 'learning_rate': 0.0004911131898971, 'epoch': 0.69}
|
736 |
+
|
737 |
|
738 |
5%|β | 525/10682 [04:31<1:23:17, 2.03it/s]
|
739 |
5%|β | 526/10682 [04:32<1:23:25, 2.03it/s]
|
740 |
5%|β | 527/10682 [04:32<1:23:22, 2.03it/s]
|
741 |
5%|β | 528/10682 [04:33<1:23:13, 2.03it/s]
|
742 |
5%|β | 529/10682 [04:33<1:23:15, 2.03it/s]
|
743 |
5%|β | 530/10682 [04:34<1:23:11, 2.03it/s]
|
744 |
5%|β | 531/10682 [04:34<1:23:05, 2.04it/s]
|
745 |
5%|β | 532/10682 [04:35<1:23:09, 2.03it/s]
|
746 |
5%|β | 533/10682 [04:35<1:23:10, 2.03it/s]
|
747 |
5%|β | 534/10682 [04:36<1:23:07, 2.03it/s]
|
748 |
5%|β | 535/10682 [04:36<1:23:09, 2.03it/s]
|
749 |
5%|β | 536/10682 [04:37<1:23:13, 2.03it/s]
|
750 |
5%|β | 537/10682 [04:37<1:23:13, 2.03it/s]
|
751 |
5%|β | 538/10682 [04:38<1:23:09, 2.03it/s]
|
752 |
5%|β | 539/10682 [04:38<1:23:12, 2.03it/s]
|
753 |
5%|β | 540/10682 [04:39<1:23:08, 2.03it/s]
|
754 |
5%|β | 541/10682 [04:39<1:23:06, 2.03it/s]
|
755 |
5%|β | 542/10682 [04:40<1:23:10, 2.03it/s]
|
756 |
5%|β | 543/10682 [04:40<1:23:03, 2.03it/s]
|
757 |
5%|β | 544/10682 [04:41<1:22:57, 2.04it/s]
|
758 |
5%|β | 545/10682 [04:41<1:22:57, 2.04it/s]
|
759 |
5%|β | 546/10682 [04:42<1:23:04, 2.03it/s]
|
760 |
5%|β | 547/10682 [04:42<1:23:01, 2.03it/s]
|
761 |
5%|β | 548/10682 [04:43<1:23:04, 2.03it/s]
|
762 |
5%|β | 549/10682 [04:43<1:23:11, 2.03it/s]
|
763 |
5%|β | 550/10682 [04:44<1:23:07, 2.03it/s]
|
764 |
{'loss': 4.9823, 'grad_norm': 0.5419530272483826, 'learning_rate': 0.0005144995322731525, 'epoch': 0.72}
|
765 |
+
|
766 |
5%|β | 550/10682 [04:44<1:23:07, 2.03it/s]
|
767 |
5%|β | 551/10682 [04:44<1:23:14, 2.03it/s]
|
768 |
5%|β | 552/10682 [04:45<1:23:08, 2.03it/s]
|
769 |
5%|β | 553/10682 [04:45<1:23:05, 2.03it/s]
|
770 |
5%|β | 554/10682 [04:46<1:23:07, 2.03it/s]
|
771 |
5%|β | 555/10682 [04:46<1:23:04, 2.03it/s]
|
772 |
5%|β | 556/10682 [04:46<1:23:02, 2.03it/s]
|
773 |
5%|β | 557/10682 [04:47<1:23:02, 2.03it/s]
|
774 |
5%|β | 558/10682 [04:47<1:22:53, 2.04it/s]
|
775 |
5%|β | 559/10682 [04:48<1:22:56, 2.03it/s]
|
776 |
5%|β | 560/10682 [04:48<1:22:58, 2.03it/s]
|
777 |
5%|β | 561/10682 [04:49<1:22:57, 2.03it/s]
|
778 |
5%|β | 562/10682 [04:49<1:23:01, 2.03it/s]
|
779 |
5%|β | 563/10682 [04:50<1:22:55, 2.03it/s]
|
780 |
5%|β | 564/10682 [04:50<1:22:50, 2.04it/s]
|
781 |
5%|β | 565/10682 [04:51<1:22:55, 2.03it/s]
|
782 |
5%|β | 566/10682 [04:51<1:22:54, 2.03it/s]
|
783 |
5%|β | 567/10682 [04:52<1:22:52, 2.03it/s]
|
784 |
5%|β | 568/10682 [04:52<1:22:55, 2.03it/s]
|
785 |
5%|β | 569/10682 [04:53<1:22:49, 2.03it/s]
|
786 |
5%|β | 570/10682 [04:53<1:22:49, 2.03it/s]
|
787 |
5%|β | 571/10682 [04:54<1:22:56, 2.03it/s]
|
788 |
5%|β | 572/10682 [04:54<1:22:55, 2.03it/s]
|
789 |
5%|β | 573/10682 [04:55<1:22:58, 2.03it/s]
|
790 |
5%|β | 574/10682 [04:55<1:22:51, 2.03it/s]
|
791 |
5%|β | 575/10682 [04:56<1:22:54, 2.03it/s]
|
792 |
{'loss': 4.9327, 'grad_norm': 0.5166158080101013, 'learning_rate': 0.0005378858746492049, 'epoch': 0.75}
|
793 |
+
|
794 |
5%|β | 575/10682 [04:56<1:22:54, 2.03it/s]
|
795 |
5%|β | 576/10682 [04:56<1:23:00, 2.03it/s]
|
796 |
5%|β | 577/10682 [04:57<1:22:52, 2.03it/s]
|
797 |
5%|β | 578/10682 [04:57<1:22:54, 2.03it/s]
|
798 |
5%|β | 579/10682 [04:58<1:22:50, 2.03it/s]
|
799 |
5%|β | 580/10682 [04:58<1:22:46, 2.03it/s]
|
800 |
5%|β | 581/10682 [04:59<1:22:49, 2.03it/s]
|
801 |
5%|β | 582/10682 [04:59<1:22:44, 2.03it/s]
|
802 |
5%|β | 583/10682 [05:00<1:22:42, 2.04it/s]
|
803 |
5%|β | 584/10682 [05:00<1:22:46, 2.03it/s]
|
804 |
5%|β | 585/10682 [05:01<1:22:42, 2.03it/s]
|
805 |
5%|β | 586/10682 [05:01<1:22:45, 2.03it/s]
|
806 |
5%|β | 587/10682 [05:02<1:22:45, 2.03it/s]
|
807 |
6%|β | 588/10682 [05:02<1:22:42, 2.03it/s]
|
808 |
6%|β | 589/10682 [05:03<1:22:43, 2.03it/s]
|
809 |
6%|β | 590/10682 [05:03<1:22:42, 2.03it/s]
|
810 |
6%|β | 591/10682 [05:04<1:22:41, 2.03it/s]
|
811 |
6%|β | 592/10682 [05:04<1:22:46, 2.03it/s]
|
812 |
6%|β | 593/10682 [05:05<1:22:41, 2.03it/s]
|
813 |
6%|β | 594/10682 [05:05<1:22:42, 2.03it/s]
|
814 |
6%|β | 595/10682 [05:06<1:22:45, 2.03it/s]
|
815 |
6%|β | 596/10682 [05:06<1:22:45, 2.03it/s]
|
816 |
6%|β | 597/10682 [05:07<1:22:45, 2.03it/s]
|
817 |
6%|β | 598/10682 [05:07<1:22:40, 2.03it/s]
|
818 |
6%|β | 599/10682 [05:08<1:22:39, 2.03it/s]
|
819 |
6%|β | 600/10682 [05:08<1:22:42, 2.03it/s]
|
820 |
{'loss': 4.8904, 'grad_norm': 0.47772836685180664, 'learning_rate': 0.0005612722170252572, 'epoch': 0.79}
|
821 |
+
|
822 |
6%|β | 600/10682 [05:08<1:22:42, 2.03it/s]
|
823 |
6%|β | 601/10682 [05:09<1:22:56, 2.03it/s]
|
824 |
6%|β | 602/10682 [05:09<1:22:56, 2.03it/s]
|
825 |
6%|β | 603/10682 [05:10<1:22:49, 2.03it/s]
|
826 |
6%|β | 604/10682 [05:10<1:22:51, 2.03it/s]
|
827 |
6%|β | 605/10682 [05:11<1:22:47, 2.03it/s]
|
828 |
6%|β | 606/10682 [05:11<1:22:42, 2.03it/s]
|
829 |
6%|β | 607/10682 [05:12<1:22:46, 2.03it/s]
|
830 |
6%|β | 608/10682 [05:12<1:22:41, 2.03it/s]
|
831 |
6%|β | 609/10682 [05:13<1:29:38, 1.87it/s]
|
832 |
6%|β | 610/10682 [05:13<1:27:31, 1.92it/s]
|
833 |
6%|β | 611/10682 [05:14<1:25:59, 1.95it/s]
|
834 |
6%|β | 612/10682 [05:14<1:24:58, 1.98it/s]
|
835 |
6%|β | 613/10682 [05:15<1:24:20, 1.99it/s]
|
836 |
6%|β | 614/10682 [05:15<1:23:43, 2.00it/s]
|
837 |
6%|β | 615/10682 [05:16<1:23:26, 2.01it/s]
|
838 |
6%|β | 616/10682 [05:16<1:30:13, 1.86it/s]
|
839 |
6%|β | 617/10682 [05:17<1:27:49, 1.91it/s]
|
840 |
6%|β | 618/10682 [05:17<1:26:16, 1.94it/s]
|
841 |
6%|β | 619/10682 [05:18<1:25:07, 1.97it/s]
|
842 |
6%|β | 620/10682 [05:18<1:24:15, 1.99it/s]
|
843 |
6%|β | 621/10682 [05:19<1:23:44, 2.00it/s]
|
844 |
6%|β | 622/10682 [05:19<1:23:18, 2.01it/s]
|
845 |
6%|β | 623/10682 [05:20<1:23:01, 2.02it/s]
|
846 |
6%|β | 624/10682 [05:20<1:22:56, 2.02it/s]
|
847 |
6%|β | 625/10682 [05:21<1:22:46, 2.03it/s]
|
848 |
{'loss': 4.84, 'grad_norm': 0.46007564663887024, 'learning_rate': 0.0005846585594013096, 'epoch': 0.82}
|
849 |
+
|
850 |
6%|β | 625/10682 [05:21<1:22:46, 2.03it/s]
|
851 |
6%|β | 626/10682 [05:21<1:22:47, 2.02it/s]
|
852 |
6%|β | 627/10682 [05:22<1:22:42, 2.03it/s]
|
853 |
6%|β | 628/10682 [05:22<1:22:37, 2.03it/s]
|
854 |
6%|β | 629/10682 [05:23<1:22:35, 2.03it/s]
|
855 |
6%|β | 630/10682 [05:23<1:22:34, 2.03it/s]
|
856 |
6%|β | 631/10682 [05:24<1:22:36, 2.03it/s]
|
857 |
6%|β | 632/10682 [05:24<1:22:34, 2.03it/s]
|
858 |
6%|β | 633/10682 [05:25<1:22:31, 2.03it/s]
|
859 |
6%|β | 634/10682 [05:25<1:22:29, 2.03it/s]
|
860 |
6%|β | 635/10682 [05:26<1:22:27, 2.03it/s]
|
861 |
6%|β | 636/10682 [05:26<1:22:26, 2.03it/s]
|
862 |
6%|β | 637/10682 [05:27<1:22:25, 2.03it/s]
|
863 |
6%|β | 638/10682 [05:27<1:22:25, 2.03it/s]
|
864 |
6%|β | 639/10682 [05:28<1:22:22, 2.03it/s]
|
865 |
6%|β | 640/10682 [05:28<1:22:25, 2.03it/s]
|
866 |
6%|β | 641/10682 [05:29<1:22:22, 2.03it/s]
|
867 |
6%|β | 642/10682 [05:29<1:22:34, 2.03it/s]
|
868 |
6%|β | 643/10682 [05:30<1:22:29, 2.03it/s]
|
869 |
6%|β | 644/10682 [05:30<1:22:26, 2.03it/s]
|
870 |
6%|β | 645/10682 [05:31<1:22:23, 2.03it/s]
|
871 |
6%|β | 646/10682 [05:31<1:22:21, 2.03it/s]
|
872 |
6%|β | 647/10682 [05:32<1:22:23, 2.03it/s]
|
873 |
6%|β | 648/10682 [05:32<1:22:19, 2.03it/s]
|
874 |
6%|β | 649/10682 [05:33<1:22:13, 2.03it/s]
|
875 |
6%|β | 650/10682 [05:33<1:22:13, 2.03it/s]{'loss': 4.8005, 'grad_norm': 0.5678160190582275, 'learning_rate': 0.0006080449017773621, 'epoch': 0.85}
|
876 |
+
|
877 |
|
878 |
6%|β | 650/10682 [05:33<1:22:13, 2.03it/s]
|
879 |
6%|β | 651/10682 [05:34<1:22:16, 2.03it/s]
|
880 |
6%|β | 652/10682 [05:34<1:22:10, 2.03it/s]
|
881 |
6%|β | 653/10682 [05:35<1:22:16, 2.03it/s]
|
882 |
6%|β | 654/10682 [05:35<1:22:15, 2.03it/s]
|
883 |
6%|β | 655/10682 [05:35<1:22:15, 2.03it/s]
|
884 |
6%|β | 656/10682 [05:36<1:22:15, 2.03it/s]
|
885 |
6%|β | 657/10682 [05:36<1:22:10, 2.03it/s]
|
886 |
6%|β | 658/10682 [05:37<1:22:12, 2.03it/s]
|
887 |
6%|β | 659/10682 [05:37<1:22:11, 2.03it/s]
|
888 |
6%|β | 660/10682 [05:38<1:22:09, 2.03it/s]
|
889 |
6%|β | 661/10682 [05:38<1:22:13, 2.03it/s]
|
890 |
6%|β | 662/10682 [05:39<1:22:08, 2.03it/s]
|
891 |
6%|β | 663/10682 [05:39<1:22:09, 2.03it/s]
|
892 |
6%|β | 664/10682 [05:40<1:22:09, 2.03it/s]
|
893 |
6%|β | 665/10682 [05:40<1:22:09, 2.03it/s]
|
894 |
6%|β | 666/10682 [05:41<1:22:07, 2.03it/s]
|
895 |
6%|β | 667/10682 [05:41<1:22:09, 2.03it/s]
|
896 |
6%|β | 668/10682 [05:42<1:22:09, 2.03it/s]
|
897 |
6%|β | 669/10682 [05:42<1:22:11, 2.03it/s]
|
898 |
6%|β | 670/10682 [05:43<1:22:10, 2.03it/s]
|
899 |
6%|β | 671/10682 [05:43<1:22:05, 2.03it/s]
|
900 |
6%|β | 672/10682 [05:44<1:22:01, 2.03it/s]
|
901 |
6%|β | 673/10682 [05:44<1:22:07, 2.03it/s]
|
902 |
6%|β | 674/10682 [05:45<1:22:03, 2.03it/s]
|
903 |
6%|β | 675/10682 [05:45<1:22:01, 2.03it/s]
|
904 |
{'loss': 4.7671, 'grad_norm': 0.4880385100841522, 'learning_rate': 0.0006314312441534145, 'epoch': 0.88}
|
905 |
+
|
906 |
6%|β | 675/10682 [05:45<1:22:01, 2.03it/s]
|
907 |
6%|β | 676/10682 [05:46<1:22:10, 2.03it/s]
|
908 |
6%|β | 677/10682 [05:46<1:22:05, 2.03it/s]
|
909 |
6%|β | 678/10682 [05:47<1:22:01, 2.03it/s]
|
910 |
6%|β | 679/10682 [05:47<1:22:00, 2.03it/s]
|
911 |
6%|β | 680/10682 [05:48<1:22:00, 2.03it/s]
|
912 |
6%|β | 681/10682 [05:48<1:21:58, 2.03it/s]
|
913 |
6%|β | 682/10682 [05:49<1:22:00, 2.03it/s]
|
914 |
6%|β | 683/10682 [05:49<1:22:00, 2.03it/s]
|
915 |
6%|β | 684/10682 [05:50<1:21:53, 2.03it/s]
|
916 |
6%|β | 685/10682 [05:50<1:21:53, 2.03it/s]
|
917 |
6%|β | 686/10682 [05:51<1:21:58, 2.03it/s]
|
918 |
6%|β | 687/10682 [05:51<1:21:59, 2.03it/s]
|
919 |
6%|β | 688/10682 [05:52<1:21:58, 2.03it/s]
|
920 |
6%|β | 689/10682 [05:52<1:21:58, 2.03it/s]
|
921 |
6%|β | 690/10682 [05:53<1:21:54, 2.03it/s]
|
922 |
6%|β | 691/10682 [05:53<1:21:55, 2.03it/s]
|
923 |
6%|β | 692/10682 [05:54<1:21:55, 2.03it/s]
|
924 |
6%|β | 693/10682 [05:54<1:21:52, 2.03it/s]
|
925 |
6%|β | 694/10682 [05:55<1:21:53, 2.03it/s]
|
926 |
7%|β | 695/10682 [05:55<1:21:52, 2.03it/s]
|
927 |
7%|β | 696/10682 [05:56<1:21:49, 2.03it/s]
|
928 |
7%|β | 697/10682 [05:56<1:21:49, 2.03it/s]
|
929 |
7%|β | 698/10682 [05:57<1:21:50, 2.03it/s]
|
930 |
7%|β | 699/10682 [05:57<1:21:52, 2.03it/s]
|
931 |
7%|β | 700/10682 [05:58<1:21:56, 2.03it/s]{'loss': 4.7325, 'grad_norm': 0.42659100890159607, 'learning_rate': 0.0006548175865294667, 'epoch': 0.92}
|
932 |
+
|
933 |
|
934 |
7%|β | 700/10682 [05:58<1:21:56, 2.03it/s]
|
935 |
7%|β | 701/10682 [05:58<1:22:04, 2.03it/s]
|
936 |
7%|β | 702/10682 [05:59<1:22:01, 2.03it/s]
|
937 |
7%|β | 703/10682 [05:59<1:21:58, 2.03it/s]
|
938 |
7%|β | 704/10682 [06:00<1:21:55, 2.03it/s]
|
939 |
7%|β | 705/10682 [06:00<1:21:51, 2.03it/s]
|
940 |
7%|β | 706/10682 [06:01<1:21:51, 2.03it/s]
|
941 |
7%|β | 707/10682 [06:01<1:21:50, 2.03it/s]
|
942 |
7%|β | 708/10682 [06:02<1:21:47, 2.03it/s]
|
943 |
7%|β | 709/10682 [06:02<1:21:49, 2.03it/s]
|
944 |
7%|β | 710/10682 [06:03<1:21:50, 2.03it/s]
|
945 |
7%|β | 711/10682 [06:03<1:21:46, 2.03it/s]
|
946 |
7%|β | 712/10682 [06:04<1:21:52, 2.03it/s]
|
947 |
7%|β | 713/10682 [06:04<1:21:42, 2.03it/s]
|
948 |
7%|β | 714/10682 [06:05<1:21:42, 2.03it/s]
|
949 |
7%|β | 715/10682 [06:05<1:21:42, 2.03it/s]
|
950 |
7%|β | 716/10682 [06:06<1:21:38, 2.03it/s]
|
951 |
7%|β | 717/10682 [06:06<1:21:44, 2.03it/s]
|
952 |
7%|β | 718/10682 [06:06<1:21:44, 2.03it/s]
|
953 |
7%|β | 719/10682 [06:07<1:21:40, 2.03it/s]
|
954 |
7%|β | 720/10682 [06:07<1:21:42, 2.03it/s]
|
955 |
7%|β | 721/10682 [06:08<1:21:39, 2.03it/s]
|
956 |
7%|β | 722/10682 [06:08<1:21:44, 2.03it/s]
|
957 |
7%|β | 723/10682 [06:09<1:21:43, 2.03it/s]
|
958 |
7%|β | 724/10682 [06:09<1:21:41, 2.03it/s]
|
959 |
7%|β | 725/10682 [06:10<1:21:41, 2.03it/s]{'loss': 4.7051, 'grad_norm': 0.42874085903167725, 'learning_rate': 0.0006782039289055192, 'epoch': 0.95}
|
960 |
|
961 |
+
|
962 |
7%|β | 725/10682 [06:10<1:21:41, 2.03it/s]
|
963 |
7%|β | 726/10682 [06:10<1:21:43, 2.03it/s]
|
964 |
7%|β | 727/10682 [06:11<1:21:45, 2.03it/s]
|
965 |
7%|β | 728/10682 [06:11<1:21:43, 2.03it/s]
|
966 |
7%|β | 729/10682 [06:12<1:21:43, 2.03it/s]
|
967 |
7%|β | 730/10682 [06:12<1:21:43, 2.03it/s]
|
968 |
7%|β | 731/10682 [06:13<1:21:38, 2.03it/s]
|
969 |
7%|β | 732/10682 [06:13<1:21:37, 2.03it/s]
|
970 |
7%|β | 733/10682 [06:14<1:21:36, 2.03it/s]
|
971 |
7%|β | 734/10682 [06:14<1:21:36, 2.03it/s]
|
972 |
7%|β | 735/10682 [06:15<1:21:38, 2.03it/s]
|
973 |
7%|β | 736/10682 [06:15<1:21:37, 2.03it/s]
|
974 |
7%|β | 737/10682 [06:16<1:21:37, 2.03it/s]
|
975 |
7%|β | 738/10682 [06:16<1:21:36, 2.03it/s]
|
976 |
7%|β | 739/10682 [06:17<1:21:32, 2.03it/s]
|
977 |
7%|β | 740/10682 [06:17<1:21:32, 2.03it/s]
|
978 |
7%|β | 741/10682 [06:18<1:21:29, 2.03it/s]
|
979 |
7%|β | 742/10682 [06:18<1:21:31, 2.03it/s]
|
980 |
7%|β | 743/10682 [06:19<1:21:30, 2.03it/s]
|
981 |
7%|β | 744/10682 [06:19<1:21:32, 2.03it/s]
|
982 |
7%|β | 745/10682 [06:20<1:21:28, 2.03it/s]
|
983 |
7%|β | 746/10682 [06:20<1:21:29, 2.03it/s]
|
984 |
7%|β | 747/10682 [06:21<1:21:29, 2.03it/s]
|
985 |
7%|β | 748/10682 [06:21<1:21:34, 2.03it/s]
|
986 |
7%|β | 749/10682 [06:22<1:21:30, 2.03it/s]
|
987 |
7%|β | 750/10682 [06:22<1:21:26, 2.03it/s]
|
988 |
{'loss': 4.6758, 'grad_norm': 0.3773520588874817, 'learning_rate': 0.0007015902712815716, 'epoch': 0.98}
|
989 |
+
|
990 |
7%|β | 750/10682 [06:22<1:21:26, 2.03it/s]
|
991 |
7%|β | 751/10682 [06:23<1:21:31, 2.03it/s]
|
992 |
7%|β | 752/10682 [06:23<1:21:26, 2.03it/s]
|
993 |
7%|β | 753/10682 [06:24<1:21:25, 2.03it/s]
|
994 |
7%|β | 754/10682 [06:24<1:21:27, 2.03it/s]
|
995 |
7%|β | 755/10682 [06:25<1:21:21, 2.03it/s]
|
996 |
7%|β | 756/10682 [06:25<1:21:19, 2.03it/s]
|
997 |
7%|β | 757/10682 [06:26<1:21:21, 2.03it/s]
|
998 |
7%|β | 758/10682 [06:26<1:21:23, 2.03it/s]
|
999 |
7%|β | 759/10682 [06:27<1:21:27, 2.03it/s]
|
1000 |
7%|β | 760/10682 [06:27<1:21:26, 2.03it/s]
|
1001 |
7%|β | 761/10682 [06:28<1:21:26, 2.03it/s]
|
1002 |
7%|β | 762/10682 [06:28<1:21:29, 2.03it/s]
|
1003 |
7%|β | 763/10682 [06:29<1:20:48, 2.05it/s]
|
1004 |
7%|β | 764/10682 [06:41<10:54:45, 3.96s/it]
|
1005 |
7%|β | 765/10682 [06:41<8:02:42, 2.92s/it]
|
1006 |
7%|β | 766/10682 [06:42<6:02:29, 2.19s/it]
|
1007 |
7%|β | 767/10682 [06:42<4:38:05, 1.68s/it]
|
1008 |
7%|β | 768/10682 [06:43<3:39:03, 1.33s/it]
|
1009 |
7%|β | 769/10682 [06:43<2:57:38, 1.08s/it]
|
1010 |
7%|β | 770/10682 [06:44<2:28:48, 1.11it/s]
|
1011 |
7%|β | 771/10682 [06:44<2:09:50, 1.27it/s]
|
1012 |
7%|β | 772/10682 [06:45<1:55:25, 1.43it/s]
|
1013 |
7%|β | 773/10682 [06:45<1:45:07, 1.57it/s]
|
1014 |
7%|β | 774/10682 [06:46<1:38:06, 1.68it/s]
|
1015 |
7%|β | 775/10682 [06:46<1:33:05, 1.77it/s]{'loss': 4.6275, 'grad_norm': 0.46725699305534363, 'learning_rate': 0.0007249766136576241, 'epoch': 1.02}
|
1016 |
|
1017 |
+
|
1018 |
7%|β | 775/10682 [06:46<1:33:05, 1.77it/s]
|
1019 |
7%|β | 776/10682 [06:47<1:29:45, 1.84it/s]
|
1020 |
7%|β | 777/10682 [06:47<1:27:14, 1.89it/s]
|
1021 |
7%|β | 778/10682 [06:48<1:25:30, 1.93it/s]
|
1022 |
7%|β | 779/10682 [06:48<1:24:15, 1.96it/s]
|
1023 |
7%|β | 780/10682 [06:49<1:23:16, 1.98it/s]
|
1024 |
7%|β | 781/10682 [06:49<1:22:32, 2.00it/s]
|
1025 |
7%|β | 782/10682 [06:50<1:22:18, 2.00it/s]
|
1026 |
7%|β | 783/10682 [06:50<1:21:57, 2.01it/s]
|
1027 |
7%|β | 784/10682 [06:51<1:21:51, 2.02it/s]
|
1028 |
7%|β | 785/10682 [06:51<1:21:38, 2.02it/s]
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6cf3be7689dc496c68d9042ba7ef79c5166b2cf3d4a5717ebfc52a3a1503f499
|
3 |
+
size 5112
|