|
slurm submission log: 2024-05-10 08:21:54.861722 |
|
created following sbatch script: |
|
|
|
############################### |
|
|
|
#!/bin/bash |
|
|
|
#SBATCH --account=nlp |
|
#SBATCH --cpus-per-task=16 |
|
#SBATCH --gres=gpu:2 |
|
#SBATCH --job-name=tthrush-job-4584668 |
|
#SBATCH --mem=400G |
|
#SBATCH --nodelist=sphinx1 |
|
#SBATCH --open-mode=append |
|
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/train_job_output.txt |
|
#SBATCH --partition=sphinx |
|
#SBATCH --time=14-0 |
|
|
|
# activate your desired anaconda environment |
|
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection |
|
|
|
# cd to working directory |
|
cd . |
|
|
|
# launch commands |
|
srun --unbuffered run_as_child_processes 'torchrun --master_port 29505 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy --output_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2' |
|
|
|
############################### |
|
|
|
submission to slurm complete! |
|
|
|
|
|
############################### |
|
slurm submission output |
|
|
|
Submitted batch job 7593615 |
|
|
|
|
|
|
|
############################### |
|
|
|
slurm submission log: 2024-05-10 08:23:21.003132 |
|
created following sbatch script: |
|
|
|
############################### |
|
|
|
#!/bin/bash |
|
|
|
#SBATCH --account=nlp |
|
#SBATCH --cpus-per-task=16 |
|
#SBATCH --gres=gpu:2 |
|
#SBATCH --job-name=tthrush-job-3288067 |
|
#SBATCH --mem=400G |
|
#SBATCH --nodelist=sphinx2 |
|
#SBATCH --open-mode=append |
|
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy/train_job_output.txt |
|
#SBATCH --partition=sphinx |
|
#SBATCH --time=14-0 |
|
|
|
# activate your desired anaconda environment |
|
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection |
|
|
|
# cd to working directory |
|
cd . |
|
|
|
# launch commands |
|
srun --unbuffered run_as_child_processes 'torchrun --master_port 29505 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy --output_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2' |
|
|
|
############################### |
|
|
|
submission to slurm complete! |
|
|
|
|
|
############################### |
|
slurm submission output |
|
|
|
Submitted batch job 7593628 |
|
|
|
|
|
|
|
############################### |
|
|
|
############################### |
|
start time: 2024-05-10 15:11:46.478529 |
|
machine: sphinx2 |
|
conda env: pretraining-coreset-selection |
|
############################### |
|
running following processes |
|
|
|
torchrun --master_port 29505 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy --output_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 |
|
|
|
|
|
############################### |
|
command outputs: |
|
|
|
|
|
[2024-05-10 15:11:48,365] torch.distributed.run: [WARNING] |
|
[2024-05-10 15:11:48,365] torch.distributed.run: [WARNING] ***************************************** |
|
[2024-05-10 15:11:48,365] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
|
[2024-05-10 15:11:48,365] torch.distributed.run: [WARNING] ***************************************** |
|
05/10/2024 15:11:58 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/arc_easy', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy', output_hub_id='pythia-70m_arc_easy', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) |
|
05/10/2024 15:11:58 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/arc_easy', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_arc_easy', output_hub_id='pythia-70m_arc_easy', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) |
|
0%| | 0/763 [00:00<?, ?it/s][rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) |
|
[rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) |
|
0%| | 1/763 [02:48<35:43:21, 168.77s/it]
0%| | 2/763 [03:01<16:13:36, 76.76s/it]
0%| | 3/763 [03:09<9:38:15, 45.65s/it]
1%| | 4/763 [03:15<6:20:08, 30.05s/it]
1%| | 5/763 [03:20<4:23:05, 20.83s/it]
1%| | 6/763 [03:24<3:09:34, 15.03s/it]
1%| | 7/763 [03:27<2:19:36, 11.08s/it]
1%| | 8/763 [03:29<1:44:25, 8.30s/it]
1%| | 9/763 [03:31<1:19:09, 6.30s/it]
1%|β | 10/763 [03:32<59:38, 4.75s/it]
1%|β | 11/763 [03:33<46:08, 3.68s/it]
2%|β | 12/763 [03:34<36:16, 2.90s/it]
2%|β | 13/763 [03:36<29:30, 2.36s/it]
2%|β | 14/763 [03:37<24:32, 1.97s/it]
2%|β | 15/763 [03:38<20:35, 1.65s/it]
2%|β | 16/763 [03:38<17:20, 1.39s/it]
2%|β | 17/763 [03:39<14:49, 1.19s/it]
2%|β | 18/763 [03:40<13:08, 1.06s/it]
2%|β | 19/763 [03:41<11:41, 1.06it/s]
3%|β | 20/763 [03:41<10:53, 1.14it/s]
3%|β | 21/763 [03:42<09:57, 1.24it/s]
3%|β | 22/763 [03:43<09:19, 1.32it/s]
3%|β | 23/763 [03:43<08:44, 1.41it/s]
3%|β | 24/763 [03:44<08:19, 1.48it/s]
3%|β | 25/763 [03:44<07:57, 1.55it/s]
{'loss': 9.7523, 'grad_norm': 1.2292073965072632, 'learning_rate': 0.0003246753246753247, 'epoch': 0.03} |
|
3%|β | 25/763 [03:44<07:57, 1.55it/s]
3%|β | 26/763 [03:45<07:45, 1.58it/s]
4%|β | 27/763 [03:45<07:21, 1.67it/s]
4%|β | 28/763 [03:46<07:31, 1.63it/s]
4%|β | 29/763 [03:47<08:27, 1.45it/s]
4%|β | 30/763 [03:48<08:08, 1.50it/s]
4%|β | 31/763 [03:48<07:44, 1.58it/s]
4%|β | 32/763 [03:49<07:32, 1.62it/s]
4%|β | 33/763 [03:49<07:11, 1.69it/s]
4%|β | 34/763 [03:50<06:54, 1.76it/s]
5%|β | 35/763 [03:50<06:44, 1.80it/s]
5%|β | 36/763 [03:51<06:40, 1.81it/s]
5%|β | 37/763 [03:51<06:33, 1.84it/s]
5%|β | 38/763 [03:52<06:30, 1.86it/s]
5%|β | 39/763 [03:52<06:24, 1.88it/s]
5%|β | 40/763 [03:53<06:17, 1.92it/s]
5%|β | 41/763 [03:53<06:12, 1.94it/s]
6%|β | 42/763 [03:54<06:10, 1.94it/s]
6%|β | 43/763 [03:54<06:10, 1.95it/s]
6%|β | 44/763 [03:55<06:06, 1.96it/s]
6%|β | 45/763 [03:55<06:06, 1.96it/s]
6%|β | 46/763 [03:56<06:06, 1.95it/s]
6%|β | 47/763 [03:56<06:03, 1.97it/s]
6%|β | 48/763 [03:57<06:02, 1.97it/s]
6%|β | 49/763 [03:57<06:00, 1.98it/s]
7%|β | 50/763 [03:58<05:58, 1.99it/s]
{'loss': 7.2948, 'grad_norm': 0.3099381625652313, 'learning_rate': 0.0006493506493506494, 'epoch': 0.07} |
|
7%|β | 50/763 [03:58<05:58, 1.99it/s]
7%|β | 51/763 [03:58<05:58, 1.98it/s]
7%|β | 52/763 [03:59<05:57, 1.99it/s]
7%|β | 53/763 [03:59<05:58, 1.98it/s]
7%|β | 54/763 [04:00<05:56, 1.99it/s]
7%|β | 55/763 [04:00<05:55, 1.99it/s]
7%|β | 56/763 [04:01<05:56, 1.98it/s]
7%|β | 57/763 [04:01<05:55, 1.99it/s]
8%|β | 58/763 [04:02<05:54, 1.99it/s]
8%|β | 59/763 [04:02<05:53, 1.99it/s]
8%|β | 60/763 [04:03<05:52, 2.00it/s]
8%|β | 61/763 [04:03<05:51, 2.00it/s]
8%|β | 62/763 [04:04<05:50, 2.00it/s]
8%|β | 63/763 [04:04<05:49, 2.00it/s]
8%|β | 64/763 [04:05<05:48, 2.01it/s]
9%|β | 65/763 [04:05<05:47, 2.01it/s]
9%|β | 66/763 [04:06<05:47, 2.01it/s]
9%|β | 67/763 [04:06<05:47, 2.01it/s]
9%|β | 68/763 [04:07<05:46, 2.01it/s]
9%|β | 69/763 [04:07<05:45, 2.01it/s]
9%|β | 70/763 [04:08<05:46, 2.00it/s]
9%|β | 71/763 [04:08<05:45, 2.01it/s]
9%|β | 72/763 [04:09<05:44, 2.01it/s]
10%|β | 73/763 [04:09<05:43, 2.01it/s]
10%|β | 74/763 [04:10<05:42, 2.01it/s]
10%|β | 75/763 [04:10<05:42, 2.01it/s]
{'loss': 6.067, 'grad_norm': 0.795758068561554, 'learning_rate': 0.000974025974025974, 'epoch': 0.1} |
|
10%|β | 75/763 [04:10<05:42, 2.01it/s]
10%|β | 76/763 [04:11<05:42, 2.01it/s]
10%|β | 77/763 [04:11<05:41, 2.01it/s]
10%|β | 78/763 [04:12<05:40, 2.01it/s]
10%|β | 79/763 [04:12<05:40, 2.01it/s]
10%|β | 80/763 [04:13<05:39, 2.01it/s]
11%|β | 81/763 [04:13<05:39, 2.01it/s]
11%|β | 82/763 [04:14<05:38, 2.01it/s]
11%|β | 83/763 [04:14<05:37, 2.01it/s]
11%|β | 84/763 [04:15<05:37, 2.01it/s]
11%|β | 85/763 [04:15<05:36, 2.01it/s]
11%|ββ | 86/763 [04:16<05:35, 2.01it/s]
11%|ββ | 87/763 [04:16<05:35, 2.02it/s]
12%|ββ | 88/763 [04:17<05:34, 2.02it/s]
12%|ββ | 89/763 [04:17<05:33, 2.02it/s]
12%|ββ | 90/763 [04:18<05:33, 2.02it/s]
12%|ββ | 91/763 [04:18<05:32, 2.02it/s]
12%|ββ | 92/763 [04:19<05:32, 2.02it/s]
12%|ββ | 93/763 [04:19<05:32, 2.02it/s]
12%|ββ | 94/763 [04:20<05:31, 2.02it/s]
12%|ββ | 95/763 [04:20<05:31, 2.02it/s]
13%|ββ | 96/763 [04:21<05:30, 2.02it/s]
13%|ββ | 97/763 [04:21<05:30, 2.02it/s]
13%|ββ | 98/763 [04:22<05:29, 2.02it/s]
13%|ββ | 99/763 [04:22<05:29, 2.02it/s]
13%|ββ | 100/763 [04:23<05:28, 2.02it/s]
{'loss': 5.4002, 'grad_norm': 0.2940315008163452, 'learning_rate': 0.0009972289418801728, 'epoch': 0.13} |
|
13%|ββ | 100/763 [04:23<05:28, 2.02it/s]
13%|ββ | 101/763 [04:23<05:29, 2.01it/s]
13%|ββ | 102/763 [04:24<05:28, 2.01it/s]
13%|ββ | 103/763 [04:24<05:27, 2.02it/s]
14%|ββ | 104/763 [04:25<05:26, 2.02it/s]
14%|ββ | 105/763 [04:25<05:26, 2.02it/s]
14%|ββ | 106/763 [04:26<05:25, 2.02it/s]
14%|ββ | 107/763 [04:26<05:25, 2.02it/s]
14%|ββ | 108/763 [04:27<05:24, 2.02it/s]
14%|ββ | 109/763 [04:27<05:23, 2.02it/s]
14%|ββ | 110/763 [04:28<05:23, 2.02it/s]
15%|ββ | 111/763 [04:28<05:22, 2.02it/s]
15%|ββ | 112/763 [04:29<05:22, 2.02it/s]
15%|ββ | 113/763 [04:29<05:21, 2.02it/s]
15%|ββ | 114/763 [04:30<05:21, 2.02it/s]
15%|ββ | 115/763 [04:30<05:20, 2.02it/s]
15%|ββ | 116/763 [04:31<05:20, 2.02it/s]
15%|ββ | 117/763 [04:31<05:20, 2.02it/s]
15%|ββ | 118/763 [04:32<05:19, 2.02it/s]
16%|ββ | 119/763 [04:32<05:19, 2.02it/s]
16%|ββ | 120/763 [04:33<05:18, 2.02it/s]
16%|ββ | 121/763 [04:33<05:18, 2.02it/s]
16%|ββ | 122/763 [04:34<05:17, 2.02it/s]
16%|ββ | 123/763 [04:34<05:17, 2.02it/s]
16%|ββ | 124/763 [04:35<05:16, 2.02it/s]
16%|ββ | 125/763 [04:35<05:16, 2.02it/s]
{'loss': 4.9596, 'grad_norm': 0.28205427527427673, 'learning_rate': 0.0009879683689693263, 'epoch': 0.16} |
|
16%|ββ | 125/763 [04:35<05:16, 2.02it/s]
17%|ββ | 126/763 [04:36<05:16, 2.01it/s]
17%|ββ | 127/763 [04:36<05:15, 2.02it/s]
17%|ββ | 128/763 [04:37<05:14, 2.02it/s]
17%|ββ | 129/763 [04:37<05:14, 2.02it/s]
17%|ββ | 130/763 [04:38<05:14, 2.01it/s]
17%|ββ | 131/763 [04:38<05:13, 2.02it/s]
17%|ββ | 132/763 [04:39<05:12, 2.02it/s]
17%|ββ | 133/763 [04:39<05:11, 2.02it/s]
18%|ββ | 134/763 [04:40<05:11, 2.02it/s]
18%|ββ | 135/763 [04:40<05:10, 2.02it/s]
18%|ββ | 136/763 [04:41<05:09, 2.02it/s]
18%|ββ | 137/763 [04:41<05:09, 2.02it/s]
18%|ββ | 138/763 [04:42<05:08, 2.02it/s]
18%|ββ | 139/763 [04:42<05:08, 2.02it/s]
18%|ββ | 140/763 [04:43<05:07, 2.02it/s]
18%|ββ | 141/763 [04:43<05:07, 2.03it/s]
19%|ββ | 142/763 [04:44<05:06, 2.02it/s]
19%|ββ | 143/763 [04:44<05:05, 2.03it/s]
19%|ββ | 144/763 [04:45<05:05, 2.03it/s]
19%|ββ | 145/763 [04:45<05:05, 2.02it/s]
19%|ββ | 146/763 [04:46<05:04, 2.03it/s]
19%|ββ | 147/763 [04:46<05:04, 2.02it/s]
19%|ββ | 148/763 [04:47<05:03, 2.03it/s]
20%|ββ | 149/763 [04:47<05:03, 2.02it/s]
20%|ββ | 150/763 [04:48<05:03, 2.02it/s]
{'loss': 4.6537, 'grad_norm': 0.25578466057777405, 'learning_rate': 0.0009723185625357323, 'epoch': 0.2} |
|
20%|ββ | 150/763 [04:48<05:03, 2.02it/s]
20%|ββ | 151/763 [04:48<05:02, 2.02it/s]
20%|ββ | 152/763 [04:49<05:01, 2.02it/s]
20%|ββ | 153/763 [04:49<05:01, 2.02it/s]
20%|ββ | 154/763 [04:50<05:00, 2.02it/s]
20%|ββ | 155/763 [04:50<05:00, 2.02it/s]
20%|ββ | 156/763 [04:51<04:59, 2.02it/s]
21%|ββ | 157/763 [04:51<04:59, 2.02it/s]
21%|ββ | 158/763 [04:52<04:58, 2.02it/s]
21%|ββ | 159/763 [04:52<04:58, 2.02it/s]
21%|ββ | 160/763 [04:53<04:58, 2.02it/s]
21%|ββ | 161/763 [04:53<04:57, 2.02it/s]
21%|ββ | 162/763 [04:54<04:57, 2.02it/s]
21%|βββ | 163/763 [04:54<04:56, 2.02it/s]
21%|βββ | 164/763 [04:54<04:55, 2.02it/s]
22%|βββ | 165/763 [04:55<04:55, 2.03it/s]
22%|βββ | 166/763 [04:55<04:54, 2.03it/s]
22%|βββ | 167/763 [04:56<04:54, 2.03it/s]
22%|βββ | 168/763 [04:56<04:53, 2.03it/s]
22%|βββ | 169/763 [04:57<04:53, 2.03it/s]
22%|βββ | 170/763 [04:57<04:52, 2.03it/s]
22%|βββ | 171/763 [04:58<04:52, 2.02it/s]
23%|βββ | 172/763 [04:58<04:52, 2.02it/s]
23%|βββ | 173/763 [04:59<04:51, 2.02it/s]
23%|βββ | 174/763 [04:59<04:50, 2.02it/s]
23%|βββ | 175/763 [05:00<04:50, 2.03it/s]{'loss': 4.3942, 'grad_norm': 0.4287571609020233, 'learning_rate': 0.0009504844339512095, 'epoch': 0.23} |
|
23%|βββ | 175/763 [05:00<04:50, 2.03it/s]
23%|βββ | 176/763 [05:00<04:50, 2.02it/s]
23%|βββ | 177/763 [05:01<04:49, 2.02it/s]
23%|βββ | 178/763 [05:01<04:49, 2.02it/s]
23%|βββ | 179/763 [05:02<04:48, 2.02it/s]
24%|βββ | 180/763 [05:02<04:48, 2.02it/s]
24%|βββ | 181/763 [05:03<04:47, 2.03it/s]
24%|βββ | 182/763 [05:03<04:46, 2.02it/s]
24%|βββ | 183/763 [05:04<04:46, 2.02it/s]
24%|βββ | 184/763 [05:04<04:45, 2.03it/s]
24%|βββ | 185/763 [05:05<04:45, 2.02it/s]
24%|βββ | 186/763 [05:05<04:44, 2.03it/s]
25%|βββ | 187/763 [05:06<04:44, 2.02it/s]
25%|βββ | 188/763 [05:06<04:43, 2.03it/s]
25%|βββ | 189/763 [05:07<04:43, 2.02it/s]
25%|βββ | 190/763 [05:07<04:42, 2.03it/s]
25%|βββ | 191/763 [05:08<04:42, 2.02it/s]
25%|βββ | 192/763 [05:08<04:41, 2.03it/s]
25%|βββ | 193/763 [05:09<04:41, 2.03it/s]
25%|βββ | 194/763 [05:09<04:40, 2.03it/s]
26%|βββ | 195/763 [05:10<04:40, 2.03it/s]
26%|βββ | 196/763 [05:10<04:39, 2.03it/s]
26%|βββ | 197/763 [05:11<04:39, 2.03it/s]
26%|βββ | 198/763 [05:11<04:38, 2.03it/s]
26%|βββ | 199/763 [05:12<04:38, 2.03it/s]
26%|βββ | 200/763 [05:12<04:37, 2.03it/s]
{'loss': 4.1683, 'grad_norm': 0.3253032863140106, 'learning_rate': 0.0009227518692591244, 'epoch': 0.26} |
|
26%|βββ | 200/763 [05:12<04:37, 2.03it/s]
26%|βββ | 201/763 [05:13<04:37, 2.03it/s]
26%|βββ | 202/763 [05:13<04:37, 2.02it/s]
27%|βββ | 203/763 [05:14<04:36, 2.03it/s]
27%|βββ | 204/763 [05:14<04:36, 2.02it/s]
27%|βββ | 205/763 [05:15<04:35, 2.03it/s]
27%|βββ | 206/763 [05:15<04:34, 2.03it/s]
27%|βββ | 207/763 [05:16<04:34, 2.02it/s]
27%|βββ | 208/763 [05:16<04:33, 2.03it/s]
27%|βββ | 209/763 [05:17<04:33, 2.03it/s]
28%|βββ | 210/763 [05:17<04:32, 2.03it/s]
28%|βββ | 211/763 [05:18<04:32, 2.03it/s]
28%|βββ | 212/763 [05:18<04:31, 2.03it/s]
28%|βββ | 213/763 [05:19<04:31, 2.02it/s]
28%|βββ | 214/763 [05:19<04:31, 2.03it/s]
28%|βββ | 215/763 [05:20<04:30, 2.03it/s]
28%|βββ | 216/763 [05:20<04:30, 2.03it/s]
28%|βββ | 217/763 [05:21<04:29, 2.03it/s]
29%|βββ | 218/763 [05:21<04:29, 2.02it/s]
29%|βββ | 219/763 [05:22<04:28, 2.02it/s]
29%|βββ | 220/763 [05:22<04:28, 2.02it/s]
29%|βββ | 221/763 [05:23<04:27, 2.02it/s]
29%|βββ | 222/763 [05:23<04:27, 2.02it/s]
29%|βββ | 223/763 [05:24<04:26, 2.03it/s]
29%|βββ | 224/763 [05:24<04:26, 2.02it/s]
29%|βββ | 225/763 [05:25<04:25, 2.02it/s]
{'loss': 3.9865, 'grad_norm': 0.383036732673645, 'learning_rate': 0.0008894839859139472, 'epoch': 0.29} |
|
29%|βββ | 225/763 [05:25<04:25, 2.02it/s]
30%|βββ | 226/763 [05:25<04:25, 2.02it/s]
30%|βββ | 227/763 [05:26<04:25, 2.02it/s]
30%|βββ | 228/763 [05:26<04:24, 2.02it/s]
30%|βββ | 229/763 [05:27<04:23, 2.02it/s]
30%|βββ | 230/763 [05:27<04:23, 2.02it/s]
30%|βββ | 231/763 [05:28<04:23, 2.02it/s]
30%|βββ | 232/763 [05:28<04:22, 2.02it/s]
31%|βββ | 233/763 [05:29<04:21, 2.02it/s]
31%|βββ | 234/763 [05:29<04:21, 2.03it/s]
31%|βββ | 235/763 [05:30<04:21, 2.02it/s]
31%|βββ | 236/763 [05:30<04:20, 2.02it/s]
31%|βββ | 237/763 [05:31<04:20, 2.02it/s]
31%|βββ | 238/763 [05:31<04:19, 2.02it/s]
31%|ββββ | 239/763 [05:32<04:18, 2.02it/s]
31%|ββββ | 240/763 [05:32<04:18, 2.02it/s]
32%|ββββ | 241/763 [05:33<04:17, 2.03it/s]
32%|ββββ | 242/763 [05:33<04:17, 2.02it/s]
32%|ββββ | 243/763 [05:34<04:16, 2.02it/s]
32%|ββββ | 244/763 [05:34<04:16, 2.02it/s]
32%|ββββ | 245/763 [05:34<04:15, 2.03it/s]
32%|ββββ | 246/763 [05:35<04:15, 2.02it/s]
32%|ββββ | 247/763 [05:35<04:14, 2.03it/s]
33%|ββββ | 248/763 [05:36<04:14, 2.03it/s]
33%|ββββ | 249/763 [05:36<04:13, 2.02it/s]
33%|ββββ | 250/763 [05:37<04:13, 2.02it/s]
{'loss': 3.8302, 'grad_norm': 0.3425499498844147, 'learning_rate': 0.0008511163782882168, 'epoch': 0.33} |
|
33%|ββββ | 250/763 [05:37<04:13, 2.02it/s]
33%|ββββ | 251/763 [05:37<04:13, 2.02it/s]
33%|ββββ | 252/763 [05:38<04:12, 2.02it/s]
33%|ββββ | 253/763 [05:38<04:11, 2.02it/s]
33%|ββββ | 254/763 [05:39<04:11, 2.02it/s]
33%|ββββ | 255/763 [05:39<04:10, 2.02it/s]
34%|ββββ | 256/763 [05:40<04:10, 2.03it/s]
34%|ββββ | 257/763 [05:40<04:09, 2.03it/s]
34%|ββββ | 258/763 [05:41<04:08, 2.03it/s]
34%|ββββ | 259/763 [05:41<04:08, 2.03it/s]
34%|ββββ | 260/763 [05:42<04:08, 2.03it/s]
34%|ββββ | 261/763 [05:42<04:07, 2.03it/s]
34%|ββββ | 262/763 [05:43<04:07, 2.03it/s]
34%|ββββ | 263/763 [05:43<04:06, 2.03it/s]
35%|ββββ | 264/763 [05:44<04:06, 2.03it/s]
35%|ββββ | 265/763 [05:44<04:05, 2.03it/s]
35%|ββββ | 266/763 [05:45<04:05, 2.03it/s]
35%|ββββ | 267/763 [05:45<04:04, 2.03it/s]
35%|ββββ | 268/763 [05:46<04:04, 2.03it/s]
35%|ββββ | 269/763 [05:46<04:03, 2.03it/s]
35%|ββββ | 270/763 [05:47<04:03, 2.03it/s]
36%|ββββ | 271/763 [05:47<04:02, 2.03it/s]
36%|ββββ | 272/763 [05:48<04:02, 2.03it/s]
36%|ββββ | 273/763 [05:48<04:02, 2.02it/s]
36%|ββββ | 274/763 [05:49<04:01, 2.03it/s]
36%|ββββ | 275/763 [05:49<04:00, 2.03it/s]{'loss': 3.7106, 'grad_norm': 0.39079391956329346, 'learning_rate': 0.0008081514142000517, 'epoch': 0.36} |
|
36%|ββββ | 275/763 [05:49<04:00, 2.03it/s]
36%|ββββ | 276/763 [05:50<04:00, 2.02it/s]
36%|ββββ | 277/763 [05:50<04:00, 2.02it/s]
36%|ββββ | 278/763 [05:51<03:59, 2.02it/s]
37%|ββββ | 279/763 [05:51<03:59, 2.02it/s]
37%|ββββ | 280/763 [05:52<03:58, 2.02it/s]
37%|ββββ | 281/763 [05:52<03:58, 2.02it/s]
37%|ββββ | 282/763 [05:53<03:57, 2.03it/s]
37%|ββββ | 283/763 [05:53<03:57, 2.02it/s]
37%|ββββ | 284/763 [05:54<03:56, 2.03it/s]
37%|ββββ | 285/763 [05:54<03:55, 2.03it/s]
37%|ββββ | 286/763 [05:55<03:55, 2.03it/s]
38%|ββββ | 287/763 [05:55<03:54, 2.03it/s]
38%|ββββ | 288/763 [05:56<03:54, 2.03it/s]
38%|ββββ | 289/763 [05:56<03:54, 2.03it/s]
38%|ββββ | 290/763 [05:57<03:53, 2.02it/s]
38%|ββββ | 291/763 [05:57<03:53, 2.02it/s]
38%|ββββ | 292/763 [05:58<03:53, 2.02it/s]
38%|ββββ | 293/763 [05:58<03:52, 2.02it/s]
39%|ββββ | 294/763 [05:59<03:51, 2.02it/s]
39%|ββββ | 295/763 [05:59<03:51, 2.02it/s]
39%|ββββ | 296/763 [06:00<03:51, 2.02it/s]
39%|ββββ | 297/763 [06:00<03:50, 2.02it/s]
39%|ββββ | 298/763 [06:01<03:50, 2.02it/s]
39%|ββββ | 299/763 [06:01<03:49, 2.02it/s]
39%|ββββ | 300/763 [06:02<03:48, 2.02it/s]
{'loss': 3.5989, 'grad_norm': 0.3694958984851837, 'learning_rate': 0.0007611516571398591, 'epoch': 0.39}
|
|
39%|ββββ | 300/763 [06:02<03:48, 2.02it/s]
39%|ββββ | 301/763 [06:02<03:48, 2.02it/s]
40%|ββββ | 302/763 [06:03<03:48, 2.02it/s]
40%|ββββ | 303/763 [06:03<03:47, 2.02it/s]
40%|ββββ | 304/763 [06:04<03:46, 2.02it/s]
40%|ββββ | 305/763 [06:04<03:46, 2.03it/s]
40%|ββββ | 306/763 [06:05<03:45, 2.02it/s]
40%|ββββ | 307/763 [06:05<03:45, 2.02it/s]
40%|ββββ | 308/763 [06:06<03:44, 2.02it/s]
40%|ββββ | 309/763 [06:06<03:44, 2.02it/s]
41%|ββββ | 310/763 [06:07<03:43, 2.02it/s]
41%|ββββ | 311/763 [06:07<03:43, 2.02it/s]
41%|ββββ | 312/763 [06:08<03:42, 2.03it/s]
41%|ββββ | 313/763 [06:08<03:42, 2.03it/s]
41%|ββββ | 314/763 [06:09<03:41, 2.03it/s]
41%|βββββ | 315/763 [06:09<03:41, 2.03it/s]
41%|βββββ | 316/763 [06:10<03:40, 2.03it/s]
42%|βββββ | 317/763 [06:10<03:40, 2.03it/s]
42%|βββββ | 318/763 [06:11<03:39, 2.03it/s]
42%|βββββ | 319/763 [06:11<03:39, 2.03it/s]
42%|βββββ | 320/763 [06:12<03:38, 2.03it/s]
42%|βββββ | 321/763 [06:12<03:37, 2.03it/s]
42%|βββββ | 322/763 [06:13<03:37, 2.03it/s]
42%|βββββ | 323/763 [06:13<03:36, 2.03it/s]
42%|βββββ | 324/763 [06:14<03:36, 2.03it/s]
43%|βββββ | 325/763 [06:14<03:36, 2.03it/s]
{'loss': 3.4821, 'grad_norm': 0.3835093379020691, 'learning_rate': 0.000710732500322581, 'epoch': 0.43} |
|
43%|βββββ | 325/763 [06:14<03:36, 2.03it/s]
43%|βββββ | 326/763 [06:15<03:36, 2.02it/s]
43%|βββββ | 327/763 [06:15<03:35, 2.02it/s]
43%|βββββ | 328/763 [06:15<03:35, 2.02it/s]
43%|βββββ | 329/763 [06:16<03:34, 2.02it/s]
43%|βββββ | 330/763 [06:16<03:34, 2.02it/s]
43%|βββββ | 331/763 [06:17<03:33, 2.02it/s]
44%|βββββ | 332/763 [06:17<03:33, 2.02it/s]
44%|βββββ | 333/763 [06:18<03:32, 2.03it/s]
44%|βββββ | 334/763 [06:18<03:31, 2.02it/s]
44%|βββββ | 335/763 [06:19<03:31, 2.03it/s]
44%|βββββ | 336/763 [06:19<03:31, 2.02it/s]
44%|βββββ | 337/763 [06:20<03:30, 2.02it/s]
44%|βββββ | 338/763 [06:20<03:30, 2.02it/s]
44%|βββββ | 339/763 [06:21<03:29, 2.02it/s]
45%|βββββ | 340/763 [06:21<03:29, 2.02it/s]
45%|βββββ | 341/763 [06:22<03:28, 2.03it/s]
45%|βββββ | 342/763 [06:22<03:27, 2.03it/s]
45%|βββββ | 343/763 [06:23<03:27, 2.03it/s]
45%|βββββ | 344/763 [06:23<03:26, 2.03it/s]
45%|βββββ | 345/763 [06:24<03:26, 2.03it/s]
45%|βββββ | 346/763 [06:24<03:25, 2.03it/s]
45%|βββββ | 347/763 [06:25<03:25, 2.02it/s]
46%|βββββ | 348/763 [06:25<03:25, 2.02it/s]
46%|βββββ | 349/763 [06:26<03:24, 2.02it/s]
46%|βββββ | 350/763 [06:26<03:24, 2.02it/s]{'loss': 3.3953, 'grad_norm': 0.3727777600288391, 'learning_rate': 0.0006575541090118104, 'epoch': 0.46}
|
|
46%|βββββ | 350/763 [06:26<03:24, 2.02it/s]
46%|βββββ | 351/763 [06:27<03:24, 2.02it/s]
46%|βββββ | 352/763 [06:27<03:23, 2.02it/s]
46%|βββββ | 353/763 [06:28<03:22, 2.02it/s]
46%|βββββ | 354/763 [06:28<03:21, 2.03it/s]
47%|βββββ | 355/763 [06:29<03:21, 2.02it/s]
47%|βββββ | 356/763 [06:29<03:21, 2.02it/s]
47%|βββββ | 357/763 [06:30<03:20, 2.03it/s]
47%|βββββ | 358/763 [06:30<03:20, 2.02it/s]
47%|βββββ | 359/763 [06:31<03:19, 2.02it/s]
47%|βββββ | 360/763 [06:31<03:19, 2.02it/s]
47%|βββββ | 361/763 [06:32<03:18, 2.02it/s]
47%|βββββ | 362/763 [06:32<03:18, 2.02it/s]
48%|βββββ | 363/763 [06:33<03:17, 2.02it/s]
48%|βββββ | 364/763 [06:33<03:17, 2.02it/s]
48%|βββββ | 365/763 [06:34<03:16, 2.02it/s]
48%|βββββ | 366/763 [06:34<03:16, 2.02it/s]
48%|βββββ | 367/763 [06:35<03:15, 2.02it/s]
48%|βββββ | 368/763 [06:35<03:15, 2.02it/s]
48%|βββββ | 369/763 [06:36<03:14, 2.02it/s]
48%|βββββ | 370/763 [06:36<03:14, 2.02it/s]
49%|βββββ | 371/763 [06:37<03:13, 2.02it/s]
49%|βββββ | 372/763 [06:37<03:13, 2.02it/s]
49%|βββββ | 373/763 [06:38<03:13, 2.02it/s]
49%|βββββ | 374/763 [06:38<03:12, 2.02it/s]
49%|βββββ | 375/763 [06:39<03:11, 2.02it/s]
{'loss': 3.303, 'grad_norm': 0.35355713963508606, 'learning_rate': 0.0006023127766192824, 'epoch': 0.49} |
|
49%|βββββ | 375/763 [06:39<03:11, 2.02it/s]
49%|βββββ | 376/763 [06:39<03:11, 2.02it/s]
49%|βββββ | 377/763 [06:40<03:10, 2.02it/s]
50%|βββββ | 378/763 [06:40<03:10, 2.02it/s]
50%|βββββ | 379/763 [06:41<03:10, 2.02it/s]
50%|βββββ | 380/763 [06:41<03:09, 2.02it/s]
50%|βββββ | 381/763 [06:42<03:09, 2.02it/s]
50%|βββββ | 382/763 [06:42<03:08, 2.02it/s]
50%|βββββ | 383/763 [06:43<03:08, 2.02it/s]
50%|βββββ | 384/763 [06:43<03:07, 2.02it/s]
50%|βββββ | 385/763 [06:44<03:07, 2.02it/s]
51%|βββββ | 386/763 [06:44<03:06, 2.02it/s]
51%|βββββ | 387/763 [06:45<03:06, 2.02it/s]
51%|βββββ | 388/763 [06:45<03:05, 2.02it/s]
51%|βββββ | 389/763 [06:46<03:05, 2.02it/s]
51%|βββββ | 390/763 [06:46<03:04, 2.02it/s]
51%|βββββ | 391/763 [06:47<03:03, 2.02it/s]
51%|ββββββ | 392/763 [06:47<03:03, 2.02it/s]
52%|ββββββ | 393/763 [06:48<03:03, 2.02it/s]
52%|ββββββ | 394/763 [06:48<03:02, 2.02it/s]
52%|ββββββ | 395/763 [06:49<03:01, 2.03it/s]
52%|ββββββ | 396/763 [06:49<03:01, 2.02it/s]
52%|ββββββ | 397/763 [06:50<03:00, 2.02it/s]
52%|ββββββ | 398/763 [06:50<03:00, 2.02it/s]
52%|ββββββ | 399/763 [06:51<02:59, 2.02it/s]
52%|ββββββ | 400/763 [06:51<02:59, 2.02it/s]
{'loss': 3.2386, 'grad_norm': 0.35548439621925354, 'learning_rate': 0.0005457318077590012, 'epoch': 0.52} |
|
52%|ββββββ | 400/763 [06:51<02:59, 2.02it/s]
53%|ββββββ | 401/763 [06:52<02:59, 2.02it/s]
53%|ββββββ | 402/763 [06:52<02:58, 2.02it/s]
53%|ββββββ | 403/763 [06:53<02:58, 2.02it/s]
53%|ββββββ | 404/763 [06:53<02:57, 2.02it/s]
53%|ββββββ | 405/763 [06:54<02:56, 2.02it/s]
53%|ββββββ | 406/763 [06:54<02:56, 2.02it/s]
53%|ββββββ | 407/763 [06:55<02:56, 2.02it/s]
53%|ββββββ | 408/763 [06:55<02:55, 2.02it/s]
54%|ββββββ | 409/763 [06:56<02:55, 2.02it/s]
54%|ββββββ | 410/763 [06:56<02:54, 2.02it/s]
54%|ββββββ | 411/763 [06:57<02:54, 2.02it/s]
54%|ββββββ | 412/763 [06:57<02:53, 2.02it/s]
54%|ββββββ | 413/763 [06:58<02:53, 2.02it/s]
54%|ββββββ | 414/763 [06:58<02:52, 2.02it/s]
54%|ββββββ | 415/763 [06:59<02:52, 2.02it/s]
55%|ββββββ | 416/763 [06:59<02:51, 2.02it/s]
55%|ββββββ | 417/763 [06:59<02:50, 2.03it/s]
55%|ββββββ | 418/763 [07:00<02:50, 2.02it/s]
55%|ββββββ | 419/763 [07:00<02:49, 2.03it/s]
55%|ββββββ | 420/763 [07:01<02:49, 2.02it/s]
55%|ββββββ | 421/763 [07:01<02:49, 2.02it/s]
55%|ββββββ | 422/763 [07:02<02:48, 2.02it/s]
55%|ββββββ | 423/763 [07:02<02:48, 2.02it/s]
56%|ββββββ | 424/763 [07:03<02:47, 2.02it/s]
56%|ββββββ | 425/763 [07:03<02:47, 2.02it/s]{'loss': 3.1794, 'grad_norm': 0.37164270877838135, 'learning_rate': 0.0004885520476290998, 'epoch': 0.56} |
|
56%|ββββββ | 425/763 [07:03<02:47, 2.02it/s]
56%|ββββββ | 426/763 [07:04<02:46, 2.02it/s]
56%|ββββββ | 427/763 [07:04<02:46, 2.02it/s]
56%|ββββββ | 428/763 [07:05<02:45, 2.02it/s]
56%|ββββββ | 429/763 [07:05<02:45, 2.02it/s]
56%|ββββββ | 430/763 [07:06<02:44, 2.02it/s]
56%|ββββββ | 431/763 [07:06<02:44, 2.02it/s]
57%|ββββββ | 432/763 [07:07<02:43, 2.02it/s]
57%|ββββββ | 433/763 [07:07<02:43, 2.02it/s]
57%|ββββββ | 434/763 [07:08<02:42, 2.02it/s]
57%|ββββββ | 435/763 [07:08<02:42, 2.02it/s]
57%|ββββββ | 436/763 [07:09<02:41, 2.02it/s]
57%|ββββββ | 437/763 [07:09<02:41, 2.02it/s]
57%|ββββββ | 438/763 [07:10<02:40, 2.02it/s]
58%|ββββββ | 439/763 [07:10<02:40, 2.02it/s]
58%|ββββββ | 440/763 [07:11<02:39, 2.02it/s]
58%|ββββββ | 441/763 [07:11<02:39, 2.02it/s]
58%|ββββββ | 442/763 [07:12<02:38, 2.03it/s]
58%|ββββββ | 443/763 [07:12<02:38, 2.02it/s]
58%|ββββββ | 444/763 [07:13<02:37, 2.02it/s]
58%|ββββββ | 445/763 [07:13<02:37, 2.02it/s]
58%|ββββββ | 446/763 [07:14<02:36, 2.02it/s]
59%|ββββββ | 447/763 [07:14<02:36, 2.02it/s]
59%|ββββββ | 448/763 [07:15<02:35, 2.02it/s]
59%|ββββββ | 449/763 [07:15<02:35, 2.02it/s]
59%|ββββββ | 450/763 [07:16<02:34, 2.03it/s]{'loss': 3.1029, 'grad_norm': 0.3509822487831116, 'learning_rate': 0.00043152218172535383, 'epoch': 0.59} |
|
59%|ββββββ | 450/763 [07:16<02:34, 2.03it/s]
59%|ββββββ | 451/763 [07:16<02:34, 2.02it/s]
59%|ββββββ | 452/763 [07:17<02:33, 2.02it/s]
59%|ββββββ | 453/763 [07:17<02:33, 2.02it/s]
60%|ββββββ | 454/763 [07:18<02:32, 2.02it/s]
60%|ββββββ | 455/763 [07:18<02:32, 2.02it/s]
60%|ββββββ | 456/763 [07:19<02:31, 2.02it/s]
60%|ββββββ | 457/763 [07:19<02:31, 2.02it/s]
60%|ββββββ | 458/763 [07:20<02:30, 2.02it/s]
60%|ββββββ | 459/763 [07:20<02:30, 2.02it/s]
60%|ββββββ | 460/763 [07:21<02:29, 2.02it/s]
60%|ββββββ | 461/763 [07:21<02:29, 2.02it/s]
61%|ββββββ | 462/763 [07:22<02:28, 2.02it/s]
61%|ββββββ | 463/763 [07:22<02:28, 2.03it/s]
61%|ββββββ | 464/763 [07:23<02:27, 2.02it/s]
61%|ββββββ | 465/763 [07:23<02:27, 2.02it/s]
61%|ββββββ | 466/763 [07:24<02:26, 2.02it/s]
61%|ββββββ | 467/763 [07:24<02:26, 2.02it/s]
61%|βββββββ | 468/763 [07:25<02:25, 2.02it/s]
61%|βββββββ | 469/763 [07:25<02:25, 2.03it/s]
62%|βββββββ | 470/763 [07:26<02:24, 2.03it/s]
62%|βββββββ | 471/763 [07:26<02:24, 2.03it/s]
62%|βββββββ | 472/763 [07:27<02:23, 2.03it/s]
62%|βββββββ | 473/763 [07:27<02:23, 2.03it/s]
62%|βββββββ | 474/763 [07:28<02:22, 2.03it/s]
62%|βββββββ | 475/763 [07:28<02:22, 2.03it/s]
{'loss': 3.0648, 'grad_norm': 0.37363898754119873, 'learning_rate': 0.0003753889328974423, 'epoch': 0.62} |
|
62%|βββββββ | 475/763 [07:28<02:22, 2.03it/s]
62%|βββββββ | 476/763 [07:29<02:21, 2.02it/s]
63%|βββββββ | 477/763 [07:29<02:21, 2.02it/s]
63%|βββββββ | 478/763 [07:30<02:20, 2.02it/s]
63%|βββββββ | 479/763 [07:30<02:20, 2.02it/s]
63%|βββββββ | 480/763 [07:31<02:20, 2.02it/s]
63%|βββββββ | 481/763 [07:31<02:19, 2.02it/s]
63%|βββββββ | 482/763 [07:32<02:19, 2.02it/s]
63%|βββββββ | 483/763 [07:32<02:18, 2.02it/s]
63%|βββββββ | 484/763 [07:33<02:17, 2.03it/s]
64%|βββββββ | 485/763 [07:33<02:17, 2.02it/s]
64%|βββββββ | 486/763 [07:34<02:16, 2.03it/s]
64%|βββββββ | 487/763 [07:34<02:16, 2.02it/s]
64%|βββββββ | 488/763 [07:35<02:15, 2.02it/s]
64%|βββββββ | 489/763 [07:35<02:15, 2.03it/s]
64%|βββββββ | 490/763 [07:36<02:14, 2.03it/s]
64%|βββββββ | 491/763 [07:36<02:14, 2.03it/s]
64%|βββββββ | 492/763 [07:37<02:13, 2.02it/s]
65%|βββββββ | 493/763 [07:37<02:13, 2.03it/s]
65%|βββββββ | 494/763 [07:38<02:12, 2.02it/s]
65%|βββββββ | 495/763 [07:38<02:12, 2.03it/s]
65%|βββββββ | 496/763 [07:39<02:11, 2.03it/s]
65%|βββββββ | 497/763 [07:39<02:11, 2.02it/s]
65%|βββββββ | 498/763 [07:40<02:10, 2.02it/s]
65%|βββββββ | 499/763 [07:40<02:10, 2.02it/s]
66%|βββββββ | 500/763 [07:41<02:09, 2.02it/s]{'loss': 3.0023, 'grad_norm': 0.36063244938850403, 'learning_rate': 0.00032088728410319416, 'epoch': 0.66} |
|
66%|βββββββ | 500/763 [07:41<02:09, 2.02it/s]
66%|βββββββ | 501/763 [07:41<02:09, 2.02it/s]
66%|βββββββ | 502/763 [07:41<02:09, 2.02it/s]
66%|βββββββ | 503/763 [07:42<02:08, 2.02it/s]
66%|βββββββ | 504/763 [07:42<02:08, 2.02it/s]
66%|βββββββ | 505/763 [07:43<02:07, 2.02it/s]
66%|βββββββ | 506/763 [07:43<02:07, 2.02it/s]
66%|βββββββ | 507/763 [07:44<02:07, 2.02it/s]
67%|βββββββ | 508/763 [07:44<02:07, 2.01it/s]
67%|βββββββ | 509/763 [07:45<02:06, 2.01it/s]
67%|βββββββ | 510/763 [07:45<02:05, 2.01it/s]
67%|βββββββ | 511/763 [07:46<02:05, 2.01it/s]
67%|βββββββ | 512/763 [07:46<02:04, 2.01it/s]
67%|βββββββ | 513/763 [07:47<02:04, 2.01it/s]
67%|βββββββ | 514/763 [07:47<02:03, 2.02it/s]
67%|βββββββ | 515/763 [07:48<02:02, 2.02it/s]
68%|βββββββ | 516/763 [07:48<02:02, 2.02it/s]
68%|βββββββ | 517/763 [07:49<02:01, 2.02it/s]
68%|βββββββ | 518/763 [07:49<02:01, 2.02it/s]
68%|βββββββ | 519/763 [07:50<02:00, 2.02it/s]
68%|βββββββ | 520/763 [07:50<02:00, 2.02it/s]
68%|βββββββ | 521/763 [07:51<01:59, 2.02it/s]
68%|βββββββ | 522/763 [07:51<01:59, 2.02it/s]
69%|βββββββ | 523/763 [07:52<01:58, 2.03it/s]
69%|βββββββ | 524/763 [07:52<01:58, 2.02it/s]
69%|βββββββ | 525/763 [07:53<01:57, 2.03it/s]
{'loss': 2.9534, 'grad_norm': 0.3486718237400055, 'learning_rate': 0.0002687308548795825, 'epoch': 0.69}
|
|
69%|βββββββ | 525/763 [07:53<01:57, 2.03it/s]
69%|βββββββ | 526/763 [07:53<01:57, 2.02it/s]
69%|βββββββ | 527/763 [07:54<01:56, 2.02it/s]
69%|βββββββ | 528/763 [07:54<01:56, 2.02it/s]
69%|βββββββ | 529/763 [07:55<01:55, 2.02it/s]
69%|βββββββ | 530/763 [07:55<01:55, 2.02it/s]
70%|βββββββ | 531/763 [07:56<01:54, 2.02it/s]
70%|βββββββ | 532/763 [07:56<01:54, 2.02it/s]
70%|βββββββ | 533/763 [07:57<01:53, 2.02it/s]
70%|βββββββ | 534/763 [07:57<01:54, 2.00it/s]
70%|βββββββ | 535/763 [07:58<01:54, 1.99it/s]
70%|βββββββ | 536/763 [07:58<01:54, 1.98it/s]
70%|βββββββ | 537/763 [07:59<01:54, 1.98it/s]
71%|βββββββ | 538/763 [07:59<01:54, 1.97it/s]
71%|βββββββ | 539/763 [08:00<01:53, 1.98it/s]
71%|βββββββ | 540/763 [08:00<01:52, 1.99it/s]
71%|βββββββ | 541/763 [08:01<01:50, 2.00it/s]
71%|βββββββ | 542/763 [08:01<01:50, 2.01it/s]
71%|βββββββ | 543/763 [08:02<01:49, 2.01it/s]
71%|ββββββββ | 544/763 [08:02<01:48, 2.02it/s]
71%|ββββββββ | 545/763 [08:03<01:48, 2.02it/s]
72%|ββββββββ | 546/763 [08:03<01:47, 2.02it/s]
72%|ββββββββ | 547/763 [08:04<01:46, 2.02it/s]
72%|ββββββββ | 548/763 [08:04<01:46, 2.02it/s]
72%|ββββββββ | 549/763 [08:05<01:45, 2.02it/s]
72%|ββββββββ | 550/763 [08:05<01:45, 2.02it/s]{'loss': 2.9111, 'grad_norm': 0.34614884853363037, 'learning_rate': 0.00021960255753653008, 'epoch': 0.72}
|
|
72%|ββββββββ | 550/763 [08:05<01:45, 2.02it/s]
72%|ββββββββ | 551/763 [08:06<01:45, 2.02it/s]
72%|ββββββββ | 552/763 [08:06<01:44, 2.02it/s]
72%|ββββββββ | 553/763 [08:07<01:43, 2.02it/s]
73%|ββββββββ | 554/763 [08:07<01:43, 2.02it/s]
73%|ββββββββ | 555/763 [08:08<01:42, 2.02it/s]
73%|ββββββββ | 556/763 [08:08<01:42, 2.02it/s]
73%|ββββββββ | 557/763 [08:09<01:41, 2.02it/s]
73%|ββββββββ | 558/763 [08:09<01:41, 2.02it/s]
73%|ββββββββ | 559/763 [08:10<01:40, 2.02it/s]
73%|ββββββββ | 560/763 [08:10<01:40, 2.02it/s]
74%|ββββββββ | 561/763 [08:11<01:39, 2.02it/s]
74%|ββββββββ | 562/763 [08:11<01:39, 2.02it/s]
74%|ββββββββ | 563/763 [08:12<01:38, 2.02it/s]
74%|ββββββββ | 564/763 [08:12<01:38, 2.02it/s]
74%|ββββββββ | 565/763 [08:13<01:37, 2.02it/s]
74%|ββββββββ | 566/763 [08:13<01:37, 2.02it/s]
74%|ββββββββ | 567/763 [08:14<01:36, 2.02it/s]
74%|ββββββββ | 568/763 [08:14<01:36, 2.02it/s]
75%|ββββββββ | 569/763 [08:15<01:35, 2.02it/s]
75%|ββββββββ | 570/763 [08:15<01:35, 2.02it/s]
75%|ββββββββ | 571/763 [08:16<01:34, 2.02it/s]
75%|ββββββββ | 572/763 [08:16<01:34, 2.03it/s]
75%|ββββββββ | 573/763 [08:17<01:33, 2.02it/s]
75%|ββββββββ | 574/763 [08:17<01:33, 2.02it/s]
75%|ββββββββ | 575/763 [08:18<01:32, 2.02it/s]
{'loss': 2.8796, 'grad_norm': 0.354743093252182, 'learning_rate': 0.00017414565541703342, 'epoch': 0.75} |
|
75%|ββββββββ | 575/763 [08:18<01:32, 2.02it/s]
75%|ββββββββ | 576/763 [08:18<01:32, 2.02it/s]
76%|ββββββββ | 577/763 [08:19<01:32, 2.02it/s]
76%|ββββββββ | 578/763 [08:19<01:31, 2.02it/s]
76%|ββββββββ | 579/763 [08:20<01:31, 2.02it/s]
76%|ββββββββ | 580/763 [08:20<01:30, 2.02it/s]
76%|ββββββββ | 581/763 [08:21<01:30, 2.02it/s]
76%|ββββββββ | 582/763 [08:21<01:29, 2.02it/s]
76%|ββββββββ | 583/763 [08:22<01:28, 2.02it/s]
77%|ββββββββ | 584/763 [08:22<01:28, 2.02it/s]
77%|ββββββββ | 585/763 [08:23<01:28, 2.02it/s]
77%|ββββββββ | 586/763 [08:23<01:27, 2.02it/s]
77%|ββββββββ | 587/763 [08:24<01:27, 2.02it/s]
77%|ββββββββ | 588/763 [08:24<01:26, 2.02it/s]
77%|ββββββββ | 589/763 [08:25<01:26, 2.02it/s]
77%|ββββββββ | 590/763 [08:25<01:25, 2.02it/s]
77%|ββββββββ | 591/763 [08:26<01:25, 2.02it/s]
78%|ββββββββ | 592/763 [08:26<01:24, 2.02it/s]
78%|ββββββββ | 593/763 [08:27<01:24, 2.02it/s]
78%|ββββββββ | 594/763 [08:27<01:23, 2.03it/s]
78%|ββββββββ | 595/763 [08:28<01:23, 2.02it/s]
78%|ββββββββ | 596/763 [08:28<01:22, 2.02it/s]
78%|ββββββββ | 597/763 [08:29<01:22, 2.02it/s]
78%|ββββββββ | 598/763 [08:29<01:21, 2.02it/s]
79%|ββββββββ | 599/763 [08:30<01:21, 2.02it/s]
79%|ββββββββ | 600/763 [08:30<01:20, 2.02it/s]
{'loss': 2.848, 'grad_norm': 0.35213974118232727, 'learning_rate': 0.0001329553403026331, 'epoch': 0.79}
|
|
79%|ββββββββ | 600/763 [08:30<01:20, 2.02it/s]
79%|ββββββββ | 601/763 [08:31<01:20, 2.02it/s]
79%|ββββββββ | 602/763 [08:31<01:19, 2.02it/s]
79%|ββββββββ | 603/763 [08:32<01:19, 2.02it/s]
79%|ββββββββ | 604/763 [08:32<01:18, 2.02it/s]
79%|ββββββββ | 605/763 [08:33<01:18, 2.02it/s]
79%|ββββββββ | 606/763 [08:33<01:17, 2.02it/s]
80%|ββββββββ | 607/763 [08:34<01:17, 2.02it/s]
80%|ββββββββ | 608/763 [08:34<01:16, 2.02it/s]
80%|ββββββββ | 609/763 [08:35<01:22, 1.86it/s]
80%|ββββββββ | 610/763 [08:35<01:20, 1.91it/s]
80%|ββββββββ | 611/763 [08:36<01:18, 1.94it/s]
80%|ββββββββ | 612/763 [08:36<01:16, 1.97it/s]
80%|ββββββββ | 613/763 [08:37<01:15, 1.98it/s]
80%|ββββββββ | 614/763 [08:37<01:14, 1.99it/s]
81%|ββββββββ | 615/763 [08:38<01:13, 2.00it/s]
81%|ββββββββ | 616/763 [08:38<01:19, 1.85it/s]
81%|ββββββββ | 617/763 [08:39<01:16, 1.90it/s]
81%|ββββββββ | 618/763 [08:39<01:14, 1.94it/s]
81%|ββββββββ | 619/763 [08:40<01:13, 1.96it/s]
81%|βββββββββ | 620/763 [08:40<01:12, 1.98it/s]
81%|βββββββββ | 621/763 [08:41<01:11, 1.99it/s]
82%|βββββββββ | 622/763 [08:41<01:10, 2.00it/s]
82%|βββββββββ | 623/763 [08:42<01:09, 2.01it/s]
82%|βββββββββ | 624/763 [08:42<01:08, 2.02it/s]
82%|βββββββββ | 625/763 [08:43<01:08, 2.02it/s]{'loss': 2.8272, 'grad_norm': 0.3479262888431549, 'learning_rate': 9.657093924581261e-05, 'epoch': 0.82}
|
|
82%|βββββββββ | 625/763 [08:43<01:08, 2.02it/s]
82%|βββββββββ | 626/763 [08:43<01:07, 2.02it/s]
82%|βββββββββ | 627/763 [08:44<01:07, 2.02it/s]
82%|βββββββββ | 628/763 [08:44<01:06, 2.02it/s]
82%|βββββββββ | 629/763 [08:45<01:06, 2.02it/s]
83%|βββββββββ | 630/763 [08:45<01:05, 2.02it/s]
83%|βββββββββ | 631/763 [08:46<01:05, 2.02it/s]
83%|βββββββββ | 632/763 [08:46<01:04, 2.02it/s]
83%|βββββββββ | 633/763 [08:47<01:04, 2.02it/s]
83%|βββββββββ | 634/763 [08:47<01:03, 2.02it/s]
83%|βββββββββ | 635/763 [08:48<01:03, 2.02it/s]
83%|βββββββββ | 636/763 [08:48<01:02, 2.02it/s]
83%|βββββββββ | 637/763 [08:49<01:02, 2.02it/s]
84%|βββββββββ | 638/763 [08:49<01:01, 2.02it/s]
84%|βββββββββ | 639/763 [08:50<01:01, 2.02it/s]
84%|βββββββββ | 640/763 [08:50<01:00, 2.02it/s]
84%|βββββββββ | 641/763 [08:51<01:00, 2.02it/s]
84%|βββββββββ | 642/763 [08:51<00:59, 2.02it/s]
84%|βββββββββ | 643/763 [08:52<00:59, 2.02it/s]
84%|βββββββββ | 644/763 [08:52<00:58, 2.02it/s]
85%|βββββββββ | 645/763 [08:53<00:58, 2.02it/s]
85%|βββββββββ | 646/763 [08:53<00:57, 2.02it/s]
85%|βββββββββ | 647/763 [08:54<00:57, 2.02it/s]
85%|βββββββββ | 648/763 [08:54<00:56, 2.02it/s]
85%|βββββββββ | 649/763 [08:55<00:56, 2.02it/s]
85%|βββββββββ | 650/763 [08:55<00:55, 2.02it/s]
{'loss': 2.8167, 'grad_norm': 0.3358338475227356, 'learning_rate': 6.546885286948184e-05, 'epoch': 0.85} |
|
85%|βββββββββ | 650/763 [08:55<00:55, 2.02it/s]
85%|βββββββββ | 651/763 [08:56<00:55, 2.02it/s]
85%|βββββββββ | 652/763 [08:56<00:54, 2.02it/s]
86%|βββββββββ | 653/763 [08:57<00:54, 2.02it/s]
86%|βββββββββ | 654/763 [08:57<00:53, 2.02it/s]
86%|βββββββββ | 655/763 [08:58<00:53, 2.02it/s]
86%|βββββββββ | 656/763 [08:58<00:52, 2.02it/s]
86%|βββββββββ | 657/763 [08:59<00:52, 2.02it/s]
86%|βββββββββ | 658/763 [08:59<00:51, 2.02it/s]
86%|βββββββββ | 659/763 [09:00<00:51, 2.02it/s]
87%|βββββββββ | 660/763 [09:00<00:50, 2.02it/s]
87%|βββββββββ | 661/763 [09:00<00:50, 2.03it/s]
87%|βββββββββ | 662/763 [09:01<00:49, 2.02it/s]
87%|βββββββββ | 663/763 [09:01<00:49, 2.02it/s]
87%|βββββββββ | 664/763 [09:02<00:48, 2.02it/s]
87%|βββββββββ | 665/763 [09:02<00:48, 2.02it/s]
87%|βββββββββ | 666/763 [09:03<00:47, 2.02it/s]
87%|βββββββββ | 667/763 [09:03<00:47, 2.02it/s]
88%|βββββββββ | 668/763 [09:04<00:46, 2.02it/s]
88%|βββββββββ | 669/763 [09:04<00:46, 2.02it/s]
88%|βββββββββ | 670/763 [09:05<00:45, 2.02it/s]
88%|βββββββββ | 671/763 [09:05<00:45, 2.02it/s]
88%|βββββββββ | 672/763 [09:06<00:44, 2.03it/s]
88%|βββββββββ | 673/763 [09:06<00:44, 2.02it/s]
88%|βββββββββ | 674/763 [09:07<00:43, 2.02it/s]
88%|βββββββββ | 675/763 [09:07<00:43, 2.02it/s]
{'loss': 2.7964, 'grad_norm': 0.3353927731513977, 'learning_rate': 4.0056317596204094e-05, 'epoch': 0.88}
|
|
88%|βββββββββ | 675/763 [09:07<00:43, 2.02it/s]
89%|βββββββββ | 676/763 [09:08<00:43, 2.02it/s]
89%|βββββββββ | 677/763 [09:08<00:42, 2.02it/s]
89%|βββββββββ | 678/763 [09:09<00:42, 2.02it/s]
89%|βββββββββ | 679/763 [09:09<00:41, 2.02it/s]
89%|βββββββββ | 680/763 [09:10<00:41, 2.02it/s]
89%|βββββββββ | 681/763 [09:10<00:40, 2.02it/s]
89%|βββββββββ | 682/763 [09:11<00:40, 2.02it/s]
90%|βββββββββ | 683/763 [09:11<00:39, 2.02it/s]
90%|βββββββββ | 684/763 [09:12<00:39, 2.02it/s]
90%|βββββββββ | 685/763 [09:12<00:38, 2.02it/s]
90%|βββββββββ | 686/763 [09:13<00:38, 2.02it/s]
90%|βββββββββ | 687/763 [09:13<00:37, 2.02it/s]
90%|βββββββββ | 688/763 [09:14<00:37, 2.02it/s]
90%|βββββββββ | 689/763 [09:14<00:36, 2.02it/s]
90%|βββββββββ | 690/763 [09:15<00:36, 2.02it/s]
91%|βββββββββ | 691/763 [09:15<00:35, 2.02it/s]
91%|βββββββββ | 692/763 [09:16<00:35, 2.02it/s]
91%|βββββββββ | 693/763 [09:16<00:34, 2.03it/s]
91%|βββββββββ | 694/763 [09:17<00:34, 2.02it/s]
91%|βββββββββ | 695/763 [09:17<00:33, 2.02it/s]
91%|βββββββββ | 696/763 [09:18<00:33, 2.02it/s]
91%|ββββββββββ| 697/763 [09:18<00:32, 2.02it/s]
91%|ββββββββββ| 698/763 [09:19<00:32, 2.02it/s]
92%|ββββββββββ| 699/763 [09:19<00:31, 2.03it/s]
92%|ββββββββββ| 700/763 [09:20<00:31, 2.02it/s]
{'loss': 2.7815, 'grad_norm': 0.3266715109348297, 'learning_rate': 2.0666073481669712e-05, 'epoch': 0.92} |
|
92%|ββββββββββ| 700/763 [09:20<00:31, 2.02it/s]
92%|ββββββββββ| 701/763 [09:20<00:30, 2.02it/s]
92%|ββββββββββ| 702/763 [09:21<00:30, 2.02it/s]
92%|ββββββββββ| 703/763 [09:21<00:29, 2.02it/s]
92%|ββββββββββ| 704/763 [09:22<00:29, 2.02it/s]
92%|ββββββββββ| 705/763 [09:22<00:28, 2.02it/s]
93%|ββββββββββ| 706/763 [09:23<00:28, 2.02it/s]
93%|ββββββββββ| 707/763 [09:23<00:27, 2.02it/s]
93%|ββββββββββ| 708/763 [09:24<00:27, 2.03it/s]
93%|ββββββββββ| 709/763 [09:24<00:26, 2.02it/s]
93%|ββββββββββ| 710/763 [09:25<00:26, 2.02it/s]
93%|ββββββββββ| 711/763 [09:25<00:25, 2.02it/s]
93%|ββββββββββ| 712/763 [09:26<00:25, 2.02it/s]
93%|ββββββββββ| 713/763 [09:26<00:24, 2.02it/s]
94%|ββββββββββ| 714/763 [09:27<00:24, 2.02it/s]
94%|ββββββββββ| 715/763 [09:27<00:23, 2.02it/s]
94%|ββββββββββ| 716/763 [09:28<00:23, 2.02it/s]
94%|ββββββββββ| 717/763 [09:28<00:22, 2.02it/s]
94%|ββββββββββ| 718/763 [09:29<00:22, 2.02it/s]
94%|ββββββββββ| 719/763 [09:29<00:21, 2.02it/s]
94%|ββββββββββ| 720/763 [09:30<00:21, 2.02it/s]
94%|ββββββββββ| 721/763 [09:30<00:20, 2.02it/s]
95%|ββββββββββ| 722/763 [09:31<00:20, 2.02it/s]
95%|ββββββββββ| 723/763 [09:31<00:19, 2.02it/s]
95%|ββββββββββ| 724/763 [09:32<00:19, 2.02it/s]
95%|ββββββββββ| 725/763 [09:32<00:18, 2.02it/s]
{'loss': 2.7777, 'grad_norm': 0.3224312365055084, 'learning_rate': 7.552007469355249e-06, 'epoch': 0.95} |
|
95%|ββββββββββ| 725/763 [09:32<00:18, 2.02it/s]
95%|ββββββββββ| 726/763 [09:33<00:18, 2.02it/s]
95%|ββββββββββ| 727/763 [09:33<00:17, 2.02it/s]
95%|ββββββββββ| 728/763 [09:34<00:17, 2.02it/s]
96%|ββββββββββ| 729/763 [09:34<00:16, 2.02it/s]
96%|ββββββββββ| 730/763 [09:35<00:16, 2.02it/s]
96%|ββββββββββ| 731/763 [09:35<00:15, 2.02it/s]
96%|ββββββββββ| 732/763 [09:36<00:15, 2.02it/s]
96%|ββββββββββ| 733/763 [09:36<00:14, 2.02it/s]
96%|ββββββββββ| 734/763 [09:37<00:14, 2.02it/s]
96%|ββββββββββ| 735/763 [09:37<00:13, 2.02it/s]
96%|ββββββββββ| 736/763 [09:38<00:13, 2.02it/s]
97%|ββββββββββ| 737/763 [09:38<00:12, 2.02it/s]
97%|ββββββββββ| 738/763 [09:39<00:12, 2.02it/s]
97%|ββββββββββ| 739/763 [09:39<00:11, 2.02it/s]
97%|ββββββββββ| 740/763 [09:40<00:11, 2.02it/s]
97%|ββββββββββ| 741/763 [09:40<00:10, 2.02it/s]
97%|ββββββββββ| 742/763 [09:41<00:10, 2.02it/s]
97%|ββββββββββ| 743/763 [09:41<00:09, 2.02it/s]
98%|ββββββββββ| 744/763 [09:42<00:09, 2.02it/s]
98%|ββββββββββ| 745/763 [09:42<00:08, 2.02it/s]
98%|ββββββββββ| 746/763 [09:43<00:08, 2.02it/s]
98%|ββββββββββ| 747/763 [09:43<00:07, 2.02it/s]
98%|ββββββββββ| 748/763 [09:44<00:07, 2.02it/s]
98%|ββββββββββ| 749/763 [09:44<00:06, 2.02it/s]
98%|ββββββββββ| 750/763 [09:44<00:06, 2.02it/s]
{'loss': 2.771, 'grad_norm': 0.32577940821647644, 'learning_rate': 8.858291115876327e-07, 'epoch': 0.98} |
|
98%|ββββββββββ| 750/763 [09:44<00:06, 2.02it/s]
98%|ββββββββββ| 751/763 [09:45<00:05, 2.02it/s]
99%|ββββββββββ| 752/763 [09:45<00:05, 2.02it/s]
99%|ββββββββββ| 753/763 [09:46<00:04, 2.02it/s]
99%|ββββββββββ| 754/763 [09:46<00:04, 2.02it/s]
99%|ββββββββββ| 755/763 [09:47<00:03, 2.02it/s]
99%|ββββββββββ| 756/763 [09:47<00:03, 2.02it/s]
99%|ββββββββββ| 757/763 [09:48<00:02, 2.02it/s]
99%|ββββββββββ| 758/763 [09:48<00:02, 2.02it/s]
99%|ββββββββββ| 759/763 [09:49<00:01, 2.02it/s]
100%|ββββββββββ| 760/763 [09:49<00:01, 2.02it/s]
100%|ββββββββββ| 761/763 [09:50<00:00, 2.02it/s]
100%|ββββββββββ| 762/763 [09:50<00:00, 2.02it/s]
100%|ββββββββββ| 763/763 [09:51<00:00, 2.04it/s]
{'train_runtime': 609.0458, 'train_samples_per_second': 1282.818, 'train_steps_per_second': 1.253, 'train_loss': 3.845910203566245, 'epoch': 1.0}
|
|
100%|ββββββββββ| 763/763 [10:08<00:00, 2.04it/s]
100%|ββββββββββ| 763/763 [10:08<00:00, 1.25it/s] |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
|