|
0%| | 0/20000 [00:00<?, ?it/s]/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.) |
|
warnings.warn( |
|
0%| | 1/20000 [00:03<16:45:22, 3.02s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
0%| | 25/20000 [00:49<10:30:28, 1.89s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
0%|β | 50/20000 [01:36<10:30:25, 1.90s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
0%|β | 75/20000 [02:23<10:25:09, 1.88s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
0%|β | 100/20000 [03:10<10:21:12, 1.87s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1%|β | 125/20000 [03:57<10:17:00, 1.86s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1%|β | 150/20000 [04:43<10:12:27, 1.85s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1%|β | 174/20000 [05:28<10:11:10, 1.85s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1%|β | 199/20000 [06:14<10:07:08, 1.84s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1%|β | 225/20000 [07:02<10:09:09, 1.85s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1%|β | 250/20000 [07:48<10:05:59, 1.84s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1%|β | 274/20000 [08:32<10:02:25, 1.83s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1%|ββ | 299/20000 [09:18<10:01:03, 1.83s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2%|ββ | 324/20000 [10:04<10:02:02, 1.84s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2%|ββ | 350/20000 [10:52<9:54:57, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2%|ββ | 375/20000 [11:37<10:00:16, 1.84s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2%|ββ | 400/20000 [12:23<9:56:59, 1.83s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2%|ββ | 425/20000 [13:09<9:54:49, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2%|ββ | 450/20000 [13:54<9:56:31, 1.83s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2%|ββ | 474/20000 [14:38<9:55:46, 1.83s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2%|ββ | 500/20000 [15:26<9:52:31, 1.82s/it][INFO|trainer.py:3304] 2024-04-24 16:58:56,780 >> Saving model checkpoint to ./checkpoint-500 |
|
[INFO|configuration_utils.py:471] 2024-04-24 16:58:56,784 >> Configuration saved in ./checkpoint-500/config.json |
|
[INFO|configuration_utils.py:697] 2024-04-24 16:58:56,788 >> Configuration saved in ./checkpoint-500/generation_config.json |
|
{'loss': 2.0773, 'grad_norm': 4.6875, 'learning_rate': 0.0001, 'epoch': 0.12} |
|
[INFO|modeling_utils.py:2590] 2024-04-24 16:59:01,066 >> Model weights saved in ./checkpoint-500/model.safetensors |
|
[INFO|tokenization_utils_base.py:2488] 2024-04-24 16:59:01,079 >> tokenizer config file saved in ./checkpoint-500/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2497] 2024-04-24 16:59:01,081 >> Special tokens file saved in ./checkpoint-500/special_tokens_map.json |
|
[INFO|tokenization_utils_base.py:2488] 2024-04-24 16:59:11,382 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2497] 2024-04-24 16:59:11,384 >> Special tokens file saved in ./special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.) |
|
warnings.warn( |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3%|ββ | 524/20000 [16:24<9:52:57, 1.83s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3%|ββ | 550/20000 [17:12<9:50:23, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3%|βββ | 575/20000 [17:57<9:51:03, 1.83s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3%|βββ | 600/20000 [18:43<9:49:40, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3%|βββ | 625/20000 [19:28<9:46:32, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3%|βββ | 650/20000 [20:14<9:44:36, 1.81s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3%|βββ | 675/20000 [20:59<9:45:00, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|βββ | 700/20000 [21:45<9:46:59, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|βββ | 724/20000 [22:28<9:44:44, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|βββ | 750/20000 [23:16<9:44:11, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|βββ | 775/20000 [24:01<9:42:52, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|βββ | 800/20000 [24:47<10:00:33, 1.88s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|ββββ | 824/20000 [25:30<9:42:02, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|ββββ | 850/20000 [26:18<9:41:27, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|ββββ | 875/20000 [27:03<9:40:29, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4%|ββββ | 900/20000 [27:49<9:39:54, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5%|ββββ | 924/20000 [28:32<9:40:41, 1.83s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5%|ββββ | 950/20000 [29:20<9:37:26, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5%|ββββ | 975/20000 [30:05<9:34:44, 1.81s/it] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5%|ββββ | 1000/20000 [30:51<9:35:10, 1.82s/it][INFO|trainer.py:3304] 2024-04-24 17:14:21,632 >> Saving model checkpoint to ./checkpoint-1000 |
|
[INFO|configuration_utils.py:471] 2024-04-24 17:14:21,635 >> Configuration saved in ./checkpoint-1000/config.json |
|
[INFO|configuration_utils.py:697] 2024-04-24 17:14:21,638 >> Configuration saved in ./checkpoint-1000/generation_config.json |
|
{'loss': 1.5546, 'grad_norm': 1.2265625, 'learning_rate': 9.743589743589744e-05, 'epoch': 0.25} |
|
[INFO|modeling_utils.py:2590] 2024-04-24 17:14:25,965 >> Model weights saved in ./checkpoint-1000/model.safetensors |
|
[INFO|tokenization_utils_base.py:2488] 2024-04-24 17:14:25,975 >> tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2497] 2024-04-24 17:14:25,977 >> Special tokens file saved in ./checkpoint-1000/special_tokens_map.json |
|
[INFO|tokenization_utils_base.py:2488] 2024-04-24 17:14:35,995 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2497] 2024-04-24 17:14:35,997 >> Special tokens file saved in ./special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.) |
|
warnings.warn( |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5%|ββββ | 1026/20000 [31:52<9:35:24, 1.82s/it] |
|
|
|
|
|
|
|
|
|
|