diff --git "a/wandb/run-20220503_172048-zotxt8wa/files/output.log" "b/wandb/run-20220503_172048-zotxt8wa/files/output.log" --- "a/wandb/run-20220503_172048-zotxt8wa/files/output.log" +++ "b/wandb/run-20220503_172048-zotxt8wa/files/output.log" @@ -104277,3 +104277,10398 @@ To disable this warning, you can either: huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|█████████████████▍ | 5001/19440 [15:00:21<4612:05:52, 1149.91s/it] + 26%|█████████████████▍ | 5001/19440 [15:00:21<4612:05:52, 1149.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.2353, 'learning_rate': 0.00027188682804108984, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|█████████████████▊ | 5002/19440 [15:00:26<3233:45:00, 806.31s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.0968, 'learning_rate': 0.00027186800583077204, 'epoch': 0.77} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|█████████████████▊ | 5003/19440 [15:00:30<2268:43:48, 565.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9315, 'learning_rate': 0.0002718491836204542, 'epoch': 0.77} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|█████████████████▊ | 5004/19440 [15:00:35<1593:08:35, 397.29s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.062, 'learning_rate': 0.00027183036141013633, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|█████████████████▊ | 5005/19440 [15:00:39<1120:08:00, 279.35s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9241, 'learning_rate': 0.0002718115391998185, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████ | 5006/19440 [15:00:43<788:57:41, 196.78s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9087, 'learning_rate': 0.0002717927169895007, 'epoch': 0.77} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████ | 5007/19440 [15:00:47<557:12:46, 138.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8207, 'learning_rate': 0.0002717738947791828, 'epoch': 0.77} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▎ | 5008/19440 [15:00:51<394:53:10, 98.50s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7627, 'learning_rate': 0.000271755072568865, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▎ | 5009/19440 [15:00:55<281:07:33, 70.13s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8625, 'learning_rate': 0.0002717362503585472, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▎ | 5010/19440 [15:00:59<201:38:37, 50.31s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5945, 'learning_rate': 0.00027171742814822937, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▎ | 5011/19440 [15:01:03<145:52:43, 36.40s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7398, 'learning_rate': 0.0002716986059379115, 'epoch': 0.77} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▎ | 5012/19440 [15:01:07<106:38:05, 26.61s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6961, 'learning_rate': 0.00027167978372759366, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5013/19440 [15:01:11<79:49:59, 19.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6652, 'learning_rate': 0.00027166096151727586, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5014/19440 [15:01:15<60:33:57, 15.11s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7652, 'learning_rate': 0.000271642139306958, 'epoch': 0.77} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5015/19440 [15:01:19<46:57:19, 11.72s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3861, 'learning_rate': 0.0002716233170966402, 'epoch': 0.77} + 26%|██████████████████▌ | 5016/19440 [15:01:23<37:19:53, 9.32s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5017/19440 [15:01:26<30:51:32, 7.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5883, 'learning_rate': 0.00027160449488632235, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5018/19440 [15:01:30<25:58:05, 6.48s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6158, 'learning_rate': 0.00027158567267600455, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3952, 'learning_rate': 0.0002715668504656867, 'epoch': 0.77} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5019/19440 [15:01:34<22:28:07, 5.61s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3773, 'learning_rate': 0.00027154802825536884, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5020/19440 [15:01:37<19:56:37, 4.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5021/19440 [15:01:41<18:08:13, 4.53s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4866, 'learning_rate': 0.00027152920604505104, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5022/19440 [15:01:44<17:00:07, 4.25s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4064, 'learning_rate': 0.0002715103838347332, 'epoch': 0.77} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2864, 'learning_rate': 0.0002714915616244154, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5023/19440 [15:01:48<16:10:09, 4.04s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3696, 'learning_rate': 0.00027147273941409753, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5024/19440 [15:01:51<15:32:52, 3.88s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1424, 'learning_rate': 0.0002714539172037797, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5025/19440 [15:01:55<15:35:30, 3.89s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.537, 'learning_rate': 0.0002714350949934618, 'epoch': 0.78} + 26%|██████████████████▌ | 5026/19440 [15:01:59<15:05:49, 3.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5027/19440 [15:02:02<14:36:26, 3.65s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5337, 'learning_rate': 0.000271416272783144, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2552, 'learning_rate': 0.0002713974505728262, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▌ | 5028/19440 [15:02:05<14:16:03, 3.56s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5029/19440 [15:02:09<13:53:29, 3.47s/it] + 26%|██████████████████▋ | 5029/19440 [15:02:09<13:53:29, 3.47s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2578, 'learning_rate': 0.00027135980615219056, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5030/19440 [15:02:12<13:35:11, 3.39s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2183, 'learning_rate': 0.0002713409839418727, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5031/19440 [15:02:15<13:14:22, 3.31s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.026, 'learning_rate': 0.00027132216173155485, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5032/19440 [15:02:18<12:46:16, 3.19s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5033/19440 [15:02:21<12:28:23, 3.12s/it] + 26%|██████████████████▋ | 5033/19440 [15:02:21<12:28:23, 3.12s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0845, 'learning_rate': 0.0002712845173109192, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5034/19440 [15:02:24<12:13:46, 3.06s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5035/19440 [15:02:27<11:58:46, 2.99s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0071, 'learning_rate': 0.00027126569510060134, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0906, 'learning_rate': 0.00027124687289028354, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5036/19440 [15:02:29<11:48:20, 2.95s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5037/19440 [15:02:32<11:38:26, 2.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9582, 'learning_rate': 0.00027122805067996574, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.7155, 'learning_rate': 0.0002712092284696479, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5038/19440 [15:02:36<12:01:05, 3.00s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5039/19440 [15:02:38<11:45:29, 2.94s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0514, 'learning_rate': 0.00027119040625933003, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5040/19440 [15:02:41<11:26:40, 2.86s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8325, 'learning_rate': 0.0002711715840490122, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.4197, 'learning_rate': 0.0002711527618386944, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5041/19440 [15:02:44<11:12:41, 2.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8506, 'learning_rate': 0.0002711339396283765, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5042/19440 [15:02:46<11:02:54, 2.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5043/19440 [15:02:49<10:51:30, 2.72s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.5539, 'learning_rate': 0.0002711151174180587, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.4672, 'learning_rate': 0.0002710962952077409, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5044/19440 [15:02:52<10:46:07, 2.69s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.215, 'learning_rate': 0.00027107747299742307, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5045/19440 [15:02:54<10:31:13, 2.63s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5046/19440 [15:02:57<10:19:30, 2.58s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.0267, 'learning_rate': 0.0002710586507871052, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5047/19440 [15:02:59<10:07:30, 2.53s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.9159, 'learning_rate': 0.00027103982857678736, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.0468, 'learning_rate': 0.00027102100636646956, 'epoch': 0.78} + 26%|██████████████████▉ | 5048/19440 [15:03:01<9:59:55, 2.50s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.6901, 'learning_rate': 0.0002710021841561517, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5049/19440 [15:03:04<9:50:30, 2.46s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5050/19440 [15:03:07<10:10:04, 2.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.6805, 'learning_rate': 0.0002709833619458339, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.2864, 'learning_rate': 0.00027096453973551605, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5051/19440 [15:03:11<12:49:46, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5052/19440 [15:03:16<14:09:45, 3.54s/it] + 26%|██████████████████▋ | 5052/19440 [15:03:16<14:09:45, 3.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9982, 'learning_rate': 0.0002709268953148804, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5053/19440 [15:03:20<14:50:14, 3.71s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.098, 'learning_rate': 0.00027090807310456254, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5054/19440 [15:03:24<15:07:15, 3.78s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9643, 'learning_rate': 0.00027088925089424474, 'epoch': 0.78} + 26%|██████████████████▋ | 5055/19440 [15:03:27<15:10:35, 3.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5056/19440 [15:03:31<15:11:55, 3.80s/it] + 26%|██████████████████▋ | 5056/19440 [15:03:31<15:11:55, 3.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5057/19440 [15:03:35<15:21:01, 3.84s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9384, 'learning_rate': 0.0002708516064736091, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5058/19440 [15:03:39<15:14:43, 3.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9776, 'learning_rate': 0.00027083278426329123, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5059/19440 [15:03:43<15:07:40, 3.79s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9302, 'learning_rate': 0.0002708139620529734, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7927, 'learning_rate': 0.0002707951398426555, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5060/19440 [15:03:46<14:58:16, 3.75s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5061/19440 [15:03:50<14:47:52, 3.70s/it] + 26%|██████████████████▋ | 5061/19440 [15:03:50<14:47:52, 3.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▋ | 5062/19440 [15:03:54<14:40:03, 3.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9061, 'learning_rate': 0.0002707574954220199, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5063/19440 [15:03:58<15:00:07, 3.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.782, 'learning_rate': 0.00027073867321170206, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5064/19440 [15:04:01<14:47:02, 3.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6194, 'learning_rate': 0.00027071985100138426, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5265, 'learning_rate': 0.0002707010287910664, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5065/19440 [15:04:05<14:30:18, 3.63s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6959, 'learning_rate': 0.00027068220658074856, 'epoch': 0.78} + 26%|██████████████████▊ | 5066/19440 [15:04:08<14:13:57, 3.56s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5067/19440 [15:04:11<13:58:35, 3.50s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7507, 'learning_rate': 0.0002706633843704307, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.546, 'learning_rate': 0.0002706445621601129, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5068/19440 [15:04:15<13:45:36, 3.45s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5069/19440 [15:04:18<13:31:15, 3.39s/it] + 26%|██████████████████▊ | 5069/19440 [15:04:18<13:31:15, 3.39s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5070/19440 [15:04:21<13:18:39, 3.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5103, 'learning_rate': 0.00027060691773947724, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5082, 'learning_rate': 0.00027058809552915944, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5071/19440 [15:04:24<13:08:54, 3.29s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5072/19440 [15:04:27<12:58:39, 3.25s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6161, 'learning_rate': 0.0002705692733188416, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5065, 'learning_rate': 0.00027055045110852374, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5073/19440 [15:04:31<12:49:33, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5074/19440 [15:04:34<12:53:39, 3.23s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2262, 'learning_rate': 0.0002705316288982059, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5075/19440 [15:04:37<13:14:55, 3.32s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3847, 'learning_rate': 0.0002705128066878881, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.232, 'learning_rate': 0.0002704939844775702, 'epoch': 0.78} + 26%|██████████████████▊ | 5076/19440 [15:04:40<13:02:25, 3.27s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5077/19440 [15:04:43<12:44:18, 3.19s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.238, 'learning_rate': 0.0002704751622672524, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1653, 'learning_rate': 0.00027045634005693457, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5078/19440 [15:04:47<12:31:03, 3.14s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5079/19440 [15:04:49<12:16:48, 3.08s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3818, 'learning_rate': 0.0002704375178466167, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2608, 'learning_rate': 0.0002704186956362989, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5080/19440 [15:04:52<12:08:11, 3.04s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5081/19440 [15:04:55<11:57:25, 3.00s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2612, 'learning_rate': 0.00027039987342598106, 'epoch': 0.78} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2312, 'learning_rate': 0.00027038105121566326, 'epoch': 0.78} + 26%|██████████████████▊ | 5082/19440 [15:04:58<11:49:40, 2.97s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1847, 'learning_rate': 0.0002703622290053454, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5083/19440 [15:05:01<11:44:40, 2.94s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5084/19440 [15:05:04<11:38:58, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9134, 'learning_rate': 0.0002703434067950276, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0898, 'learning_rate': 0.00027032458458470975, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5085/19440 [15:05:07<11:31:42, 2.89s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5086/19440 [15:05:10<11:24:41, 2.86s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1434, 'learning_rate': 0.0002703057623743919, 'epoch': 0.78} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.8231, 'learning_rate': 0.0002702869401640741, 'epoch': 0.78} + 26%|██████████████████▊ | 5087/19440 [15:05:12<11:15:29, 2.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5088/19440 [15:05:16<11:38:49, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9953, 'learning_rate': 0.00027026811795375624, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5089/19440 [15:05:18<11:27:16, 2.87s/it] + 26%|██████████████████▊ | 5089/19440 [15:05:18<11:27:16, 2.87s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.7648, 'learning_rate': 0.0002702304735331206, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5090/19440 [15:05:21<11:13:46, 2.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5091/19440 [15:05:24<11:03:27, 2.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.4722, 'learning_rate': 0.0002702116513228028, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5092/19440 [15:05:26<10:55:08, 2.74s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.7025, 'learning_rate': 0.00027019282911248493, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6785, 'learning_rate': 0.0002701740069021671, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5093/19440 [15:05:29<10:45:02, 2.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.6593, 'learning_rate': 0.0002701551846918492, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5094/19440 [15:05:31<10:37:09, 2.66s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5095/19440 [15:05:34<10:28:05, 2.63s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.5417, 'learning_rate': 0.0002701363624815314, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▊ | 5096/19440 [15:05:36<10:20:06, 2.59s/it] + 26%|██████████████████▊ | 5096/19440 [15:05:36<10:20:06, 2.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.3834, 'learning_rate': 0.00027009871806089577, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5097/19440 [15:05:39<10:12:23, 2.56s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.8378, 'learning_rate': 0.00027007989585057797, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5098/19440 [15:05:41<10:01:00, 2.51s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████▏ | 5099/19440 [15:05:44<9:47:33, 2.46s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.586, 'learning_rate': 0.0002700610736402601, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5100/19440 [15:05:46<10:03:00, 2.52s/it] + 26%|██████████████████▉ | 5100/19440 [15:05:46<10:03:00, 2.52s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.1218, 'learning_rate': 0.0002700234292196244, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5101/19440 [15:05:51<12:35:00, 3.16s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.1967, 'learning_rate': 0.0002700046070093066, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5102/19440 [15:05:55<13:48:48, 3.47s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.1704, 'learning_rate': 0.00026998578479898875, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5103/19440 [15:05:59<14:33:52, 3.66s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9559, 'learning_rate': 0.00026996696258867095, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5104/19440 [15:06:03<14:53:11, 3.74s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.075, 'learning_rate': 0.0002699481403783531, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5105/19440 [15:06:07<14:59:57, 3.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9212, 'learning_rate': 0.00026992931816803524, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5106/19440 [15:06:11<15:07:42, 3.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.056, 'learning_rate': 0.00026991049595771744, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5107/19440 [15:06:15<15:07:54, 3.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5108/19440 [15:06:18<14:56:30, 3.75s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7743, 'learning_rate': 0.0002698916737473996, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5109/19440 [15:06:22<14:47:43, 3.72s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8302, 'learning_rate': 0.0002698728515370818, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8825, 'learning_rate': 0.0002698540293267639, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5110/19440 [15:06:26<14:39:45, 3.68s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7007, 'learning_rate': 0.0002698352071164461, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5111/19440 [15:06:29<14:28:08, 3.64s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5112/19440 [15:06:33<14:18:00, 3.59s/it] + 26%|██████████████████▉ | 5112/19440 [15:06:33<14:18:00, 3.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5113/19440 [15:06:37<14:37:56, 3.68s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7302, 'learning_rate': 0.0002697975626958104, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5114/19440 [15:06:40<14:21:39, 3.61s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7476, 'learning_rate': 0.0002697787404854926, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6933, 'learning_rate': 0.00026975991827517476, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5115/19440 [15:06:43<14:03:48, 3.53s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5116/19440 [15:06:47<13:46:48, 3.46s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8151, 'learning_rate': 0.00026974109606485696, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7924, 'learning_rate': 0.0002697222738545391, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5117/19440 [15:06:50<13:31:27, 3.40s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5118/19440 [15:06:53<13:16:16, 3.34s/it] + 26%|██████████████████▉ | 5118/19440 [15:06:53<13:16:16, 3.34s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5119/19440 [15:06:56<13:06:16, 3.29s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.4826, 'learning_rate': 0.00026968462943390345, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6077, 'learning_rate': 0.0002696658072235856, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5120/19440 [15:06:59<12:57:26, 3.26s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5121/19440 [15:07:03<12:47:12, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5277, 'learning_rate': 0.00026964698501326774, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5403, 'learning_rate': 0.00026962816280294994, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5122/19440 [15:07:06<12:40:01, 3.18s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5123/19440 [15:07:09<12:31:34, 3.15s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5362, 'learning_rate': 0.00026960934059263214, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3673, 'learning_rate': 0.0002695905183823143, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5124/19440 [15:07:12<12:30:08, 3.14s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5125/19440 [15:07:15<12:54:31, 3.25s/it] + 26%|██████████████████▉ | 5125/19440 [15:07:15<12:54:31, 3.25s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5126/19440 [15:07:18<12:43:05, 3.20s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0833, 'learning_rate': 0.00026955287396167863, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5061, 'learning_rate': 0.0002695340517513608, 'epoch': 0.79} + 26%|██████████████████▉ | 5127/19440 [15:07:21<12:30:12, 3.14s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5128/19440 [15:07:24<12:16:50, 3.09s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5111, 'learning_rate': 0.0002695152295410429, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|██████████████████▉ | 5129/19440 [15:07:27<12:08:18, 3.05s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2214, 'learning_rate': 0.0002694964073307251, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3069, 'learning_rate': 0.0002694775851204073, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5130/19440 [15:07:30<11:57:12, 3.01s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5131/19440 [15:07:33<11:59:25, 3.02s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3688, 'learning_rate': 0.00026945876291008947, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1463, 'learning_rate': 0.0002694399406997716, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5132/19440 [15:07:36<11:47:03, 2.97s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5133/19440 [15:07:39<11:39:12, 2.93s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0467, 'learning_rate': 0.00026942111848945376, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.1139, 'learning_rate': 0.00026940229627913596, 'epoch': 0.79} + 26%|███████████████████ | 5134/19440 [15:07:42<11:29:32, 2.89s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5135/19440 [15:07:45<11:32:22, 2.90s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0835, 'learning_rate': 0.0002693834740688181, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5136/19440 [15:07:48<11:36:48, 2.92s/it] + 26%|███████████████████ | 5136/19440 [15:07:48<11:36:48, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5137/19440 [15:07:51<11:34:50, 2.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.744, 'learning_rate': 0.00026934582964818245, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.7023, 'learning_rate': 0.00026932700743786465, 'epoch': 0.79} + 26%|███████████████████ | 5138/19440 [15:07:54<12:04:14, 3.04s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5139/19440 [15:07:57<11:51:35, 2.99s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.9167, 'learning_rate': 0.0002693081852275468, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5140/19440 [15:08:00<11:36:25, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8569, 'learning_rate': 0.00026928936301722894, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.5421, 'learning_rate': 0.00026927054080691114, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5141/19440 [15:08:02<11:29:55, 2.90s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5142/19440 [15:08:05<11:13:35, 2.83s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.8725, 'learning_rate': 0.0002692517185965933, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5143/19440 [15:08:08<10:59:19, 2.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.4727, 'learning_rate': 0.0002692328963862755, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.5511, 'learning_rate': 0.00026921407417595763, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5144/19440 [15:08:10<10:45:49, 2.71s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5145/19440 [15:08:13<10:31:03, 2.65s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.1365, 'learning_rate': 0.0002691952519656398, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5146/19440 [15:08:15<10:13:40, 2.58s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.9456, 'learning_rate': 0.00026917642975532197, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5147/19440 [15:08:18<10:03:55, 2.54s/it] + 26%|███████████████████ | 5147/19440 [15:08:18<10:03:55, 2.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.9672, 'learning_rate': 0.0002691387853346863, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████▎ | 5148/19440 [15:08:20<9:52:02, 2.49s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.7631, 'learning_rate': 0.00026911996312436846, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████▎ | 5149/19440 [15:08:22<9:41:06, 2.44s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.4286, 'learning_rate': 0.00026910114091405066, 'epoch': 0.79} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5150/19440 [15:08:27<11:54:28, 3.00s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 26%|███████████████████ | 5151/19440 [15:08:31<13:52:27, 3.50s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.358, 'learning_rate': 0.0002690823187037328, 'epoch': 0.79} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5152/19440 [15:08:36<14:48:38, 3.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.3346, 'learning_rate': 0.000269063496493415, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5153/19440 [15:08:40<15:21:35, 3.87s/it] + 27%|██████████████���████ | 5153/19440 [15:08:40<15:21:35, 3.87s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5154/19440 [15:08:44<15:30:21, 3.91s/it] + 27%|███████████████████ | 5154/19440 [15:08:44<15:30:21, 3.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5155/19440 [15:08:48<15:35:03, 3.93s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.1015, 'learning_rate': 0.00026900702986246144, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5156/19440 [15:08:52<15:26:26, 3.89s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.0299, 'learning_rate': 0.00026898820765214364, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5157/19440 [15:08:55<15:30:28, 3.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.0661, 'learning_rate': 0.00026896938544182584, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5158/19440 [15:08:59<15:30:02, 3.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7095, 'learning_rate': 0.000268950563231508, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.0557, 'learning_rate': 0.0002689317410211902, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5159/19440 [15:09:03<15:18:52, 3.86s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8326, 'learning_rate': 0.00026891291881087233, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5160/19440 [15:09:07<15:03:48, 3.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5161/19440 [15:09:10<14:49:10, 3.74s/it] + 27%|███████████████████ | 5161/19440 [15:09:10<14:49:10, 3.74s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5162/19440 [15:09:14<14:34:28, 3.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6858, 'learning_rate': 0.0002688752743902366, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████ | 5163/19440 [15:09:18<14:51:19, 3.75s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7326, 'learning_rate': 0.0002688564521799188, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6955, 'learning_rate': 0.00026883762996960097, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5164/19440 [15:09:21<14:39:51, 3.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.4379, 'learning_rate': 0.00026881880775928317, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5165/19440 [15:09:25<14:28:56, 3.65s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5166/19440 [15:09:28<14:14:59, 3.59s/it] + 27%|███████████████████▏ | 5166/19440 [15:09:28<14:14:59, 3.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5167/19440 [15:09:32<14:00:55, 3.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.659, 'learning_rate': 0.00026878116333864746, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6712, 'learning_rate': 0.00026876234112832966, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5168/19440 [15:09:35<13:42:04, 3.46s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5169/19440 [15:09:38<13:28:37, 3.40s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3781, 'learning_rate': 0.0002687435189180118, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5170/19440 [15:09:42<13:27:41, 3.40s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.453, 'learning_rate': 0.000268724696707694, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3003, 'learning_rate': 0.00026870587449737615, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5171/19440 [15:09:45<13:16:34, 3.35s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5172/19440 [15:09:48<13:09:58, 3.32s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3899, 'learning_rate': 0.00026868705228705835, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|████████���██████████▏ | 5173/19440 [15:09:51<12:58:43, 3.27s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5276, 'learning_rate': 0.0002686682300767405, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5174/19440 [15:09:55<12:51:29, 3.24s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4432, 'learning_rate': 0.00026864940786642264, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5175/19440 [15:09:58<13:12:02, 3.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1952, 'learning_rate': 0.00026863058565610484, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3035, 'learning_rate': 0.000268611763445787, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5176/19440 [15:10:01<13:02:02, 3.29s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5177/19440 [15:10:04<12:43:16, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1305, 'learning_rate': 0.0002685929412354692, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2528, 'learning_rate': 0.00026857411902515133, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5178/19440 [15:10:07<12:27:09, 3.14s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5179/19440 [15:10:10<12:18:27, 3.11s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1531, 'learning_rate': 0.00026855529681483353, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.236, 'learning_rate': 0.0002685364746045157, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5180/19440 [15:10:13<12:08:20, 3.06s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5181/19440 [15:10:16<12:02:05, 3.04s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.1535, 'learning_rate': 0.0002685176523941978, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2812, 'learning_rate': 0.00026849883018388, 'epoch': 0.8} + 27%|███████████████████▏ | 5182/19440 [15:10:19<11:51:40, 2.99s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5183/19440 [15:10:22<11:42:23, 2.96s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.1967, 'learning_rate': 0.00026848000797356216, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5184/19440 [15:10:25<11:34:53, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9552, 'learning_rate': 0.00026846118576324436, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5185/19440 [15:10:28<11:30:15, 2.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.9131, 'learning_rate': 0.0002684423635529265, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5186/19440 [15:10:31<11:29:02, 2.90s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8808, 'learning_rate': 0.0002684235413426087, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5187/19440 [15:10:34<11:26:27, 2.89s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0169, 'learning_rate': 0.00026840471913229085, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5188/19440 [15:10:37<11:48:39, 2.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6969, 'learning_rate': 0.000268385896921973, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5189/19440 [15:10:40<11:37:10, 2.94s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8919, 'learning_rate': 0.00026836707471165514, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5190/19440 [15:10:42<11:23:22, 2.88s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.552, 'learning_rate': 0.00026834825250133734, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5191/19440 [15:10:45<11:27:11, 2.89s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.7015, 'learning_rate': 0.00026832943029101954, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5192/19440 [15:10:48<11:12:22, 2.83s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8862, 'learning_rate': 0.0002683106080807017, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5193/19440 [15:10:51<11:01:07, 2.78s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6516, 'learning_rate': 0.00026829178587038383, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5194/19440 [15:10:53<10:47:22, 2.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.4186, 'learning_rate': 0.000268272963660066, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5195/19440 [15:10:56<10:33:22, 2.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.4383, 'learning_rate': 0.0002682541414497482, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5196/19440 [15:10:58<10:24:07, 2.63s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.4523, 'learning_rate': 0.0002682353192394303, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▏ | 5197/19440 [15:11:01<10:15:22, 2.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.1226, 'learning_rate': 0.0002682164970291125, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5198/19440 [15:11:03<10:07:18, 2.56s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.0555, 'learning_rate': 0.00026819767481879467, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.8679, 'learning_rate': 0.00026817885260847687, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5199/19440 [15:11:06<9:54:41, 2.51s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5200/19440 [15:11:08<10:10:08, 2.57s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.3858, 'learning_rate': 0.000268160030398159, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5201/19440 [15:11:13<12:40:31, 3.20s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.3209, 'learning_rate': 0.00026814120818784116, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.2512, 'learning_rate': 0.00026812238597752336, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5202/19440 [15:11:17<13:55:34, 3.52s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5203/19440 [15:11:21<14:37:54, 3.70s/it] + 27%|███████████████████▎ | 5203/19440 [15:11:21<14:37:54, 3.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5204/19440 [15:11:25<14:56:45, 3.78s/it] + 27%|███████████████████▎ | 5204/19440 [15:11:25<14:56:45, 3.78s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.1096, 'learning_rate': 0.00026806591934656985, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5205/19440 [15:11:29<15:02:55, 3.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9579, 'learning_rate': 0.00026804709713625205, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5206/19440 [15:11:33<15:09:07, 3.83s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9439, 'learning_rate': 0.0002680282749259342, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5207/19440 [15:11:37<15:15:04, 3.86s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.882, 'learning_rate': 0.00026800945271561634, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5208/19440 [15:11:41<15:03:46, 3.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8777, 'learning_rate': 0.00026799063050529854, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5209/19440 [15:11:44<14:55:18, 3.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6009, 'learning_rate': 0.0002679718082949807, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5210/19440 [15:11:48<14:45:19, 3.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6419, 'learning_rate': 0.0002679529860846629, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5211/19440 [15:11:52<14:35:14, 3.69s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6841, 'learning_rate': 0.00026793416387434503, 'epoch': 0.8} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5212/19440 [15:11:55<14:25:03, 3.65s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7656, 'learning_rate': 0.00026791534166402723, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5213/19440 [15:11:59<14:43:24, 3.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7411, 'learning_rate': 0.0002678965194537094, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5214/19440 [15:12:03<14:29:35, 3.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5296, 'learning_rate': 0.0002678776972433915, 'epoch': 0.8} + 27%|███████████████████▎ | 5215/19440 [15:12:06<14:12:32, 3.60s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7207, 'learning_rate': 0.00026785887503307366, 'epoch': 0.8} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5216/19440 [15:12:10<14:02:45, 3.55s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6075, 'learning_rate': 0.00026784005282275586, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5217/19440 [15:12:13<13:52:57, 3.51s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.483, 'learning_rate': 0.00026782123061243806, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5218/19440 [15:12:16<13:40:11, 3.46s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4297, 'learning_rate': 0.0002678024084021202, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5219/19440 [15:12:20<13:31:28, 3.42s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5220/19440 [15:12:23<13:20:17, 3.38s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5642, 'learning_rate': 0.00026778358619180235, 'epoch': 0.81} +{'loss': 6.4773, 'learning_rate': 0.0002677647639814845, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5221/19440 [15:12:26<13:08:54, 3.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3278, 'learning_rate': 0.0002677459417711667, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5222/19440 [15:12:29<12:58:59, 3.29s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5225, 'learning_rate': 0.00026772711956084884, 'epoch': 0.81} + 27%|███████████████████▎ | 5223/19440 [15:12:32<12:47:25, 3.24s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2409, 'learning_rate': 0.00026770829735053104, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5224/19440 [15:12:36<12:43:15, 3.22s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3446, 'learning_rate': 0.00026768947514021324, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5225/19440 [15:12:39<13:04:20, 3.31s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.577, 'learning_rate': 0.0002676706529298954, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5226/19440 [15:12:42<12:52:57, 3.26s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2596, 'learning_rate': 0.00026765183071957753, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5227/19440 [15:12:45<12:39:39, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.1596, 'learning_rate': 0.0002676330085092597, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5228/19440 [15:12:48<12:28:56, 3.16s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0848, 'learning_rate': 0.0002676141862989419, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5229/19440 [15:12:51<12:19:49, 3.12s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.1654, 'learning_rate': 0.000267595364088624, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5230/19440 [15:12:54<12:09:59, 3.08s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3121, 'learning_rate': 0.0002675765418783062, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▎ | 5231/19440 [15:12:57<12:01:58, 3.05s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5232/19440 [15:13:00<11:54:38, 3.02s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1355, 'learning_rate': 0.00026755771966798837, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2811, 'learning_rate': 0.00026753889745767057, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5233/19440 [15:13:03<12:01:58, 3.05s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5234/19440 [15:13:06<11:55:14, 3.02s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2358, 'learning_rate': 0.0002675200752473527, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8805, 'learning_rate': 0.00026750125303703486, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5235/19440 [15:13:09<11:46:07, 2.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0596, 'learning_rate': 0.00026748243082671706, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5236/19440 [15:13:12<11:37:01, 2.94s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.7796, 'learning_rate': 0.0002674636086163992, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5237/19440 [15:13:15<11:31:58, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6424, 'learning_rate': 0.0002674447864060814, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5238/19440 [15:13:18<11:52:26, 3.01s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6315, 'learning_rate': 0.00026742596419576355, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5239/19440 [15:13:21<11:43:45, 2.97s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.8122, 'learning_rate': 0.00026740714198544575, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5240/19440 [15:13:24<11:29:59, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5241/19440 [15:13:27<11:16:54, 2.86s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.7493, 'learning_rate': 0.0002673883197751279, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.7384, 'learning_rate': 0.00026736949756481004, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5242/19440 [15:13:29<11:04:31, 2.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.4551, 'learning_rate': 0.00026735067535449224, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5243/19440 [15:13:32<10:53:45, 2.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5244/19440 [15:13:35<10:45:10, 2.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.5409, 'learning_rate': 0.0002673318531441744, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.3528, 'learning_rate': 0.0002673130309338566, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5245/19440 [15:13:37<10:34:53, 2.68s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.3772, 'learning_rate': 0.00026729420872353873, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5246/19440 [15:13:40<10:36:19, 2.69s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.2534, 'learning_rate': 0.0002672753865132209, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5247/19440 [15:13:42<10:25:54, 2.65s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5248/19440 [15:13:45<10:13:43, 2.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.0206, 'learning_rate': 0.000267256564302903, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.7432, 'learning_rate': 0.0002672377420925852, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5249/19440 [15:13:47<9:59:14, 2.53s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.6664, 'learning_rate': 0.00026721891988226737, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5250/19440 [15:13:50<10:14:39, 2.60s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.1088, 'learning_rate': 0.00026720009767194957, 'epoch': 0.81} + 27%|███████████████████▍ | 5251/19440 [15:13:55<12:47:11, 3.24s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.1781, 'learning_rate': 0.00026718127546163176, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5252/19440 [15:13:59<13:56:15, 3.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.0285, 'learning_rate': 0.0002671624532513139, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5253/19440 [15:14:03<14:31:27, 3.69s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.1721, 'learning_rate': 0.00026714363104099606, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5254/19440 [15:14:07<14:52:36, 3.78s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8896, 'learning_rate': 0.0002671248088306782, 'epoch': 0.81} + 27%|███████████████████▍ | 5255/19440 [15:14:11<15:00:05, 3.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5256/19440 [15:14:15<15:04:02, 3.82s/it] + 27%|███████��███████████▍ | 5256/19440 [15:14:15<15:04:02, 3.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8152, 'learning_rate': 0.00026708716441004255, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5257/19440 [15:14:19<15:10:03, 3.85s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8803, 'learning_rate': 0.00026706834219972475, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5258/19440 [15:14:22<15:03:13, 3.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9613, 'learning_rate': 0.0002670495199894069, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5259/19440 [15:14:26<14:50:52, 3.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8748, 'learning_rate': 0.0002670306977790891, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5260/19440 [15:14:30<14:41:16, 3.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7768, 'learning_rate': 0.00026701187556877124, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5261/19440 [15:14:33<14:28:23, 3.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8197, 'learning_rate': 0.0002669930533584534, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5262/19440 [15:14:37<14:15:00, 3.62s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5263/19440 [15:14:41<14:33:35, 3.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6697, 'learning_rate': 0.0002669742311481356, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8138, 'learning_rate': 0.0002669554089378177, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▍ | 5264/19440 [15:14:44<14:18:57, 3.64s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7171, 'learning_rate': 0.0002669365867274999, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5265/19440 [15:14:48<14:03:10, 3.57s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6396, 'learning_rate': 0.00026691776451718207, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5266/19440 [15:14:51<13:47:49, 3.50s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5342, 'learning_rate': 0.00026689894230686427, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5267/19440 [15:14:54<13:37:50, 3.46s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5268/19440 [15:14:58<13:23:57, 3.40s/it] + 27%|███████████████████▌ | 5268/19440 [15:14:58<13:23:57, 3.40s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5004, 'learning_rate': 0.00026686129788622856, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5269/19440 [15:15:01<13:13:40, 3.36s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5368, 'learning_rate': 0.00026684247567591076, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5270/19440 [15:15:04<13:05:43, 3.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6659, 'learning_rate': 0.0002668236534655929, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5271/19440 [15:15:07<12:54:19, 3.28s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2801, 'learning_rate': 0.0002668048312552751, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5272/19440 [15:15:10<12:48:31, 3.25s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5273/19440 [15:15:14<12:37:16, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6029, 'learning_rate': 0.00026678600904495725, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.4412, 'learning_rate': 0.0002667671868346394, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5274/19440 [15:15:17<12:28:42, 3.17s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4423, 'learning_rate': 0.00026674836462432154, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5275/19440 [15:15:20<12:51:55, 3.27s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0734, 'learning_rate': 0.00026672954241400374, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5276/19440 [15:15:23<12:40:33, 3.22s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2705, 'learning_rate': 0.00026671072020368594, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5277/19440 [15:15:26<12:31:42, 3.18s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3784, 'learning_rate': 0.0002666918979933681, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5278/19440 [15:15:29<12:18:10, 3.13s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.436, 'learning_rate': 0.0002666730757830503, 'epoch': 0.81} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5279/19440 [15:15:32<12:08:22, 3.09s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1877, 'learning_rate': 0.00026665425357273243, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5280/19440 [15:15:35<12:03:20, 3.06s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0595, 'learning_rate': 0.0002666354313624146, 'epoch': 0.81} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5281/19440 [15:15:38<11:52:32, 3.02s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2734, 'learning_rate': 0.0002666166091520967, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5282/19440 [15:15:41<11:43:52, 2.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5283/19440 [15:15:44<11:34:24, 2.94s/it] + 27%|███████████████████▌ | 5283/19440 [15:15:44<11:34:24, 2.94s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.9483, 'learning_rate': 0.00026657896473146107, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5284/19440 [15:15:47<11:30:12, 2.93s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0144, 'learning_rate': 0.00026656014252114327, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5285/19440 [15:15:50<11:25:25, 2.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.111, 'learning_rate': 0.00026654132031082547, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5286/19440 [15:15:53<11:17:44, 2.87s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8899, 'learning_rate': 0.0002665224981005076, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5287/19440 [15:15:55<11:16:01, 2.87s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.9695, 'learning_rate': 0.00026650367589018976, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5288/19440 [15:15:59<11:41:09, 2.97s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9581, 'learning_rate': 0.0002664848536798719, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5289/19440 [15:16:01<11:30:18, 2.93s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5290/19440 [15:16:04<11:18:10, 2.88s/it] + 27%|███████████████████▌ | 5290/19440 [15:16:04<11:18:10, 2.88s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8847, 'learning_rate': 0.00026644720925923625, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5291/19440 [15:16:07<11:06:25, 2.83s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.6392, 'learning_rate': 0.00026642838704891845, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5292/19440 [15:16:10<10:56:59, 2.79s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5293/19440 [15:16:12<10:46:01, 2.74s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.5232, 'learning_rate': 0.0002664095648386006, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.5409, 'learning_rate': 0.0002663907426282828, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5294/19440 [15:16:15<10:33:38, 2.69s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.2597, 'learning_rate': 0.00026637192041796494, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5295/19440 [15:16:17<10:27:23, 2.66s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.1682, 'learning_rate': 0.0002663530982076471, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|██████���████████████▌ | 5296/19440 [15:16:20<10:17:04, 2.62s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▌ | 5297/19440 [15:16:22<10:07:23, 2.58s/it] + 27%|███████████████████▌ | 5297/19440 [15:16:22<10:07:23, 2.58s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.7347, 'learning_rate': 0.0002663154537870114, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▉ | 5298/19440 [15:16:25<9:57:33, 2.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.635, 'learning_rate': 0.0002662966315766936, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▉ | 5299/19440 [15:16:27<9:48:37, 2.50s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.5734, 'learning_rate': 0.00026627780936637577, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5300/19440 [15:16:30<10:01:02, 2.55s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.3118, 'learning_rate': 0.0002662589871560579, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5301/19440 [15:16:35<12:50:01, 3.27s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.0103, 'learning_rate': 0.00026624016494574006, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5302/19440 [15:16:39<14:04:11, 3.58s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.995, 'learning_rate': 0.00026622134273542226, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5303/19440 [15:16:43<14:38:02, 3.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.118, 'learning_rate': 0.00026620252052510446, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5304/19440 [15:16:47<14:52:51, 3.79s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.1194, 'learning_rate': 0.0002661836983147866, 'epoch': 0.82} + 27%|███████████████████▋ | 5305/19440 [15:16:51<14:56:57, 3.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5306/19440 [15:16:55<14:56:29, 3.81s/it] + 27%|███████████████████▋ | 5306/19440 [15:16:55<14:56:29, 3.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5307/19440 [15:16:59<15:01:10, 3.83s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7802, 'learning_rate': 0.00026614605389415095, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5308/19440 [15:17:02<14:53:51, 3.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9615, 'learning_rate': 0.0002661272316838331, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8188, 'learning_rate': 0.00026610840947351524, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5309/19440 [15:17:06<14:46:10, 3.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8733, 'learning_rate': 0.00026608958726319744, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5310/19440 [15:17:10<14:38:23, 3.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8261, 'learning_rate': 0.00026607076505287964, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5311/19440 [15:17:13<14:31:40, 3.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7105, 'learning_rate': 0.0002660519428425618, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5312/19440 [15:17:17<14:23:05, 3.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7052, 'learning_rate': 0.000266033120632244, 'epoch': 0.82} + 27%|███████████████████▋ | 5313/19440 [15:17:21<14:40:50, 3.74s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5314/19440 [15:17:24<14:25:02, 3.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6806, 'learning_rate': 0.00026601429842192613, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6674, 'learning_rate': 0.0002659954762116083, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5315/19440 [15:17:28<14:05:56, 3.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5874, 'learning_rate': 0.0002659766540012904, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5316/19440 [15:17:31<13:52:58, 3.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5317/19440 [15:17:35<13:37:06, 3.47s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.69, 'learning_rate': 0.0002659578317909726, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5523, 'learning_rate': 0.00026593900958065477, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5318/19440 [15:17:38<13:24:22, 3.42s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.4107, 'learning_rate': 0.00026592018737033697, 'epoch': 0.82} + 27%|███████████████████▋ | 5319/19440 [15:17:41<13:13:05, 3.37s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3817, 'learning_rate': 0.00026590136516001917, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5320/19440 [15:17:44<13:05:21, 3.34s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3907, 'learning_rate': 0.0002658825429497013, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5321/19440 [15:17:48<12:56:26, 3.30s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5322/19440 [15:17:51<12:47:28, 3.26s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.4387, 'learning_rate': 0.00026586372073938346, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3166, 'learning_rate': 0.0002658448985290656, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5323/19440 [15:17:54<12:44:13, 3.25s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5324/19440 [15:17:57<12:36:53, 3.22s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4322, 'learning_rate': 0.0002658260763187478, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2278, 'learning_rate': 0.00026580725410842995, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5325/19440 [15:18:01<13:03:35, 3.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.391, 'learning_rate': 0.00026578843189811215, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5326/19440 [15:18:04<12:52:02, 3.28s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5327/19440 [15:18:07<12:35:36, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3891, 'learning_rate': 0.0002657696096877943, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2199, 'learning_rate': 0.00026575078747747644, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5328/19440 [15:18:10<12:26:24, 3.17s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5329/19440 [15:18:13<12:17:22, 3.14s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1468, 'learning_rate': 0.00026573196526715864, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0826, 'learning_rate': 0.0002657131430568408, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5330/19440 [15:18:16<12:10:32, 3.11s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5331/19440 [15:18:19<11:59:35, 3.06s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.1323, 'learning_rate': 0.000265694320846523, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0305, 'learning_rate': 0.00026567549863620513, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▋ | 5332/19440 [15:18:22<11:54:05, 3.04s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5333/19440 [15:18:25<11:42:33, 2.99s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3133, 'learning_rate': 0.00026565667642588733, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2524, 'learning_rate': 0.00026563785421556947, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5334/19440 [15:18:28<11:40:58, 2.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5335/19440 [15:18:31<11:46:25, 3.00s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0932, 'learning_rate': 0.0002656190320052516, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0982, 'learning_rate': 0.00026560020979493376, 'epoch': 0.82} + 27%|███████████████████▊ | 5336/19440 [15:18:34<11:32:37, 2.95s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0795, 'learning_rate': 0.00026558138758461596, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5337/19440 [15:18:36<11:21:43, 2.90s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0823, 'learning_rate': 0.00026556256537429816, 'epoch': 0.82} + 27%|███████████████████▊ | 5338/19440 [15:18:40<11:39:41, 2.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.832, 'learning_rate': 0.0002655437431639803, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5339/19440 [15:18:42<11:27:50, 2.93s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5340/19440 [15:18:45<11:16:58, 2.88s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8042, 'learning_rate': 0.0002655249209536625, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6653, 'learning_rate': 0.00026550609874334465, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5341/19440 [15:18:48<11:03:00, 2.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6142, 'learning_rate': 0.0002654872765330268, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5342/19440 [15:18:51<10:51:34, 2.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5343/19440 [15:18:53<10:41:38, 2.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.5594, 'learning_rate': 0.00026546845432270894, 'epoch': 0.82} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5344/19440 [15:18:56<10:32:18, 2.69s/it] + 27%|███████████████████▊ | 5344/19440 [15:18:56<10:32:18, 2.69s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.3073, 'learning_rate': 0.0002654308099020733, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 27%|███████████████████▊ | 5345/19440 [15:18:58<10:23:38, 2.65s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5346/19440 [15:19:01<10:13:32, 2.61s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.9735, 'learning_rate': 0.0002654119876917555, 'epoch': 0.82} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5347/19440 [15:19:03<10:02:23, 2.56s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.0256, 'learning_rate': 0.0002653931654814377, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5348/19440 [15:19:06<9:53:55, 2.53s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.9181, 'learning_rate': 0.00026537434327111983, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.9471, 'learning_rate': 0.000265355521060802, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5349/19440 [15:19:08<9:44:31, 2.49s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5350/19440 [15:19:11<10:00:52, 2.56s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.6875, 'learning_rate': 0.0002653366988504841, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5351/19440 [15:19:16<12:30:20, 3.20s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.3718, 'learning_rate': 0.0002653178766401663, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.321, 'learning_rate': 0.00026529905442984847, 'epoch': 0.83} + 28%|███████████████████▊ | 5352/19440 [15:19:20<13:45:08, 3.51s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5353/19440 [15:19:24<14:24:44, 3.68s/it] + 28%|███████████████████▊ | 5353/19440 [15:19:24<14:24:44, 3.68s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5354/19440 [15:19:28<14:43:20, 3.76s/it] + 28%|███████████████████▊ | 5354/19440 [15:19:28<14:43:20, 3.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5355/19440 [15:19:32<14:48:06, 3.78s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.1694, 'learning_rate': 0.00026524258779889496, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████��██▊ | 5356/19440 [15:19:36<15:01:39, 3.84s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8816, 'learning_rate': 0.00026522376558857716, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5357/19440 [15:19:40<15:02:48, 3.85s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9627, 'learning_rate': 0.0002652049433782593, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9357, 'learning_rate': 0.0002651861211679415, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5358/19440 [15:19:43<14:49:20, 3.79s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9143, 'learning_rate': 0.00026516729895762365, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5359/19440 [15:19:47<14:41:03, 3.75s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8367, 'learning_rate': 0.00026514847674730585, 'epoch': 0.83} + 28%|███████████████████▊ | 5360/19440 [15:19:50<14:28:42, 3.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5361/19440 [15:19:54<14:20:24, 3.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5368, 'learning_rate': 0.000265129654536988, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5362/19440 [15:19:58<14:10:09, 3.62s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8248, 'learning_rate': 0.00026511083232667014, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6102, 'learning_rate': 0.00026509201011635234, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5363/19440 [15:20:01<14:27:24, 3.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6859, 'learning_rate': 0.0002650731879060345, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5364/19440 [15:20:05<14:11:07, 3.63s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5365/19440 [15:20:08<13:55:16, 3.56s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5644, 'learning_rate': 0.0002650543656957167, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▊ | 5366/19440 [15:20:12<13:41:04, 3.50s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7175, 'learning_rate': 0.00026503554348539883, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6483, 'learning_rate': 0.00026501672127508103, 'epoch': 0.83} + 28%|███████████████████▉ | 5367/19440 [15:20:15<13:26:04, 3.44s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5368/19440 [15:20:18<13:10:59, 3.37s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5963, 'learning_rate': 0.0002649978990647632, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5012, 'learning_rate': 0.0002649790768544453, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5369/19440 [15:20:21<12:56:58, 3.31s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5370/19440 [15:20:25<12:51:24, 3.29s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3413, 'learning_rate': 0.00026496025464412746, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.4024, 'learning_rate': 0.00026494143243380966, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5371/19440 [15:20:28<12:41:51, 3.25s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5372/19440 [15:20:31<12:36:22, 3.23s/it] + 28%|███████████████████▉ | 5372/19440 [15:20:31<12:36:22, 3.23s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5373/19440 [15:20:34<12:31:58, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3628, 'learning_rate': 0.000264903788013174, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5333, 'learning_rate': 0.0002648849658028562, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5374/19440 [15:20:37<12:21:51, 3.16s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5375/19440 [15:20:41<12:43:42, 3.26s/it] + 28%|███████████████████▉ | 5375/19440 [15:20:41<12:43:42, 3.26s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|█��█████████████████▉ | 5376/19440 [15:20:44<12:33:01, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2416, 'learning_rate': 0.0002648473213822205, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5377/19440 [15:20:47<12:17:53, 3.15s/it] + 28%|███████████████████▉ | 5377/19440 [15:20:47<12:17:53, 3.15s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5378/19440 [15:20:50<12:06:03, 3.10s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3078, 'learning_rate': 0.00026480967696158484, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5379/19440 [15:20:53<11:55:37, 3.05s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1589, 'learning_rate': 0.000264790854751267, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5380/19440 [15:20:56<11:51:25, 3.04s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3578, 'learning_rate': 0.0002647720325409492, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5381/19440 [15:20:59<11:42:26, 3.00s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1386, 'learning_rate': 0.00026475321033063133, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0363, 'learning_rate': 0.0002647343881203135, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5382/19440 [15:21:01<11:34:57, 2.97s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5383/19440 [15:21:04<11:28:19, 2.94s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.234, 'learning_rate': 0.0002647155659099957, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9729, 'learning_rate': 0.0002646967436996778, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5384/19440 [15:21:07<11:22:54, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5385/19440 [15:21:10<11:15:20, 2.88s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1542, 'learning_rate': 0.00026467792148936, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5386/19440 [15:21:13<11:09:15, 2.86s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0412, 'learning_rate': 0.00026465909927904217, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2245, 'learning_rate': 0.00026464027706872437, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5387/19440 [15:21:16<11:02:23, 2.83s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5388/19440 [15:21:19<11:26:00, 2.93s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9029, 'learning_rate': 0.0002646214548584065, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9255, 'learning_rate': 0.00026460263264808866, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5389/19440 [15:21:21<11:14:15, 2.88s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5390/19440 [15:21:24<11:00:21, 2.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6013, 'learning_rate': 0.00026458381043777086, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5391/19440 [15:21:27<10:48:38, 2.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.7456, 'learning_rate': 0.000264564988227453, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.6665, 'learning_rate': 0.0002645461660171352, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5392/19440 [15:21:29<10:40:00, 2.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5393/19440 [15:21:32<10:31:12, 2.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.8873, 'learning_rate': 0.00026452734380681735, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5394/19440 [15:21:35<10:25:49, 2.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6572, 'learning_rate': 0.00026450852159649955, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.2032, 'learning_rate': 0.0002644896993861817, 'epoch': 0.83} + 28%|███████████████████▉ | 5395/19440 [15:21:37<10:15:58, 2.63s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.1896, 'learning_rate': 0.00026447087717586384, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████████████▉ | 5396/19440 [15:21:40<10:05:10, 2.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5397/19440 [15:21:42<9:55:27, 2.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.1594, 'learning_rate': 0.000264452054965546, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5398/19440 [15:21:45<9:46:32, 2.51s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.0671, 'learning_rate': 0.0002644332327552282, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5399/19440 [15:21:47<9:39:14, 2.48s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.9863, 'learning_rate': 0.0002644144105449104, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.6278, 'learning_rate': 0.00026439558833459253, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5400/19440 [15:21:50<9:55:24, 2.54s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5401/19440 [15:21:54<12:29:50, 3.20s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.3243, 'learning_rate': 0.00026437676612427473, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5402/19440 [15:21:59<13:43:27, 3.52s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.3223, 'learning_rate': 0.0002643579439139569, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5403/19440 [15:22:03<14:22:18, 3.69s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.0393, 'learning_rate': 0.000264339121703639, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5404/19440 [15:22:07<14:39:07, 3.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.0474, 'learning_rate': 0.00026432029949332117, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5405/19440 [15:22:11<14:46:00, 3.79s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.0454, 'learning_rate': 0.00026430147728300336, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5406/19440 [15:22:14<14:50:29, 3.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8599, 'learning_rate': 0.00026428265507268556, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5407/19440 [15:22:18<14:52:53, 3.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7652, 'learning_rate': 0.0002642638328623677, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7348, 'learning_rate': 0.00026424501065204985, 'epoch': 0.83} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5408/19440 [15:22:22<14:42:36, 3.77s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5409/19440 [15:22:26<14:36:28, 3.75s/it] + 28%|████████████████████ | 5409/19440 [15:22:26<14:36:28, 3.75s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5410/19440 [15:22:29<14:30:10, 3.72s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9013, 'learning_rate': 0.0002642073662314142, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5411/19440 [15:22:33<14:19:57, 3.68s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8167, 'learning_rate': 0.00026418854402109635, 'epoch': 0.83} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5412/19440 [15:22:37<14:23:26, 3.69s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.9145, 'learning_rate': 0.00026416972181077854, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5413/19440 [15:22:40<14:37:59, 3.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8572, 'learning_rate': 0.0002641508996004607, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.7758, 'learning_rate': 0.0002641320773901429, 'epoch': 0.84} + 28%|████████████████████ | 5414/19440 [15:22:44<14:18:56, 3.67s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5415/19440 [15:22:47<13:59:53, 3.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5918, 'learning_rate': 0.00026411325517982503, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5416/19440 [15:22:51<13:44:21, 3.53s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6816, 'learning_rate': 0.0002640944329695072, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5758, 'learning_rate': 0.0002640756107591894, 'epoch': 0.84} + 28%|████████████████████ | 5417/19440 [15:22:54<13:31:53, 3.47s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5418/19440 [15:22:57<13:15:00, 3.40s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.7446, 'learning_rate': 0.0002640567885488715, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5419/19440 [15:23:01<13:04:18, 3.36s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5982, 'learning_rate': 0.0002640379663385537, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5420/19440 [15:23:04<12:56:37, 3.32s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.596, 'learning_rate': 0.00026401914412823587, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5421/19440 [15:23:07<12:49:44, 3.29s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.5991, 'learning_rate': 0.00026400032191791807, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2699, 'learning_rate': 0.0002639814997076002, 'epoch': 0.84} + 28%|████████████████████ | 5422/19440 [15:23:10<12:47:08, 3.28s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5423/19440 [15:23:13<12:39:16, 3.25s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.29, 'learning_rate': 0.00026396267749728236, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4969, 'learning_rate': 0.00026394385528696456, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5424/19440 [15:23:17<12:33:08, 3.22s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5425/19440 [15:23:20<12:54:35, 3.32s/it] + 28%|████████████████████ | 5425/19440 [15:23:20<12:54:35, 3.32s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5426/19440 [15:23:23<12:41:49, 3.26s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2905, 'learning_rate': 0.0002639062108663289, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5427/19440 [15:23:26<12:28:27, 3.20s/it] + 28%|████████████████████ | 5427/19440 [15:23:26<12:28:27, 3.20s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5428/19440 [15:23:29<12:15:21, 3.15s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3262, 'learning_rate': 0.00026386856644569325, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5429/19440 [15:23:32<12:06:40, 3.11s/it] + 28%|████████████████████ | 5429/19440 [15:23:32<12:06:40, 3.11s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5430/19440 [15:23:35<11:56:31, 3.07s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.9985, 'learning_rate': 0.00026383092202505754, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5431/19440 [15:23:38<11:48:29, 3.03s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3218, 'learning_rate': 0.0002638120998147397, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5432/19440 [15:23:42<12:18:21, 3.16s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.1871, 'learning_rate': 0.0002637932776044219, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████ | 5433/19440 [15:23:45<12:03:07, 3.10s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0757, 'learning_rate': 0.0002637744553941041, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5434/19440 [15:23:48<11:48:21, 3.03s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9613, 'learning_rate': 0.00026375563318378623, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5435/19440 [15:23:50<11:35:55, 2.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9311, 'learning_rate': 0.0002637368109734684, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5436/19440 [15:23:53<11:28:08, 2.95s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9537, 'learning_rate': 0.0002637179887631505, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5437/19440 [15:23:56<11:19:42, 2.91s/it] + 28%|████████████████████▏ | 5437/19440 [15:23:56<11:19:42, 2.91s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5438/19440 [15:23:59<11:42:09, 3.01s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.8016, 'learning_rate': 0.00026368034434251487, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5439/19440 [15:24:02<11:31:07, 2.96s/it] + 28%|████████████████████▏ | 5439/19440 [15:24:02<11:31:07, 2.96s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5440/19440 [15:24:05<11:16:26, 2.90s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.8277, 'learning_rate': 0.0002636426999218792, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5441/19440 [15:24:08<11:03:36, 2.84s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.6631, 'learning_rate': 0.0002636238777115614, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.7839, 'learning_rate': 0.00026360505550124356, 'epoch': 0.84} + 28%|████████████████████▏ | 5442/19440 [15:24:10<10:55:41, 2.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.5369, 'learning_rate': 0.0002635862332909257, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5443/19440 [15:24:13<10:44:55, 2.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5444/19440 [15:24:16<10:37:02, 2.73s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.6975, 'learning_rate': 0.0002635674110806079, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5445/19440 [15:24:18<10:26:04, 2.68s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.2611, 'learning_rate': 0.00026354858887029005, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5446/19440 [15:24:21<10:16:05, 2.64s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.6138, 'learning_rate': 0.00026352976665997225, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5447/19440 [15:24:23<10:08:14, 2.61s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.0781, 'learning_rate': 0.0002635109444496544, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▍ | 5448/19440 [15:24:26<9:58:28, 2.57s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.8467, 'learning_rate': 0.0002634921222393366, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▍ | 5449/19440 [15:24:28<9:48:22, 2.52s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.9404, 'learning_rate': 0.00026347330002901874, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5450/19440 [15:24:31<10:00:31, 2.58s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.4603, 'learning_rate': 0.0002634544778187009, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.5232, 'learning_rate': 0.0002634356556083831, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5451/19440 [15:24:36<12:32:10, 3.23s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.1705, 'learning_rate': 0.0002634168333980652, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5452/19440 [15:24:40<13:46:32, 3.55s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.1179, 'learning_rate': 0.0002633980111877474, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████���████████▏ | 5453/19440 [15:24:44<14:26:27, 3.72s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 7.0453, 'learning_rate': 0.00026337918897742957, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5454/19440 [15:24:48<14:45:07, 3.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 7.055, 'learning_rate': 0.00026336036676711177, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5455/19440 [15:24:52<14:48:27, 3.81s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.9114, 'learning_rate': 0.0002633415445567939, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5456/19440 [15:24:56<14:51:23, 3.82s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6926, 'learning_rate': 0.00026332272234647606, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5457/19440 [15:25:00<14:55:29, 3.84s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.8101, 'learning_rate': 0.00026330390013615826, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5458/19440 [15:25:03<14:45:33, 3.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5459/19440 [15:25:07<14:36:04, 3.76s/it] + 28%|████████████████████▏ | 5459/19440 [15:25:07<14:36:04, 3.76s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5921, 'learning_rate': 0.0002632662557155226, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5460/19440 [15:25:11<14:25:51, 3.72s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.8884, 'learning_rate': 0.00026324743350520475, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5461/19440 [15:25:14<14:13:46, 3.66s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6955, 'learning_rate': 0.0002632286112948869, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5462/19440 [15:25:18<14:05:36, 3.63s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.6144, 'learning_rate': 0.00026320978908456904, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5463/19440 [15:25:22<14:25:15, 3.71s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.5977, 'learning_rate': 0.00026319096687425124, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5464/19440 [15:25:25<14:11:29, 3.66s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.736, 'learning_rate': 0.0002631721446639334, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5465/19440 [15:25:29<13:52:28, 3.57s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.4369, 'learning_rate': 0.0002631533224536156, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5466/19440 [15:25:32<13:47:46, 3.55s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▏ | 5467/19440 [15:25:35<13:32:32, 3.49s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.6088, 'learning_rate': 0.0002631345002432978, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2617, 'learning_rate': 0.00026311567803297993, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5468/19440 [15:25:39<13:17:03, 3.42s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4902, 'learning_rate': 0.0002630968558226621, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5469/19440 [15:25:42<13:04:29, 3.37s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4919, 'learning_rate': 0.0002630780336123442, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5470/19440 [15:25:45<12:56:45, 3.34s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.4632, 'learning_rate': 0.0002630592114020264, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5471/19440 [15:25:48<12:46:00, 3.29s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3952, 'learning_rate': 0.00026304038919170857, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5472/19440 [15:25:52<12:38:45, 3.26s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.3476, 'learning_rate': 0.00026302156698139077, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|███████████��████████▎ | 5473/19440 [15:25:55<12:34:18, 3.24s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5474/19440 [15:25:58<12:26:17, 3.21s/it] + 28%|████████████████████▎ | 5474/19440 [15:25:58<12:26:17, 3.21s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5475/19440 [15:26:02<12:50:22, 3.31s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2908, 'learning_rate': 0.0002629839225607551, 'epoch': 0.84} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.3333, 'learning_rate': 0.00026296510035043726, 'epoch': 0.84} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5476/19440 [15:26:05<12:40:22, 3.27s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5477/19440 [15:26:08<12:24:18, 3.20s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.2369, 'learning_rate': 0.0002629462781401194, 'epoch': 0.85} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0442, 'learning_rate': 0.0002629274559298016, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5478/19440 [15:26:11<12:12:01, 3.15s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5479/19440 [15:26:14<12:04:32, 3.11s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.407, 'learning_rate': 0.00026290863371948375, 'epoch': 0.85} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.263, 'learning_rate': 0.00026288981150916595, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5480/19440 [15:26:17<11:56:28, 3.08s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5481/19440 [15:26:20<11:48:51, 3.05s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.0313, 'learning_rate': 0.0002628709892988481, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.2339, 'learning_rate': 0.0002628521670885303, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5482/19440 [15:26:23<11:38:46, 3.00s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5483/19440 [15:26:26<11:32:41, 2.98s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.9356, 'learning_rate': 0.00026283334487821244, 'epoch': 0.85} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 6.0939, 'learning_rate': 0.0002628145226678946, 'epoch': 0.85} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5484/19440 [15:26:28<11:23:25, 2.94s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.9858, 'learning_rate': 0.0002627957004575768, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5485/19440 [15:26:31<11:15:19, 2.90s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.9592, 'learning_rate': 0.0002627768782472589, 'epoch': 0.85} + 28%|████████████████████▎ | 5486/19440 [15:26:34<11:10:38, 2.88s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.91, 'learning_rate': 0.0002627580560369411, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5487/19440 [15:26:37<11:03:37, 2.85s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5488/19440 [15:26:40<11:28:15, 2.96s/it] + 28%|████████████████████▎ | 5488/19440 [15:26:40<11:28:15, 2.96s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.529, 'learning_rate': 0.0002627204116163054, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5489/19440 [15:26:43<11:18:01, 2.92s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 6.1441, 'learning_rate': 0.00026270158940598756, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5490/19440 [15:26:46<11:03:07, 2.85s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.7983, 'learning_rate': 0.00026268276719566976, 'epoch': 0.85} + 28%|████████████████████▎ | 5491/19440 [15:26:48<10:49:54, 2.80s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.39, 'learning_rate': 0.0002626639449853519, 'epoch': 0.85} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5492/19440 [15:26:51<10:38:25, 2.75s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.3332, 'learning_rate': 0.0002626451227750341, 'epoch': 0.85} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5493/19440 [15:26:53<10:28:44, 2.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5494/19440 [15:26:56<10:19:02, 2.66s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.5261, 'learning_rate': 0.0002626263005647163, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 5.531, 'learning_rate': 0.00026260747835439845, 'epoch': 0.85} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5495/19440 [15:26:59<10:09:33, 2.62s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.9314, 'learning_rate': 0.0002625886561440806, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▎ | 5496/19440 [15:27:01<10:01:15, 2.59s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 5.0433, 'learning_rate': 0.00026256983393376274, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▋ | 5497/19440 [15:27:04<9:55:42, 2.56s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▋ | 5498/19440 [15:27:06<9:46:10, 2.52s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +{'loss': 4.6725, 'learning_rate': 0.00026255101172344494, 'epoch': 0.85} +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.859, 'learning_rate': 0.0002625321895131271, 'epoch': 0.85} + 28%|████████████████████▋ | 5499/19440 [15:27:08<9:33:45, 2.47s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed +{'loss': 4.3866, 'learning_rate': 0.0002625133673028093, 'epoch': 0.85} +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`... +Could not estimate the number of tokens of the input, floating-point operations will not be computed + 28%|████████████████████▋ | 5500/19440 [15:27:11<9:49:54, 2.54s/it]The following columns in the evaluation set don't have a corresponding argument in `SpeechEncoderDecoderModel.forward` and have been ignored: length, lang. If length, lang are not expected by `SpeechEncoderDecoderModel.forward`, you can safely ignore this message. +***** Running Evaluation ***** + Num examples = 14760 + Batch size = 4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +100%|█████████████████████████████████████████████████████████████████████████████| 3690/3690 [1:01:30<00:00, 1.04it/s] + +Configuration saved in ./checkpoint-5500/config.json +Model weights saved in ./checkpoint-5500/pytorch_model.bin +Feature extractor saved in ./checkpoint-5500/preprocessor_config.json +Traceback (most recent call last): + File "/home/sanchit_huggingface_co/gcp/lib/python3.9/site-packages/torch/serialization.py", line 379, in save + _save(obj, opened_zipfile, pickle_module, pickle_protocol) + File "/home/sanchit_huggingface_co/gcp/lib/python3.9/site-packages/torch/serialization.py", line 499, in _save + zip_file.write_record(name, storage.data_ptr(), num_bytes) +OSError: [Errno 28] No space left on device +During handling of the above exception, another exception occurred: +Traceback (most recent call last): + File "/home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en_2/run_xtreme_s.py", line 947, in + main() + File "/home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en_2/run_xtreme_s.py", line 874, in main + train_result = trainer.train(resume_from_checkpoint=checkpoint) + File "/home/sanchit_huggingface_co/transformers/src/transformers/trainer.py", line 1524, in train + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) + File "/home/sanchit_huggingface_co/transformers/src/transformers/trainer.py", line 1655, in _maybe_log_save_evaluate + self._save_checkpoint(model, trial, metrics=metrics) + File "/home/sanchit_huggingface_co/transformers/src/transformers/trainer.py", line 1757, in _save_checkpoint + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + File "/home/sanchit_huggingface_co/gcp/lib/python3.9/site-packages/torch/serialization.py", line 380, in save + return + File "/home/sanchit_huggingface_co/gcp/lib/python3.9/site-packages/torch/serialization.py", line 259, in __exit__ + self.file_like.write_end_of_file() +RuntimeError: [enforce fail at inline_container.cc:300] . unexpected pos 2888813184 vs 2888813072 \ No newline at end of file