diff --git "a/train_job_output.txt" "b/train_job_output.txt" --- "a/train_job_output.txt" +++ "b/train_job_output.txt" @@ -487,4 +487,72 @@ command outputs: 85%|████████▌ | 10050/11788 [1:25:11<14:19, 2.02it/s] 85%|████████▌ | 10051/11788 [1:25:11<14:19, 2.02it/s] 85%|████████▌ | 10052/11788 [1:25:12<14:18, 2.02it/s] 85%|████████▌ | 10053/11788 [1:25:12<14:17, 2.02it/s] 85%|████████▌ | 10054/11788 [1:25:13<14:17, 2.02it/s] 85%|████████▌ | 10055/11788 [1:25:13<14:15, 2.03it/s] 85%|████████▌ | 10056/11788 [1:25:14<14:14, 2.03it/s] 85%|████████▌ | 10057/11788 [1:25:14<14:14, 2.03it/s] 85%|████████▌ | 10058/11788 [1:25:15<14:14, 2.02it/s] 85%|████████▌ | 10059/11788 [1:25:15<14:14, 2.02it/s] 85%|████████▌ | 10060/11788 [1:25:16<14:13, 2.02it/s] 85%|████████▌ | 10061/11788 [1:25:16<14:12, 2.02it/s] 85%|████████▌ | 10062/11788 [1:25:17<14:13, 2.02it/s] 85%|████████▌ | 10063/11788 [1:25:17<14:12, 2.02it/s] 85%|████████▌ | 10064/11788 [1:25:18<14:13, 2.02it/s] 85%|████████▌ | 10065/11788 [1:25:18<14:11, 2.02it/s] 85%|████████▌ | 10066/11788 [1:25:19<14:11, 2.02it/s] 85%|████████▌ | 10067/11788 [1:25:19<14:09, 2.02it/s] 85%|████████▌ | 10068/11788 [1:25:20<14:10, 2.02it/s] 85%|████████▌ | 10069/11788 [1:25:20<14:09, 2.02it/s] 85%|████████▌ | 10070/11788 [1:25:21<14:08, 2.02it/s] 85%|████████▌ | 10071/11788 [1:25:21<14:07, 2.03it/s] 85%|████████▌ | 10072/11788 [1:25:22<14:06, 2.03it/s] 85%|████████▌ | 10073/11788 [1:25:22<14:06, 2.03it/s] 85%|████████▌ | 10074/11788 [1:25:23<14:05, 2.03it/s] 85%|████████▌ | 10075/11788 [1:25:23<14:06, 2.02it/s]{'loss': 2.6053, 'grad_norm': 0.25624313950538635, 'learning_rate': 6.296119755458624e-05, 'epoch': 11.97} 85%|████████▌ | 10075/11788 [1:25:23<14:06, 2.02it/s] 85%|████████▌ | 10076/11788 [1:25:24<14:06, 2.02it/s] 85%|████████▌ | 10077/11788 [1:25:24<14:06, 2.02it/s] 85%|████████▌ | 10078/11788 [1:25:25<14:04, 2.03it/s] 86%|████████▌ | 10079/11788 [1:25:25<14:04, 2.02it/s] 86%|████████▌ | 10080/11788 [1:25:26<14:02, 2.03it/s] 86%|████████▌ | 10081/11788 [1:25:26<14:02, 2.03it/s] 86%|████████▌ | 10082/11788 [1:25:27<14:02, 2.03it/s] 86%|████████▌ | 10083/11788 [1:25:27<14:01, 2.03it/s] 86%|████████▌ | 10084/11788 [1:25:28<14:01, 2.03it/s] 86%|████████▌ | 10085/11788 [1:25:28<14:01, 2.02it/s] 86%|████████▌ | 10086/11788 [1:25:29<14:01, 2.02it/s] 86%|████████▌ | 10087/11788 [1:25:29<14:00, 2.02it/s] 86%|████████▌ | 10088/11788 [1:25:30<14:00, 2.02it/s] 86%|████████▌ | 10089/11788 [1:25:30<13:58, 2.03it/s] 86%|████████▌ | 10090/11788 [1:25:31<13:58, 2.02it/s] 86%|████████▌ | 10091/11788 [1:25:31<13:57, 2.03it/s] 86%|████████▌ | 10092/11788 [1:25:32<13:57, 2.03it/s] 86%|████████▌ | 10093/11788 [1:25:32<13:56, 2.03it/s] 86%|████████▌ | 10094/11788 [1:25:33<13:56, 2.03it/s] 86%|████████▌ | 10095/11788 [1:25:33<13:56, 2.02it/s] 86%|████████▌ | 10096/11788 [1:25:34<13:54, 2.03it/s] 86%|████████▌ | 10097/11788 [1:25:34<13:54, 2.03it/s] 86%|████████▌ | 10098/11788 [1:25:35<13:54, 2.03it/s] 86%|████████▌ | 10099/11788 [1:25:35<13:54, 2.02it/s] 86%|████████▌ | 10100/11788 [1:25:36<13:54, 2.02it/s]{'loss': 2.6224, 'grad_norm': 0.254876047372818, 'learning_rate': 6.117502033369798e-05, 'epoch': 12.0} 86%|████████▌ | 10100/11788 [1:25:36<13:54, 2.02it/s] 86%|████████▌ | 10101/11788 [1:25:36<13:54, 2.02it/s] 86%|████████▌ | 10102/11788 [1:25:37<13:54, 2.02it/s] 86%|████████▌ | 10103/11788 [1:25:37<13:53, 2.02it/s] 86%|████████▌ | 10104/11788 [1:25:38<13:47, 2.03it/s] 86%|████████▌ | 10105/11788 [1:25:49<1:50:03, 3.92s/it] 86%|████████▌ | 10106/11788 [1:25:50<1:21:12, 2.90s/it] 86%|████████▌ | 10107/11788 [1:25:50<1:00:57, 2.18s/it] 86%|████████▌ | 10108/11788 [1:25:51<46:48, 1.67s/it] 86%|████████▌ | 10109/11788 [1:25:51<36:58, 1.32s/it] 86%|████████▌ | 10110/11788 [1:25:52<29:59, 1.07s/it] 86%|████████▌ | 10111/11788 [1:25:52<25:10, 1.11it/s] 86%|████████▌ | 10112/11788 [1:25:53<21:49, 1.28it/s] 86%|████████▌ | 10113/11788 [1:25:53<19:23, 1.44it/s] 86%|████████▌ | 10114/11788 [1:25:54<17:44, 1.57it/s] 86%|████████▌ | 10115/11788 [1:25:54<16:31, 1.69it/s] 86%|████████▌ | 10116/11788 [1:25:55<15:42, 1.77it/s] 86%|████████▌ | 10117/11788 [1:25:55<15:07, 1.84it/s] 86%|████████▌ | 10118/11788 [1:25:56<14:43, 1.89it/s] 86%|████████▌ | 10119/11788 [1:25:56<14:24, 1.93it/s] 86%|████████▌ | 10120/11788 [1:25:57<14:12, 1.96it/s] 86%|████████▌ | 10121/11788 [1:25:57<14:02, 1.98it/s] 86%|████████▌ | 10122/11788 [1:25:58<13:57, 1.99it/s] 86%|████████▌ | 10123/11788 [1:25:58<13:52, 2.00it/s] 86%|████████▌ | 10124/11788 [1:25:59<13:49, 2.01it/s] 86%|████████▌ | 10125/11788 [1:25:59<13:47, 2.01it/s] {'loss': 2.5569, 'grad_norm': 0.2573976516723633, 'learning_rate': 5.9412893397509603e-05, 'epoch': 12.02} - 86%|████████▌ | 10125/11788 [1:25:59<13:47, 2.01it/s] 86%|████████▌ | 10126/11788 [1:26:00<13:47, 2.01it/s] 86%|████████▌ | 10127/11788 [1:26:00<13:45, 2.01it/s] 86%|████████▌ | 10128/11788 [1:26:01<13:44, 2.01it/s] 86%|████████▌ | 10129/11788 [1:26:01<13:42, 2.02it/s] \ No newline at end of file + 86%|████████▌ | 10125/11788 [1:25:59<13:47, 2.01it/s] 86%|████████▌ | 10126/11788 [1:26:00<13:47, 2.01it/s] 86%|████████▌ | 10127/11788 [1:26:00<13:45, 2.01it/s] 86%|████████▌ | 10128/11788 [1:26:01<13:44, 2.01it/s] 86%|████████▌ | 10129/11788 [1:26:01<13:42, 2.02it/s] 86%|████████▌ | 10130/11788 [1:26:02<13:41, 2.02it/s] 86%|████████▌ | 10131/11788 [1:26:02<13:39, 2.02it/s] 86%|████████▌ | 10132/11788 [1:26:03<13:39, 2.02it/s] 86%|████████▌ | 10133/11788 [1:26:03<13:37, 2.02it/s] 86%|████████▌ | 10134/11788 [1:26:04<13:38, 2.02it/s] 86%|████████▌ | 10135/11788 [1:26:04<13:36, 2.02it/s] 86%|████████▌ | 10136/11788 [1:26:05<13:36, 2.02it/s] 86%|████████▌ | 10137/11788 [1:26:05<13:35, 2.02it/s] 86%|████████▌ | 10138/11788 [1:26:06<13:35, 2.02it/s] 86%|████████▌ | 10139/11788 [1:26:06<13:34, 2.02it/s] 86%|████████▌ | 10140/11788 [1:26:07<13:34, 2.02it/s] 86%|████████▌ | 10141/11788 [1:26:07<13:33, 2.02it/s] 86%|████████▌ | 10142/11788 [1:26:08<13:32, 2.03it/s] 86%|████████▌ | 10143/11788 [1:26:08<13:32, 2.03it/s] 86%|████████▌ | 10144/11788 [1:26:09<13:33, 2.02it/s] 86%|████████▌ | 10145/11788 [1:26:09<13:32, 2.02it/s] 86%|████████▌ | 10146/11788 [1:26:10<13:30, 2.03it/s] 86%|████████▌ | 10147/11788 [1:26:10<13:30, 2.03it/s] 86%|████████▌ | 10148/11788 [1:26:11<13:29, 2.03it/s] 86%|████████▌ | 10149/11788 [1:26:11<13:28, 2.03it/s] 86%|████████▌ | 10150/11788 [1:26:12<13:27, 2.03it/s]{'loss': 2.5455, 'grad_norm': 0.25414496660232544, 'learning_rate': 5.76749133213218e-05, 'epoch': 12.05} + 86%|████████▌ | 10150/11788 [1:26:12<13:27, 2.03it/s] 86%|████████▌ | 10151/11788 [1:26:12<13:28, 2.02it/s] 86%|████████▌ | 10152/11788 [1:26:13<13:27, 2.03it/s] 86%|████████▌ | 10153/11788 [1:26:13<13:25, 2.03it/s] 86%|████████▌ | 10154/11788 [1:26:14<13:25, 2.03it/s] 86%|████████▌ | 10155/11788 [1:26:14<13:25, 2.03it/s] 86%|████████▌ | 10156/11788 [1:26:15<13:25, 2.02it/s] 86%|████████▌ | 10157/11788 [1:26:15<13:25, 2.03it/s] 86%|████████▌ | 10158/11788 [1:26:16<13:24, 2.03it/s] 86%|████████▌ | 10159/11788 [1:26:16<13:24, 2.02it/s] 86%|████████▌ | 10160/11788 [1:26:17<13:24, 2.02it/s] 86%|████████▌ | 10161/11788 [1:26:17<13:23, 2.02it/s] 86%|████████▌ | 10162/11788 [1:26:18<13:23, 2.02it/s] 86%|████████▌ | 10163/11788 [1:26:18<13:22, 2.02it/s] 86%|████████▌ | 10164/11788 [1:26:19<13:21, 2.03it/s] 86%|████████▌ | 10165/11788 [1:26:19<13:21, 2.02it/s] 86%|████████▌ | 10166/11788 [1:26:20<13:20, 2.03it/s] 86%|████████▌ | 10167/11788 [1:26:20<14:31, 1.86it/s] 86%|████████▋ | 10168/11788 [1:26:21<14:09, 1.91it/s] 86%|████████▋ | 10169/11788 [1:26:21<13:54, 1.94it/s] 86%|████████▋ | 10170/11788 [1:26:22<13:43, 1.97it/s] 86%|████████▋ | 10171/11788 [1:26:22<13:35, 1.98it/s] 86%|████████▋ | 10172/11788 [1:26:23<13:29, 2.00it/s] 86%|████████▋ | 10173/11788 [1:26:23<13:25, 2.01it/s] 86%|████████▋ | 10174/11788 [1:26:24<13:21, 2.01it/s] 86%|████████▋ | 10175/11788 [1:26:24<13:19, 2.02it/s]{'loss': 2.5565, 'grad_norm': 0.25472745299339294, 'learning_rate': 5.596117535704048e-05, 'epoch': 12.08} + 86%|████████▋ | 10175/11788 [1:26:24<13:19, 2.02it/s] 86%|████████▋ | 10176/11788 [1:26:25<13:18, 2.02it/s] 86%|████████▋ | 10177/11788 [1:26:25<13:17, 2.02it/s] 86%|████████▋ | 10178/11788 [1:26:26<13:15, 2.02it/s] 86%|████████▋ | 10179/11788 [1:26:26<13:14, 2.03it/s] 86%|████████▋ | 10180/11788 [1:26:27<13:13, 2.03it/s] 86%|████████▋ | 10181/11788 [1:26:27<13:12, 2.03it/s] 86%|████████▋ | 10182/11788 [1:26:28<13:12, 2.03it/s] 86%|████████▋ | 10183/11788 [1:26:28<13:11, 2.03it/s] 86%|████████▋ | 10184/11788 [1:26:29<13:12, 2.02it/s] 86%|████████▋ | 10185/11788 [1:26:29<13:10, 2.03it/s] 86%|████████▋ | 10186/11788 [1:26:30<13:10, 2.03it/s] 86%|████████▋ | 10187/11788 [1:26:30<13:09, 2.03it/s] 86%|████████▋ | 10188/11788 [1:26:31<13:08, 2.03it/s] 86%|████████▋ | 10189/11788 [1:26:31<13:09, 2.03it/s] 86%|████████▋ | 10190/11788 [1:26:32<13:08, 2.03it/s] 86%|████████▋ | 10191/11788 [1:26:32<13:08, 2.03it/s] 86%|████████▋ | 10192/11788 [1:26:33<13:06, 2.03it/s] 86%|████████▋ | 10193/11788 [1:26:33<13:10, 2.02it/s] 86%|████████▋ | 10194/11788 [1:26:34<13:07, 2.02it/s] 86%|████████▋ | 10195/11788 [1:26:34<13:06, 2.02it/s] 86%|████████▋ | 10196/11788 [1:26:35<13:05, 2.03it/s] 87%|████████▋ | 10197/11788 [1:26:35<13:05, 2.03it/s] 87%|████████▋ | 10198/11788 [1:26:36<13:04, 2.03it/s] 87%|████████▋ | 10199/11788 [1:26:36<13:03, 2.03it/s] 87%|████████▋ | 10200/11788 [1:26:37<13:03, 2.03it/s]{'loss': 2.5511, 'grad_norm': 0.25652530789375305, 'learning_rate': 5.427177342795592e-05, 'epoch': 12.11} + 87%|████████▋ | 10200/11788 [1:26:37<13:03, 2.03it/s] 87%|████████▋ | 10201/11788 [1:26:37<13:03, 2.03it/s] 87%|████████▋ | 10202/11788 [1:26:38<13:03, 2.03it/s] 87%|████████▋ | 10203/11788 [1:26:38<13:02, 2.03it/s] 87%|████████▋ | 10204/11788 [1:26:39<13:01, 2.03it/s] 87%|████████▋ | 10205/11788 [1:26:39<13:01, 2.03it/s] 87%|████████▋ | 10206/11788 [1:26:40<12:59, 2.03it/s] 87%|████████▋ | 10207/11788 [1:26:40<12:59, 2.03it/s] 87%|████████▋ | 10208/11788 [1:26:41<12:59, 2.03it/s] 87%|████████▋ | 10209/11788 [1:26:41<13:00, 2.02it/s] 87%|████████▋ | 10210/11788 [1:26:42<12:59, 2.02it/s] 87%|████████▋ | 10211/11788 [1:26:42<13:00, 2.02it/s] 87%|████████▋ | 10212/11788 [1:26:42<12:58, 2.02it/s] 87%|████████▋ | 10213/11788 [1:26:43<12:58, 2.02it/s] 87%|████████▋ | 10214/11788 [1:26:43<12:57, 2.02it/s] 87%|████████▋ | 10215/11788 [1:26:44<12:56, 2.03it/s] 87%|████████▋ | 10216/11788 [1:26:44<12:56, 2.03it/s] 87%|████████▋ | 10217/11788 [1:26:45<12:55, 2.03it/s] 87%|████████▋ | 10218/11788 [1:26:45<12:55, 2.02it/s] 87%|████████▋ | 10219/11788 [1:26:46<12:55, 2.02it/s] 87%|████████▋ | 10220/11788 [1:26:46<12:54, 2.02it/s] 87%|████████▋ | 10221/11788 [1:26:47<12:54, 2.02it/s] 87%|████████▋ | 10222/11788 [1:26:47<12:53, 2.02it/s] 87%|████████▋ | 10223/11788 [1:26:48<12:53, 2.02it/s] 87%|████████▋ | 10224/11788 [1:26:48<12:52, 2.03it/s] 87%|████████▋ | 10225/11788 [1:26:49<12:52, 2.02it/s]{'loss': 2.5585, 'grad_norm': 0.2553425133228302, 'learning_rate': 5.260680012359531e-05, 'epoch': 12.14} + 87%|████████▋ | 10225/11788 [1:26:49<12:52, 2.02it/s] 87%|████████▋ | 10226/11788 [1:26:49<12:52, 2.02it/s] 87%|████████▋ | 10227/11788 [1:26:50<12:51, 2.02it/s] 87%|███���████▋ | 10228/11788 [1:26:50<12:50, 2.02it/s] 87%|████████▋ | 10229/11788 [1:26:51<12:49, 2.02it/s] 87%|████████▋ | 10230/11788 [1:26:51<12:49, 2.03it/s] 87%|████████▋ | 10231/11788 [1:26:52<12:48, 2.02it/s] 87%|████████▋ | 10232/11788 [1:26:52<12:47, 2.03it/s] 87%|████████▋ | 10233/11788 [1:26:53<12:47, 2.03it/s] 87%|████████▋ | 10234/11788 [1:26:53<12:47, 2.03it/s] 87%|████████▋ | 10235/11788 [1:26:54<12:46, 2.03it/s] 87%|████████▋ | 10236/11788 [1:26:54<12:47, 2.02it/s] 87%|████████▋ | 10237/11788 [1:26:55<12:46, 2.02it/s] 87%|████████▋ | 10238/11788 [1:26:55<12:45, 2.03it/s] 87%|████████▋ | 10239/11788 [1:26:56<12:44, 2.03it/s] 87%|████████▋ | 10240/11788 [1:26:56<12:44, 2.02it/s] 87%|████████▋ | 10241/11788 [1:26:57<12:44, 2.02it/s] 87%|████████▋ | 10242/11788 [1:26:57<12:43, 2.02it/s] 87%|████████▋ | 10243/11788 [1:26:58<12:43, 2.02it/s] 87%|████████▋ | 10244/11788 [1:26:58<12:42, 2.02it/s] 87%|████████▋ | 10245/11788 [1:26:59<12:41, 2.03it/s] 87%|████████▋ | 10246/11788 [1:26:59<12:41, 2.03it/s] 87%|████████▋ | 10247/11788 [1:27:00<12:40, 2.03it/s] 87%|████████▋ | 10248/11788 [1:27:00<12:39, 2.03it/s] 87%|████████▋ | 10249/11788 [1:27:01<12:39, 2.03it/s] 87%|████████▋ | 10250/11788 [1:27:01<12:38, 2.03it/s]{'loss': 2.5698, 'grad_norm': 0.25583794713020325, 'learning_rate': 5.096634669464817e-05, 'epoch': 12.17} + 87%|████████▋ | 10250/11788 [1:27:01<12:38, 2.03it/s] 87%|████████▋ | 10251/11788 [1:27:02<12:39, 2.02it/s] 87%|████████▋ | 10252/11788 [1:27:02<12:38, 2.02it/s] 87%|████████▋ | 10253/11788 [1:27:03<13:43, 1.86it/s] 87%|████████▋ | 10254/11788 [1:27:03<13:23, 1.91it/s] 87%|████████▋ | 10255/11788 [1:27:04<13:08, 1.94it/s] 87%|████████▋ | 10256/11788 [1:27:04<12:59, 1.96it/s] 87%|████████▋ | 10257/11788 [1:27:05<12:52, 1.98it/s] 87%|████████▋ | 10258/11788 [1:27:05<12:48, 1.99it/s] 87%|████████▋ | 10259/11788 [1:27:06<12:44, 2.00it/s] 87%|████████▋ | 10260/11788 [1:27:06<12:41, 2.01it/s] 87%|████████▋ | 10261/11788 [1:27:07<12:38, 2.01it/s] 87%|████████▋ | 10262/11788 [1:27:07<12:37, 2.01it/s] 87%|████████▋ | 10263/11788 [1:27:08<12:36, 2.02it/s] 87%|████████▋ | 10264/11788 [1:27:08<12:34, 2.02it/s] 87%|████████▋ | 10265/11788 [1:27:09<12:33, 2.02it/s] 87%|████████▋ | 10266/11788 [1:27:09<12:33, 2.02it/s] 87%|████████▋ | 10267/11788 [1:27:10<12:33, 2.02it/s] 87%|████████▋ | 10268/11788 [1:27:10<12:32, 2.02it/s] 87%|████████▋ | 10269/11788 [1:27:11<12:31, 2.02it/s] 87%|████████▋ | 10270/11788 [1:27:11<12:30, 2.02it/s] 87%|████████▋ | 10271/11788 [1:27:12<12:29, 2.02it/s] 87%|████████▋ | 10272/11788 [1:27:12<12:29, 2.02it/s] 87%|████████▋ | 10273/11788 [1:27:13<12:28, 2.02it/s] 87%|████████▋ | 10274/11788 [1:27:13<12:28, 2.02it/s] 87%|████████▋ | 10275/11788 [1:27:14<12:27, 2.03it/s]{'loss': 2.5586, 'grad_norm': 0.25417497754096985, 'learning_rate': 4.9350503047965344e-05, 'epoch': 12.2} + 87%|████████▋ | 10275/11788 [1:27:14<12:27, 2.03it/s] 87%|████████▋ | 10276/11788 [1:27:14<12:27, 2.02it/s] 87%|████████▋ | 10277/11788 [1:27:15<12:26, 2.02it/s] 87%|████████▋ | 10278/11788 [1:27:15<12:26, 2.02it/s] 87%|████████▋ | 10279/11788 [1:27:16<12:25, 2.02it/s] 87%|████████▋ | 10280/11788 [1:27:16<12:25, 2.02it/s] 87%|████████▋ | 10281/11788 [1:27:17<12:24, 2.02it/s] 87%|████████▋ | 10282/11788 [1:27:17<12:24, 2.02it/s] 87%|████████▋ | 10283/11788 [1:27:18<12:23, 2.02it/s] 87%|████████▋ | 10284/11788 [1:27:18<12:22, 2.02it/s] 87%|████████▋ | 10285/11788 [1:27:19<12:22, 2.02it/s] 87%|████████▋ | 10286/11788 [1:27:19<12:21, 2.03it/s] 87%|████████▋ | 10287/11788 [1:27:20<12:22, 2.02it/s] 87%|████████▋ | 10288/11788 [1:27:20<12:22, 2.02it/s] 87%|████████▋ | 10289/11788 [1:27:21<12:21, 2.02it/s] 87%|████████▋ | 10290/11788 [1:27:21<12:20, 2.02it/s] 87%|████████▋ | 10291/11788 [1:27:22<12:20, 2.02it/s] 87%|████████▋ | 10292/11788 [1:27:22<12:19, 2.02it/s] 87%|████████▋ | 10293/11788 [1:27:23<12:19, 2.02it/s] 87%|████████▋ | 10294/11788 [1:27:23<12:19, 2.02it/s] 87%|████████▋ | 10295/11788 [1:27:24<12:18, 2.02it/s] 87%|████████▋ | 10296/11788 [1:27:24<12:18, 2.02it/s] 87%|████████▋ | 10297/11788 [1:27:25<12:17, 2.02it/s] 87%|████████▋ | 10298/11788 [1:27:25<12:17, 2.02it/s] 87%|████████▋ | 10299/11788 [1:27:26<12:17, 2.02it/s] 87%|████████▋ | 10300/11788 [1:27:26<12:17, 2.02it/s]{'loss': 2.5653, 'grad_norm': 0.2572789490222931, 'learning_rate': 4.7759357741631594e-05, 'epoch': 12.23} + 87%|████████▋ | 10300/11788 [1:27:26<12:17, 2.02it/s] 87%|████████▋ | 10301/11788 [1:27:27<12:16, 2.02it/s] 87%|████████▋ | 10302/11788 [1:27:27<12:16, 2.02it/s] 87%|████████▋ | 10303/11788 [1:27:28<12:17, 2.01it/s] 87%|████████▋ | 10304/11788 [1:27:28<12:16, 2.02it/s] 87%|████████▋ | 10305/11788 [1:27:29<12:14, 2.02it/s] 87%|████████▋ | 10306/11788 [1:27:29<12:13, 2.02it/s] 87%|████████▋ | 10307/11788 [1:27:30<12:13, 2.02it/s] 87%|████████▋ | 10308/11788 [1:27:30<12:11, 2.02it/s] 87%|████████▋ | 10309/11788 [1:27:31<12:12, 2.02it/s] 87%|████████▋ | 10310/11788 [1:27:31<12:10, 2.02it/s] 87%|████████▋ | 10311/11788 [1:27:32<12:10, 2.02it/s] 87%|████████▋ | 10312/11788 [1:27:32<12:09, 2.02it/s] 87%|████████▋ | 10313/11788 [1:27:33<12:09, 2.02it/s] 87%|████████▋ | 10314/11788 [1:27:33<12:07, 2.03it/s] 88%|████████▊ | 10315/11788 [1:27:34<12:08, 2.02it/s] 88%|████████▊ | 10316/11788 [1:27:34<12:07, 2.02it/s] 88%|████████▊ | 10317/11788 [1:27:35<12:08, 2.02it/s] 88%|████████▊ | 10318/11788 [1:27:35<12:06, 2.02it/s] 88%|████████▊ | 10319/11788 [1:27:36<12:07, 2.02it/s] 88%|████████▊ | 10320/11788 [1:27:36<12:06, 2.02it/s] 88%|████████▊ | 10321/11788 [1:27:37<12:06, 2.02it/s] 88%|████████▊ | 10322/11788 [1:27:37<12:05, 2.02it/s] 88%|████████▊ | 10323/11788 [1:27:38<12:05, 2.02it/s] 88%|████████▊ | 10324/11788 [1:27:38<12:04, 2.02it/s] 88%|████████▊ | 10325/11788 [1:27:38<12:04, 2.02it/s]{'loss': 2.5602, 'grad_norm': 0.2611031234264374, 'learning_rate': 4.6192997980112064e-05, 'epoch': 12.26} + 88%|████████▊ | 10325/11788 [1:27:39<12:04, 2.02it/s] 88%|████████▊ | 10326/11788 [1:27:39<12:04, 2.02it/s] 88%|████████▊ | 10327/11788 [1:27:39<12:03, 2.02it/s] 88%|████████▊ | 10328/11788 [1:27:40<12:03, 2.02it/s] 88%|████████▊ | 10329/11788 [1:27:40<12:02, 2.02it/s] 88%|████████▊ | 10330/11788 [1:27:41<12:01, 2.02it/s] 88%|████████▊ | 10331/11788 [1:27:41<12:01, 2.02it/s] 88%|████████▊ | 10332/11788 [1:27:42<12:00, 2.02it/s] 88%|████████▊ | 10333/11788 [1:27:42<12:01, 2.02it/s] 88%|████████▊ | 10334/11788 [1:27:43<12:01, 2.01it/s] 88%|████████▊ | 10335/11788 [1:27:43<12:01, 2.01it/s] 88%|████████▊ | 10336/11788 [1:27:44<12:00, 2.01it/s] 88%|████████▊ | 10337/11788 [1:27:44<11:59, 2.02it/s] 88%|████████▊ | 10338/11788 [1:27:45<11:58, 2.02it/s] 88%|████████▊ | 10339/11788 [1:27:45<11:57, 2.02it/s] 88%|████████▊ | 10340/11788 [1:27:46<11:57, 2.02it/s] 88%|████████▊ | 10341/11788 [1:27:46<11:56, 2.02it/s] 88%|████████▊ | 10342/11788 [1:27:47<11:55, 2.02it/s] 88%|████████▊ | 10343/11788 [1:27:47<11:54, 2.02it/s] 88%|████████▊ | 10344/11788 [1:27:48<11:54, 2.02it/s] 88%|████████▊ | 10345/11788 [1:27:48<11:52, 2.02it/s] 88%|████████▊ | 10346/11788 [1:27:49<11:52, 2.03it/s] 88%|████████▊ | 10347/11788 [1:27:49<11:51, 2.02it/s] 88%|████████▊ | 10348/11788 [1:27:50<11:51, 2.02it/s] 88%|████████▊ | 10349/11788 [1:27:50<11:51, 2.02it/s] 88%|████████▊ | 10350/11788 [1:27:51<11:50, 2.02it/s]{'loss': 2.5572, 'grad_norm': 0.2577296197414398, 'learning_rate': 4.4651509609472565e-05, 'epoch': 12.29} + 88%|████████▊ | 10350/11788 [1:27:51<11:50, 2.02it/s] 88%|████████▊ | 10351/11788 [1:27:51<11:50, 2.02it/s] 88%|████████▊ | 10352/11788 [1:27:52<11:49, 2.02it/s] 88%|████████▊ | 10353/11788 [1:27:52<11:50, 2.02it/s] 88%|████████▊ | 10354/11788 [1:27:53<11:49, 2.02it/s] 88%|████████▊ | 10355/11788 [1:27:53<11:49, 2.02it/s] 88%|████████▊ | 10356/11788 [1:27:54<11:47, 2.02it/s] 88%|████████▊ | 10357/11788 [1:27:54<11:47, 2.02it/s] 88%|████████▊ | 10358/11788 [1:27:55<11:46, 2.02it/s] 88%|████████▊ | 10359/11788 [1:27:55<11:46, 2.02it/s] 88%|████████▊ | 10360/11788 [1:27:56<11:45, 2.02it/s] 88%|████████▊ | 10361/11788 [1:27:56<11:47, 2.02it/s] 88%|████████▊ | 10362/11788 [1:27:57<11:45, 2.02it/s] 88%|████████▊ | 10363/11788 [1:27:57<11:45, 2.02it/s] 88%|████████▊ | 10364/11788 [1:27:58<11:44, 2.02it/s] 88%|████████▊ | 10365/11788 [1:27:58<11:44, 2.02it/s] 88%|████████▊ | 10366/11788 [1:27:59<11:42, 2.02it/s] 88%|████████▊ | 10367/11788 [1:27:59<11:42, 2.02it/s] 88%|████████▊ | 10368/11788 [1:28:00<11:41, 2.02it/s] 88%|████████▊ | 10369/11788 [1:28:00<11:41, 2.02it/s] 88%|████████▊ | 10370/11788 [1:28:01<11:40, 2.02it/s] 88%|████████▊ | 10371/11788 [1:28:01<11:40, 2.02it/s] 88%|████████▊ | 10372/11788 [1:28:02<11:39, 2.02it/s] 88%|████████▊ | 10373/11788 [1:28:02<11:39, 2.02it/s] 88%|████████▊ | 10374/11788 [1:28:03<11:38, 2.03it/s] 88%|████████▊ | 10375/11788 [1:28:03<11:37, 2.02it/s]{'loss': 2.5633, 'grad_norm': 0.2557050585746765, 'learning_rate': 4.313497711267567e-05, 'epoch': 12.32} + 88%|████████▊ | 10375/11788 [1:28:03<11:37, 2.02it/s] 88%|████████▊ | 10376/11788 [1:28:04<11:38, 2.02it/s] 88%|████████▊ | 10377/11788 [1:28:04<11:39, 2.02it/s] 88%|████████▊ | 10378/11788 [1:28:05<11:39, 2.01it/s] 88%|████████▊ | 10379/11788 [1:28:05<11:38, 2.02it/s] 88%|████████▊ | 10380/11788 [1:28:06<11:37, 2.02it/s] 88%|████████▊ | 10381/11788 [1:28:06<11:36, 2.02it/s] 88%|████████▊ | 10382/11788 [1:28:07<11:35, 2.02it/s] 88%|████████▊ | 10383/11788 [1:28:07<11:35, 2.02it/s] 88%|████████▊ | 10384/11788 [1:28:08<11:34, 2.02it/s] 88%|████████▊ | 10385/11788 [1:28:08<11:34, 2.02it/s] 88%|████████▊ | 10386/11788 [1:28:09<11:33, 2.02it/s] 88%|████████▊ | 10387/11788 [1:28:09<11:33, 2.02it/s] 88%|████████▊ | 10388/11788 [1:28:10<11:32, 2.02it/s] 88%|████████▊ | 10389/11788 [1:28:10<11:32, 2.02it/s] 88%|████████▊ | 10390/11788 [1:28:11<11:31, 2.02it/s] 88%|████████▊ | 10391/11788 [1:28:11<11:31, 2.02it/s] 88%|████████▊ | 10392/11788 [1:28:12<11:31, 2.02it/s] 88%|████████▊ | 10393/11788 [1:28:12<11:30, 2.02it/s] 88%|████████▊ | 10394/11788 [1:28:13<11:29, 2.02it/s] 88%|████████▊ | 10395/11788 [1:28:13<11:28, 2.02it/s] 88%|████████▊ | 10396/11788 [1:28:14<11:28, 2.02it/s] 88%|████████▊ | 10397/11788 [1:28:14<11:27, 2.02it/s] 88%|████████▊ | 10398/11788 [1:28:15<11:26, 2.02it/s] 88%|████████▊ | 10399/11788 [1:28:15<11:26, 2.02it/s] 88%|████████▊ | 10400/11788 [1:28:16<11:26, 2.02it/s]{'loss': 2.5639, 'grad_norm': 0.25953951478004456, 'learning_rate': 4.164348360494941e-05, 'epoch': 12.35} + 88%|████████▊ | 10400/11788 [1:28:16<11:26, 2.02it/s] 88%|████████▊ | 10401/11788 [1:28:16<11:26, 2.02it/s] 88%|████████▊ | 10402/11788 [1:28:17<11:25, 2.02it/s] 88%|████████▊ | 10403/11788 [1:28:17<11:25, 2.02it/s] 88%|████████▊ | 10404/11788 [1:28:18<11:23, 2.02it/s] 88%|████████▊ | 10405/11788 [1:28:18<11:23, 2.02it/s] 88%|████████▊ | 10406/11788 [1:28:19<11:23, 2.02it/s] 88%|████████▊ | 10407/11788 [1:28:19<11:22, 2.02it/s] 88%|████████▊ | 10408/11788 [1:28:20<11:21, 2.02it/s] 88%|████████▊ | 10409/11788 [1:28:20<11:22, 2.02it/s] 88%|████████▊ | 10410/11788 [1:28:21<11:21, 2.02it/s] 88%|████████▊ | 10411/11788 [1:28:21<11:21, 2.02it/s] 88%|████████▊ | 10412/11788 [1:28:22<11:20, 2.02it/s] 88%|██��█████▊ | 10413/11788 [1:28:22<11:20, 2.02it/s] 88%|████████▊ | 10414/11788 [1:28:23<11:18, 2.02it/s] 88%|████████▊ | 10415/11788 [1:28:23<11:19, 2.02it/s] 88%|████████▊ | 10416/11788 [1:28:24<11:18, 2.02it/s] 88%|████████▊ | 10417/11788 [1:28:24<11:18, 2.02it/s] 88%|████████▊ | 10418/11788 [1:28:25<11:16, 2.02it/s] 88%|████████▊ | 10419/11788 [1:28:25<11:17, 2.02it/s] 88%|████████▊ | 10420/11788 [1:28:25<11:16, 2.02it/s] 88%|████████▊ | 10421/11788 [1:28:26<11:16, 2.02it/s] 88%|████████▊ | 10422/11788 [1:28:26<11:15, 2.02it/s] 88%|████████▊ | 10423/11788 [1:28:27<11:15, 2.02it/s] 88%|████████▊ | 10424/11788 [1:28:27<11:14, 2.02it/s] 88%|████████▊ | 10425/11788 [1:28:28<11:13, 2.02it/s]{'loss': 2.5667, 'grad_norm': 0.25528305768966675, 'learning_rate': 4.0177110829232536e-05, 'epoch': 12.38} + 88%|████████▊ | 10425/11788 [1:28:28<11:13, 2.02it/s] 88%|████████▊ | 10426/11788 [1:28:28<11:13, 2.02it/s] 88%|████████▊ | 10427/11788 [1:28:29<11:13, 2.02it/s] 88%|████████▊ | 10428/11788 [1:28:29<11:12, 2.02it/s] 88%|████████▊ | 10429/11788 [1:28:30<11:12, 2.02it/s] 88%|████████▊ | 10430/11788 [1:28:30<11:11, 2.02it/s] 88%|████████▊ | 10431/11788 [1:28:31<11:11, 2.02it/s] 88%|████████▊ | 10432/11788 [1:28:31<11:10, 2.02it/s] 89%|████████▊ | 10433/11788 [1:28:32<11:09, 2.02it/s] 89%|████████▊ | 10434/11788 [1:28:32<11:09, 2.02it/s] 89%|████████▊ | 10435/11788 [1:28:33<11:08, 2.02it/s] 89%|████████▊ | 10436/11788 [1:28:33<11:08, 2.02it/s] 89%|████████▊ | 10437/11788 [1:28:34<11:07, 2.02it/s] 89%|████████▊ | 10438/11788 [1:28:34<11:06, 2.02it/s] 89%|████████▊ | 10439/11788 [1:28:35<11:05, 2.03it/s] 89%|████████▊ | 10440/11788 [1:28:35<11:06, 2.02it/s] 89%|████████▊ | 10441/11788 [1:28:36<11:05, 2.02it/s] 89%|████████▊ | 10442/11788 [1:28:36<11:05, 2.02it/s] 89%|████████▊ | 10443/11788 [1:28:37<11:04, 2.02it/s] 89%|████████▊ | 10444/11788 [1:28:37<11:04, 2.02it/s] 89%|████████▊ | 10445/11788 [1:28:38<11:03, 2.02it/s] 89%|████████▊ | 10446/11788 [1:28:38<11:03, 2.02it/s] 89%|████████▊ | 10447/11788 [1:28:39<11:02, 2.02it/s] 89%|████████▊ | 10448/11788 [1:28:39<11:02, 2.02it/s] 89%|████████▊ | 10449/11788 [1:28:40<11:01, 2.02it/s] 89%|████████▊ | 10450/11788 [1:28:40<11:01, 2.02it/s]{'loss': 2.5643, 'grad_norm': 0.2559700608253479, 'learning_rate': 3.873593915169465e-05, 'epoch': 12.41} + 89%|████████▊ | 10450/11788 [1:28:40<11:01, 2.02it/s] 89%|████████▊ | 10451/11788 [1:28:41<11:01, 2.02it/s] 89%|████████▊ | 10452/11788 [1:28:41<11:01, 2.02it/s] 89%|████████▊ | 10453/11788 [1:28:42<11:00, 2.02it/s] 89%|████████▊ | 10454/11788 [1:28:42<11:00, 2.02it/s] 89%|████████▊ | 10455/11788 [1:28:43<10:59, 2.02it/s] 89%|████████▊ | 10456/11788 [1:28:43<10:59, 2.02it/s] 89%|████████▊ | 10457/11788 [1:28:44<10:58, 2.02it/s] 89%|████████▊ | 10458/11788 [1:28:44<10:58, 2.02it/s] 89%|████████▊ | 10459/11788 [1:28:45<10:56, 2.02it/s] 89%|████████▊ | 10460/11788 [1:28:45<10:56, 2.02it/s] 89%|████████▊ | 10461/11788 [1:28:46<10:55, 2.02it/s] 89%|████████▉ | 10462/11788 [1:28:46<10:55, 2.02it/s] 89%|████████▉ | 10463/11788 [1:28:47<10:54, 2.03it/s] 89%|████████▉ | 10464/11788 [1:28:47<10:54, 2.02it/s] 89%|████████▉ | 10465/11788 [1:28:48<10:54, 2.02it/s] 89%|████████▉ | 10466/11788 [1:28:48<10:53, 2.02it/s] 89%|████████▉ | 10467/11788 [1:28:49<10:52, 2.02it/s] 89%|████████▉ | 10468/11788 [1:28:49<10:52, 2.02it/s] 89%|████████▉ | 10469/11788 [1:28:50<10:51, 2.02it/s] 89%|████████▉ | 10470/11788 [1:28:50<10:51, 2.02it/s] 89%|████████▉ | 10471/11788 [1:28:51<10:50, 2.03it/s] 89%|████████▉ | 10472/11788 [1:28:51<10:48, 2.03it/s] 89%|████████▉ | 10473/11788 [1:28:52<10:49, 2.02it/s] 89%|████████▉ | 10474/11788 [1:28:52<10:48, 2.03it/s] 89%|████████▉ | 10475/11788 [1:28:53<10:49, 2.02it/s]{'loss': 2.5566, 'grad_norm': 0.26133981347084045, 'learning_rate': 3.7320047557331705e-05, 'epoch': 12.44} + 89%|████████▉ | 10475/11788 [1:28:53<10:49, 2.02it/s] 89%|████████▉ | 10476/11788 [1:28:53<10:48, 2.02it/s] 89%|████████▉ | 10477/11788 [1:28:54<10:48, 2.02it/s] 89%|████████▉ | 10478/11788 [1:28:54<10:47, 2.02it/s] 89%|████████▉ | 10479/11788 [1:28:55<10:47, 2.02it/s] 89%|████████▉ | 10480/11788 [1:28:55<10:46, 2.02it/s] 89%|████████▉ | 10481/11788 [1:28:56<10:46, 2.02it/s] 89%|████████▉ | 10482/11788 [1:28:56<10:45, 2.02it/s] 89%|████████▉ | 10483/11788 [1:28:57<10:44, 2.02it/s] 89%|████████▉ | 10484/11788 [1:28:57<10:44, 2.02it/s] 89%|████████▉ | 10485/11788 [1:28:58<10:43, 2.03it/s] 89%|████████▉ | 10486/11788 [1:28:58<10:43, 2.02it/s] 89%|████████▉ | 10487/11788 [1:28:59<10:42, 2.02it/s] 89%|████████▉ | 10488/11788 [1:28:59<10:42, 2.02it/s] 89%|████████▉ | 10489/11788 [1:29:00<10:41, 2.02it/s] 89%|████████▉ | 10490/11788 [1:29:00<10:42, 2.02it/s] 89%|████████▉ | 10491/11788 [1:29:01<10:41, 2.02it/s] 89%|████████▉ | 10492/11788 [1:29:01<10:41, 2.02it/s] 89%|████████▉ | 10493/11788 [1:29:02<10:40, 2.02it/s] 89%|████████▉ | 10494/11788 [1:29:02<10:39, 2.02it/s] 89%|████████▉ | 10495/11788 [1:29:03<10:39, 2.02it/s] 89%|████████▉ | 10496/11788 [1:29:03<10:39, 2.02it/s] 89%|████████▉ | 10497/11788 [1:29:04<10:38, 2.02it/s] 89%|████████▉ | 10498/11788 [1:29:04<10:37, 2.02it/s] 89%|████████▉ | 10499/11788 [1:29:05<10:37, 2.02it/s] 89%|████████▉ | 10500/11788 [1:29:05<10:37, 2.02it/s]{'loss': 2.5593, 'grad_norm': 0.2581780254840851, 'learning_rate': 3.5929513645636516e-05, 'epoch': 12.47} + 89%|████████▉ | 10500/11788 [1:29:05<10:37, 2.02it/s] 89%|████████▉ | 10501/11788 [1:29:06<10:37, 2.02it/s] 89%|████████▉ | 10502/11788 [1:29:06<10:36, 2.02it/s] 89%|████████▉ | 10503/11788 [1:29:07<10:36, 2.02it/s] 89%|████████▉ | 10504/11788 [1:29:07<10:35, 2.02it/s] 89%|████████▉ | 10505/11788 [1:29:08<10:35, 2.02it/s] 89%|████████▉ | 10506/11788 [1:29:08<10:34, 2.02it/s] 89%|████████▉ | 10507/11788 [1:29:09<10:33, 2.02it/s] 89%|████████▉ | 10508/11788 [1:29:09<10:33, 2.02it/s] 89%|████████▉ | 10509/11788 [1:29:10<10:32, 2.02it/s] 89%|████████▉ | 10510/11788 [1:29:10<10:32, 2.02it/s] 89%|████████▉ | 10511/11788 [1:29:10<10:31, 2.02it/s] 89%|████████▉ | 10512/11788 [1:29:11<10:31, 2.02it/s] 89%|████████▉ | 10513/11788 [1:29:11<10:30, 2.02it/s] 89%|████████▉ | 10514/11788 [1:29:12<10:30, 2.02it/s] 89%|████████▉ | 10515/11788 [1:29:12<10:29, 2.02it/s] 89%|████████▉ | 10516/11788 [1:29:13<10:29, 2.02it/s] 89%|████████▉ | 10517/11788 [1:29:13<10:28, 2.02it/s] 89%|████████▉ | 10518/11788 [1:29:14<10:28, 2.02it/s] 89%|████████▉ | 10519/11788 [1:29:14<10:27, 2.02it/s] 89%|████████▉ | 10520/11788 [1:29:15<10:27, 2.02it/s] 89%|████████▉ | 10521/11788 [1:29:15<10:26, 2.02it/s] 89%|████████▉ | 10522/11788 [1:29:16<10:25, 2.02it/s] 89%|████████▉ | 10523/11788 [1:29:16<10:25, 2.02it/s] 89%|████████▉ | 10524/11788 [1:29:17<10:24, 2.02it/s] 89%|████████▉ | 10525/11788 [1:29:17<10:24, 2.02it/s]{'loss': 2.5847, 'grad_norm': 0.2545866370201111, 'learning_rate': 3.4564413626346434e-05, 'epoch': 12.5} + 89%|████████▉ | 10525/11788 [1:29:17<10:24, 2.02it/s] 89%|████████▉ | 10526/11788 [1:29:18<10:25, 2.02it/s] 89%|████████▉ | 10527/11788 [1:29:18<10:24, 2.02it/s] 89%|████████▉ | 10528/11788 [1:29:19<10:23, 2.02it/s] 89%|████████▉ | 10529/11788 [1:29:19<10:23, 2.02it/s] 89%|████████▉ | 10530/11788 [1:29:20<10:22, 2.02it/s] 89%|████████▉ | 10531/11788 [1:29:20<10:21, 2.02it/s] 89%|████████▉ | 10532/11788 [1:29:21<10:20, 2.02it/s] 89%|████████▉ | 10533/11788 [1:29:21<10:20, 2.02it/s] 89%|████████▉ | 10534/11788 [1:29:22<10:19, 2.02it/s] 89%|████████��� | 10535/11788 [1:29:22<10:19, 2.02it/s] 89%|████████▉ | 10536/11788 [1:29:23<10:19, 2.02it/s] 89%|████████▉ | 10537/11788 [1:29:23<10:18, 2.02it/s] 89%|████████▉ | 10538/11788 [1:29:24<10:18, 2.02it/s] 89%|████████▉ | 10539/11788 [1:29:24<10:17, 2.02it/s] 89%|████████▉ | 10540/11788 [1:29:25<10:17, 2.02it/s] 89%|████████▉ | 10541/11788 [1:29:25<10:16, 2.02it/s] 89%|████████▉ | 10542/11788 [1:29:26<10:16, 2.02it/s] 89%|████████▉ | 10543/11788 [1:29:26<10:15, 2.02it/s] 89%|████████▉ | 10544/11788 [1:29:27<10:15, 2.02it/s] 89%|████████▉ | 10545/11788 [1:29:27<10:14, 2.02it/s] 89%|████████▉ | 10546/11788 [1:29:28<10:14, 2.02it/s] 89%|████████▉ | 10547/11788 [1:29:28<10:13, 2.02it/s] 89%|████████▉ | 10548/11788 [1:29:29<10:13, 2.02it/s] 89%|████████▉ | 10549/11788 [1:29:29<10:13, 2.02it/s] 89%|████████▉ | 10550/11788 [1:29:30<10:13, 2.02it/s]{'loss': 2.5735, 'grad_norm': 0.25575879216194153, 'learning_rate': 3.322482231526675e-05, 'epoch': 12.53} + 89%|████████▉ | 10550/11788 [1:29:30<10:13, 2.02it/s] 90%|████████▉ | 10551/11788 [1:29:30<10:12, 2.02it/s] 90%|████████▉ | 10552/11788 [1:29:31<10:11, 2.02it/s] 90%|████████▉ | 10553/11788 [1:29:31<10:10, 2.02it/s] 90%|████████▉ | 10554/11788 [1:29:32<10:10, 2.02it/s] 90%|████████▉ | 10555/11788 [1:29:32<10:09, 2.02it/s] 90%|████████▉ | 10556/11788 [1:29:33<10:09, 2.02it/s] 90%|████████▉ | 10557/11788 [1:29:33<10:08, 2.02it/s] 90%|████████▉ | 10558/11788 [1:29:34<10:08, 2.02it/s] 90%|████████▉ | 10559/11788 [1:29:34<10:07, 2.02it/s] 90%|████████▉ | 10560/11788 [1:29:35<10:07, 2.02it/s] 90%|████████▉ | 10561/11788 [1:29:35<10:06, 2.02it/s] 90%|████████▉ | 10562/11788 [1:29:36<10:06, 2.02it/s] 90%|████████▉ | 10563/11788 [1:29:36<10:05, 2.02it/s] 90%|████████▉ | 10564/11788 [1:29:37<10:05, 2.02it/s] 90%|████████▉ | 10565/11788 [1:29:37<10:04, 2.02it/s] 90%|████████▉ | 10566/11788 [1:29:38<10:04, 2.02it/s] 90%|████████▉ | 10567/11788 [1:29:38<10:03, 2.02it/s] 90%|████████▉ | 10568/11788 [1:29:39<10:04, 2.02it/s] 90%|████████▉ | 10569/11788 [1:29:39<10:03, 2.02it/s] 90%|████████▉ | 10570/11788 [1:29:40<10:02, 2.02it/s] 90%|████████▉ | 10571/11788 [1:29:40<10:01, 2.02it/s] 90%|████████▉ | 10572/11788 [1:29:41<10:01, 2.02it/s] 90%|████████▉ | 10573/11788 [1:29:41<10:00, 2.02it/s] 90%|████████▉ | 10574/11788 [1:29:42<10:00, 2.02it/s] 90%|████████▉ | 10575/11788 [1:29:42<09:59, 2.02it/s]{'loss': 2.5743, 'grad_norm': 0.25533536076545715, 'learning_rate': 3.1910813130169355e-05, 'epoch': 12.56} + 90%|████████▉ | 10575/11788 [1:29:42<09:59, 2.02it/s] 90%|████████▉ | 10576/11788 [1:29:43<10:00, 2.02it/s] 90%|████████▉ | 10577/11788 [1:29:43<09:58, 2.02it/s] 90%|████████▉ | 10578/11788 [1:29:44<09:58, 2.02it/s] 90%|████████▉ | 10579/11788 [1:29:44<09:57, 2.02it/s] 90%|████████▉ | 10580/11788 [1:29:45<09:57, 2.02it/s] 90%|████████▉ | 10581/11788 [1:29:45<09:56, 2.02it/s] 90%|████████▉ | 10582/11788 [1:29:46<09:56, 2.02it/s] 90%|████████▉ | 10583/11788 [1:29:46<09:55, 2.02it/s] 90%|████████▉ | 10584/11788 [1:29:47<09:55, 2.02it/s] 90%|████████▉ | 10585/11788 [1:29:47<09:54, 2.02it/s] 90%|████████▉ | 10586/11788 [1:29:48<09:55, 2.02it/s] 90%|████████▉ | 10587/11788 [1:29:48<09:54, 2.02it/s] 90%|████████▉ | 10588/11788 [1:29:49<09:53, 2.02it/s] 90%|████████▉ | 10589/11788 [1:29:49<09:52, 2.02it/s] 90%|████████▉ | 10590/11788 [1:29:50<09:52, 2.02it/s] 90%|████████▉ | 10591/11788 [1:29:50<09:51, 2.02it/s] 90%|████████▉ | 10592/11788 [1:29:51<09:51, 2.02it/s] 90%|████████▉ | 10593/11788 [1:29:51<09:50, 2.02it/s] 90%|████████▉ | 10594/11788 [1:29:52<09:50, 2.02it/s] 90%|████████▉ | 10595/11788 [1:29:52<09:49, 2.02it/s] 90%|████████▉ | 10596/11788 [1:29:53<09:49, 2.02it/s] 90%|████████▉ | 10597/11788 [1:29:53<09:48, 2.02it/s] 90%|█���██████▉ | 10598/11788 [1:29:54<09:48, 2.02it/s] 90%|████████▉ | 10599/11788 [1:29:54<09:47, 2.02it/s] 90%|████████▉ | 10600/11788 [1:29:55<09:47, 2.02it/s]{'loss': 2.5668, 'grad_norm': 0.2539268136024475, 'learning_rate': 3.062245808677022e-05, 'epoch': 12.59} + 90%|████████▉ | 10600/11788 [1:29:55<09:47, 2.02it/s] 90%|████████▉ | 10601/11788 [1:29:55<09:47, 2.02it/s] 90%|████████▉ | 10602/11788 [1:29:55<09:47, 2.02it/s] 90%|████████▉ | 10603/11788 [1:29:56<09:46, 2.02it/s] 90%|████████▉ | 10604/11788 [1:29:56<09:45, 2.02it/s] 90%|████████▉ | 10605/11788 [1:29:57<09:44, 2.02it/s] 90%|████████▉ | 10606/11788 [1:29:57<09:44, 2.02it/s] 90%|████████▉ | 10607/11788 [1:29:58<09:43, 2.02it/s] 90%|████████▉ | 10608/11788 [1:29:58<09:43, 2.02it/s] 90%|████████▉ | 10609/11788 [1:29:59<09:43, 2.02it/s] 90%|█████████ | 10610/11788 [1:29:59<09:42, 2.02it/s] 90%|█████████ | 10611/11788 [1:30:00<09:41, 2.02it/s] 90%|█████████ | 10612/11788 [1:30:00<09:41, 2.02it/s] 90%|█████████ | 10613/11788 [1:30:01<09:40, 2.02it/s] 90%|█████████ | 10614/11788 [1:30:01<09:41, 2.02it/s] 90%|█████████ | 10615/11788 [1:30:02<09:40, 2.02it/s] 90%|█████████ | 10616/11788 [1:30:02<09:40, 2.02it/s] 90%|█████████ | 10617/11788 [1:30:03<09:39, 2.02it/s] 90%|█████████ | 10618/11788 [1:30:03<09:39, 2.02it/s] 90%|█████████ | 10619/11788 [1:30:04<09:38, 2.02it/s] 90%|█████████ | 10620/11788 [1:30:04<09:38, 2.02it/s] 90%|█████████ | 10621/11788 [1:30:05<09:37, 2.02it/s] 90%|█████████ | 10622/11788 [1:30:05<09:37, 2.02it/s] 90%|█████████ | 10623/11788 [1:30:06<09:36, 2.02it/s] 90%|█████████ | 10624/11788 [1:30:06<09:36, 2.02it/s] 90%|█████████ | 10625/11788 [1:30:07<09:35, 2.02it/s]{'loss': 2.556, 'grad_norm': 0.259285569190979, 'learning_rate': 2.935982779478158e-05, 'epoch': 12.62} + 90%|█████████ | 10625/11788 [1:30:07<09:35, 2.02it/s] 90%|█████████ | 10626/11788 [1:30:07<09:36, 2.02it/s] 90%|█████████ | 10627/11788 [1:30:08<09:35, 2.02it/s] 90%|█████████ | 10628/11788 [1:30:08<09:34, 2.02it/s] 90%|█████████ | 10629/11788 [1:30:09<09:34, 2.02it/s] 90%|█████████ | 10630/11788 [1:30:09<09:33, 2.02it/s] 90%|█████████ | 10631/11788 [1:30:10<09:33, 2.02it/s] 90%|█████████ | 10632/11788 [1:30:10<09:32, 2.02it/s] 90%|█████████ | 10633/11788 [1:30:11<09:31, 2.02it/s] 90%|█████████ | 10634/11788 [1:30:11<09:30, 2.02it/s] 90%|█████████ | 10635/11788 [1:30:12<09:30, 2.02it/s] 90%|█████████ | 10636/11788 [1:30:12<09:30, 2.02it/s] 90%|█████████ | 10637/11788 [1:30:13<09:29, 2.02it/s] 90%|█████████ | 10638/11788 [1:30:13<09:28, 2.02it/s] 90%|█████████ | 10639/11788 [1:30:14<09:28, 2.02it/s] 90%|█████████ | 10640/11788 [1:30:14<09:27, 2.02it/s] 90%|█████████ | 10641/11788 [1:30:15<09:26, 2.02it/s] 90%|█████████ | 10642/11788 [1:30:15<09:26, 2.02it/s] 90%|█████████ | 10643/11788 [1:30:16<09:26, 2.02it/s] 90%|█████████ | 10644/11788 [1:30:16<09:26, 2.02it/s] 90%|█████████ | 10645/11788 [1:30:17<09:25, 2.02it/s] 90%|█████████ | 10646/11788 [1:30:17<09:24, 2.02it/s] 90%|█████████ | 10647/11788 [1:30:18<09:23, 2.02it/s] 90%|█████████ | 10648/11788 [1:30:18<09:23, 2.02it/s] 90%|█████████ | 10649/11788 [1:30:19<09:23, 2.02it/s] 90%|█████████ | 10650/11788 [1:30:19<09:22, 2.02it/s]{'loss': 2.5786, 'grad_norm': 0.2575604319572449, 'learning_rate': 2.8122991454042623e-05, 'epoch': 12.65} + 90%|█████████ | 10650/11788 [1:30:19<09:22, 2.02it/s] 90%|█████████ | 10651/11788 [1:30:20<09:22, 2.02it/s] 90%|█████████ | 10652/11788 [1:30:20<09:22, 2.02it/s] 90%|█████████ | 10653/11788 [1:30:21<09:21, 2.02it/s] 90%|█████████ | 10654/11788 [1:30:21<09:20, 2.02it/s] 90%|█████████ | 10655/11788 [1:30:22<09:20, 2.02it/s] 90%|█████████ | 10656/11788 [1:30:22<09:19, 2.02it/s] 90%|█████████ | 10657/11788 [1:30:23<09:19, 2.02it/s] 90%|█████████ | 10658/11788 [1:30:23<09:18, 2.02it/s] 90%|█████████ | 10659/11788 [1:30:24<09:17, 2.02it/s] 90%|█████████ | 10660/11788 [1:30:24<09:17, 2.02it/s] 90%|█████████ | 10661/11788 [1:30:25<09:16, 2.02it/s] 90%|█████████ | 10662/11788 [1:30:25<09:16, 2.02it/s] 90%|█████████ | 10663/11788 [1:30:26<09:15, 2.02it/s] 90%|█████████ | 10664/11788 [1:30:26<09:15, 2.02it/s] 90%|█████████ | 10665/11788 [1:30:27<09:15, 2.02it/s] 90%|█████████ | 10666/11788 [1:30:27<09:14, 2.02it/s] 90%|█████████ | 10667/11788 [1:30:28<09:14, 2.02it/s] 90%|█████████ | 10668/11788 [1:30:28<09:13, 2.02it/s] 91%|█████████ | 10669/11788 [1:30:29<09:12, 2.02it/s] 91%|█████████ | 10670/11788 [1:30:29<09:12, 2.02it/s] 91%|█████████ | 10671/11788 [1:30:30<09:12, 2.02it/s] 91%|█████████ | 10672/11788 [1:30:30<09:12, 2.02it/s] 91%|█████████ | 10673/11788 [1:30:31<09:11, 2.02it/s] 91%|█████████ | 10674/11788 [1:30:31<09:11, 2.02it/s] 91%|█████████ | 10675/11788 [1:30:32<09:10, 2.02it/s]{'loss': 2.5719, 'grad_norm': 0.25431135296821594, 'learning_rate': 2.691201685072675e-05, 'epoch': 12.68} + 91%|█████████ | 10675/11788 [1:30:32<09:10, 2.02it/s] 91%|█████████ | 10676/11788 [1:30:32<09:11, 2.02it/s] 91%|█████████ | 10677/11788 [1:30:33<09:09, 2.02it/s] 91%|█████████ | 10678/11788 [1:30:33<09:10, 2.02it/s] 91%|█████████ | 10679/11788 [1:30:34<09:08, 2.02it/s] 91%|█████████ | 10680/11788 [1:30:34<09:08, 2.02it/s] 91%|█████████ | 10681/11788 [1:30:35<09:07, 2.02it/s] 91%|█████████ | 10682/11788 [1:30:35<09:06, 2.02it/s] 91%|█████████ | 10683/11788 [1:30:36<09:06, 2.02it/s] 91%|█████████ | 10684/11788 [1:30:36<09:06, 2.02it/s] 91%|█████████ | 10685/11788 [1:30:37<09:04, 2.02it/s] 91%|█████████ | 10686/11788 [1:30:37<09:04, 2.02it/s] 91%|█████████ | 10687/11788 [1:30:38<09:04, 2.02it/s] 91%|█████████ | 10688/11788 [1:30:38<09:04, 2.02it/s] 91%|█████████ | 10689/11788 [1:30:39<09:03, 2.02it/s] 91%|█████████ | 10690/11788 [1:30:39<09:02, 2.02it/s] 91%|█████████ | 10691/11788 [1:30:40<09:02, 2.02it/s] 91%|█████████ | 10692/11788 [1:30:40<09:01, 2.02it/s] 91%|█████████ | 10693/11788 [1:30:41<09:01, 2.02it/s] 91%|█████████ | 10694/11788 [1:30:41<09:00, 2.02it/s] 91%|█████████ | 10695/11788 [1:30:41<09:00, 2.02it/s] 91%|█████████ | 10696/11788 [1:30:42<08:59, 2.03it/s] 91%|█████████ | 10697/11788 [1:30:42<08:58, 2.03it/s] 91%|█████████ | 10698/11788 [1:30:43<08:58, 2.03it/s] 91%|█████████ | 10699/11788 [1:30:43<08:58, 2.02it/s] 91%|█████████ | 10700/11788 [1:30:44<08:57, 2.03it/s]{'loss': 2.5611, 'grad_norm': 0.2538653314113617, 'learning_rate': 2.572697035362609e-05, 'epoch': 12.71} + 91%|█████████ | 10700/11788 [1:30:44<08:57, 2.03it/s] 91%|█████████ | 10701/11788 [1:30:44<08:57, 2.02it/s] 91%|█████████ | 10702/11788 [1:30:45<08:56, 2.02it/s] 91%|█████████ | 10703/11788 [1:30:45<08:56, 2.02it/s] 91%|█████████ | 10704/11788 [1:30:46<08:56, 2.02it/s] 91%|█████████ | 10705/11788 [1:30:46<08:55, 2.02it/s] 91%|█████████ | 10706/11788 [1:30:47<08:55, 2.02it/s] 91%|█████████ | 10707/11788 [1:30:47<08:54, 2.02it/s] 91%|█████████ | 10708/11788 [1:30:48<08:54, 2.02it/s] 91%|█████████ | 10709/11788 [1:30:48<08:53, 2.02it/s] 91%|█████████ | 10710/11788 [1:30:49<08:53, 2.02it/s] 91%|█████████ | 10711/11788 [1:30:49<08:53, 2.02it/s] 91%|█████████ | 10712/11788 [1:30:50<08:52, 2.02it/s] 91%|█████████ | 10713/11788 [1:30:50<08:52, 2.02it/s] 91%|█████████ | 10714/11788 [1:30:51<08:51, 2.02it/s] 91%|█████████ | 10715/11788 [1:30:51<08:51, 2.02it/s] 91%|█████████ | 10716/11788 [1:30:52<08:50, 2.02it/s] 91%|█████████ | 10717/11788 [1:30:52<08:49, 2.02it/s] 91%|█████████ | 10718/11788 [1:30:53<08:48, 2.02it/s] 91%|█████████ | 10719/11788 [1:30:53<08:49, 2.02it/s] 91%|████████�� | 10720/11788 [1:30:54<08:48, 2.02it/s] 91%|█████████ | 10721/11788 [1:30:54<08:47, 2.02it/s] 91%|█████████ | 10722/11788 [1:30:55<08:47, 2.02it/s] 91%|█████████ | 10723/11788 [1:30:55<08:46, 2.02it/s] 91%|█████████ | 10724/11788 [1:30:56<08:46, 2.02it/s] 91%|█████████ | 10725/11788 [1:30:56<08:45, 2.02it/s]{'loss': 2.5617, 'grad_norm': 0.25805187225341797, 'learning_rate': 2.456791691051491e-05, 'epoch': 12.74} + 91%|█████████ | 10725/11788 [1:30:56<08:45, 2.02it/s] 91%|█████████ | 10726/11788 [1:30:57<08:45, 2.02it/s] 91%|█████████ | 10727/11788 [1:30:57<08:45, 2.02it/s] 91%|█████████ | 10728/11788 [1:30:58<08:43, 2.02it/s] 91%|█████████ | 10729/11788 [1:30:58<08:43, 2.02it/s] 91%|█████████ | 10730/11788 [1:30:59<08:43, 2.02it/s] 91%|█████████ | 10731/11788 [1:30:59<08:42, 2.02it/s] 91%|█████████ | 10732/11788 [1:31:00<08:41, 2.02it/s] 91%|█████████ | 10733/11788 [1:31:00<08:41, 2.02it/s] 91%|█████████ | 10734/11788 [1:31:01<08:40, 2.02it/s] 91%|█████████ | 10735/11788 [1:31:01<08:40, 2.02it/s] 91%|█████████ | 10736/11788 [1:31:02<08:39, 2.02it/s] 91%|█████████ | 10737/11788 [1:31:02<08:39, 2.02it/s] 91%|█████████ | 10738/11788 [1:31:03<08:39, 2.02it/s] 91%|█████████ | 10739/11788 [1:31:03<08:39, 2.02it/s] 91%|█████████ | 10740/11788 [1:31:04<08:37, 2.02it/s] 91%|█████████ | 10741/11788 [1:31:04<08:37, 2.02it/s] 91%|█████████ | 10742/11788 [1:31:05<08:36, 2.02it/s] 91%|█████████ | 10743/11788 [1:31:05<08:35, 2.03it/s] 91%|█████████ | 10744/11788 [1:31:06<08:35, 2.02it/s] 91%|█████████ | 10745/11788 [1:31:06<08:34, 2.03it/s] 91%|█████████ | 10746/11788 [1:31:07<08:35, 2.02it/s] 91%|█████████ | 10747/11788 [1:31:07<08:34, 2.02it/s] 91%|█████████ | 10748/11788 [1:31:08<08:34, 2.02it/s] 91%|█████████ | 10749/11788 [1:31:08<08:33, 2.02it/s] 91%|█████████ | 10750/11788 [1:31:09<08:33, 2.02it/s]{'loss': 2.5859, 'grad_norm': 0.25547170639038086, 'learning_rate': 2.343492004458947e-05, 'epoch': 12.77} + 91%|█████████ | 10750/11788 [1:31:09<08:33, 2.02it/s] 91%|█████████ | 10751/11788 [1:31:09<08:32, 2.02it/s] 91%|█████████ | 10752/11788 [1:31:10<08:33, 2.02it/s] 91%|█████████ | 10753/11788 [1:31:10<08:32, 2.02it/s] 91%|█████████ | 10754/11788 [1:31:11<08:32, 2.02it/s] 91%|█████████ | 10755/11788 [1:31:11<08:30, 2.02it/s] 91%|█████████ | 10756/11788 [1:31:12<08:30, 2.02it/s] 91%|█████████▏| 10757/11788 [1:31:12<08:29, 2.02it/s] 91%|█████████▏| 10758/11788 [1:31:13<08:29, 2.02it/s] 91%|█████████▏| 10759/11788 [1:31:13<08:28, 2.02it/s] 91%|█████████▏| 10760/11788 [1:31:14<08:27, 2.03it/s] 91%|█████████▏| 10761/11788 [1:31:14<08:26, 2.03it/s] 91%|█████████▏| 10762/11788 [1:31:15<08:26, 2.03it/s] 91%|█████████▏| 10763/11788 [1:31:15<08:26, 2.02it/s] 91%|█████████▏| 10764/11788 [1:31:16<08:25, 2.02it/s] 91%|█████████▏| 10765/11788 [1:31:16<08:25, 2.02it/s] 91%|█████████▏| 10766/11788 [1:31:17<08:24, 2.03it/s] 91%|█████████▏| 10767/11788 [1:31:17<08:24, 2.03it/s] 91%|█████████▏| 10768/11788 [1:31:18<08:23, 2.02it/s] 91%|█████████▏| 10769/11788 [1:31:18<08:22, 2.03it/s] 91%|█████████▏| 10770/11788 [1:31:19<08:22, 2.03it/s] 91%|█████████▏| 10771/11788 [1:31:19<08:22, 2.03it/s] 91%|█████████▏| 10772/11788 [1:31:20<08:21, 2.02it/s] 91%|█████████▏| 10773/11788 [1:31:20<08:20, 2.03it/s] 91%|█████████▏| 10774/11788 [1:31:21<08:21, 2.02it/s] 91%|█████████▏| 10775/11788 [1:31:21<08:19, 2.03it/s]{'loss': 2.5727, 'grad_norm': 0.2553807497024536, 'learning_rate': 2.2328041850986524e-05, 'epoch': 12.8} + 91%|█████████▏| 10775/11788 [1:31:21<08:19, 2.03it/s] 91%|█████████▏| 10776/11788 [1:31:22<08:20, 2.02it/s] 91%|█████████▏| 10777/11788 [1:31:22<08:19, 2.02it/s] 91%|█████████▏| 10778/11788 [1:31:23<08:19, 2.02it/s] 91%|█████████▏| 10779/11788 [1:31:23<08:19, 2.02it/s] 91%|█████████▏| 10780/11788 [1:31:24<08:18, 2.02it/s] 91%|█████████▏| 10781/11788 [1:31:24<08:17, 2.02it/s] 91%|█████████▏| 10782/11788 [1:31:25<08:16, 2.02it/s] 91%|█████████▏| 10783/11788 [1:31:25<08:16, 2.03it/s] 91%|█████████▏| 10784/11788 [1:31:25<08:15, 2.03it/s] 91%|█████████▏| 10785/11788 [1:31:26<08:15, 2.02it/s] 91%|█████████▏| 10786/11788 [1:31:26<08:14, 2.02it/s] 92%|█████████▏| 10787/11788 [1:31:27<08:14, 2.02it/s] 92%|█████████▏| 10788/11788 [1:31:27<08:14, 2.02it/s] 92%|█████████▏| 10789/11788 [1:31:28<08:13, 2.02it/s] 92%|█████████▏| 10790/11788 [1:31:28<08:12, 2.02it/s] 92%|█████████▏| 10791/11788 [1:31:29<08:12, 2.02it/s] 92%|█████████▏| 10792/11788 [1:31:29<08:11, 2.03it/s] 92%|█████████▏| 10793/11788 [1:31:30<08:12, 2.02it/s] 92%|█████████▏| 10794/11788 [1:31:30<08:11, 2.02it/s] 92%|█████████▏| 10795/11788 [1:31:31<08:10, 2.02it/s] 92%|█████████▏| 10796/11788 [1:31:31<08:09, 2.03it/s] 92%|█████████▏| 10797/11788 [1:31:32<08:08, 2.03it/s] 92%|█████████▏| 10798/11788 [1:31:32<08:08, 2.03it/s] 92%|█████████▏| 10799/11788 [1:31:33<08:08, 2.02it/s] 92%|█████████▏| 10800/11788 [1:31:33<08:08, 2.02it/s] {'loss': 2.5572, 'grad_norm': 0.25495895743370056, 'learning_rate': 2.1247342993380437e-05, 'epoch': 12.83} + 92%|█████████▏| 10800/11788 [1:31:33<08:08, 2.02it/s] 92%|█████████▏| 10801/11788 [1:31:34<08:08, 2.02it/s] 92%|█████████▏| 10802/11788 [1:31:34<08:07, 2.02it/s] 92%|█████████▏| 10803/11788 [1:31:35<08:06, 2.02it/s] 92%|█████████▏| 10804/11788 [1:31:35<08:06, 2.02it/s] 92%|█████████▏| 10805/11788 [1:31:36<08:05, 2.02it/s] 92%|█████████▏| 10806/11788 [1:31:36<08:04, 2.03it/s] 92%|█████████▏| 10807/11788 [1:31:37<08:04, 2.02it/s] 92%|█████████▏| 10808/11788 [1:31:37<08:03, 2.03it/s] 92%|█████████▏| 10809/11788 [1:31:38<08:45, 1.86it/s] 92%|█████████▏| 10810/11788 [1:31:38<08:31, 1.91it/s] 92%|█████████▏| 10811/11788 [1:31:39<08:22, 1.94it/s] 92%|█████████▏| 10812/11788 [1:31:39<08:16, 1.97it/s] 92%|█████████▏| 10813/11788 [1:31:40<08:11, 1.98it/s] 92%|█████████▏| 10814/11788 [1:31:40<08:08, 1.99it/s] 92%|█████████▏| 10815/11788 [1:31:41<08:06, 2.00it/s] 92%|█████████▏| 10816/11788 [1:31:41<08:04, 2.01it/s] 92%|█████████▏| 10817/11788 [1:31:42<08:02, 2.01it/s] 92%|█████████▏| 10818/11788 [1:31:42<08:01, 2.01it/s] 92%|█████████▏| 10819/11788 [1:31:43<08:00, 2.02it/s] 92%|█████████▏| 10820/11788 [1:31:43<07:59, 2.02it/s] 92%|█████████▏| 10821/11788 [1:31:44<07:58, 2.02it/s] 92%|█████████▏| 10822/11788 [1:31:44<07:58, 2.02it/s] 92%|█████████▏| 10823/11788 [1:31:45<07:57, 2.02it/s] 92%|█████████▏| 10824/11788 [1:31:45<07:56, 2.02it/s] 92%|█████████▏| 10825/11788 [1:31:46<07:56, 2.02it/s]{'loss': 2.5685, 'grad_norm': 0.25548508763313293, 'learning_rate': 2.0192882700658612e-05, 'epoch': 12.86} + 92%|█████████▏| 10825/11788 [1:31:46<07:56, 2.02it/s] 92%|█████████▏| 10826/11788 [1:31:46<07:56, 2.02it/s] 92%|█████████▏| 10827/11788 [1:31:47<07:55, 2.02it/s] 92%|█████████▏| 10828/11788 [1:31:47<07:55, 2.02it/s] 92%|█████████▏| 10829/11788 [1:31:48<07:54, 2.02it/s] 92%|█████████▏| 10830/11788 [1:31:48<07:54, 2.02it/s] 92%|█████████▏| 10831/11788 [1:31:49<07:53, 2.02it/s] 92%|█████████▏| 10832/11788 [1:31:49<07:53, 2.02it/s] 92%|█████████▏| 10833/11788 [1:31:50<07:51, 2.02it/s] 92%|█████████▏| 10834/11788 [1:31:50<07:51, 2.02it/s] 92%|█████████▏| 10835/11788 [1:31:51<07:51, 2.02it/s] 92%|█████████▏| 10836/11788 [1:31:51<07:50, 2.02it/s] 92%|█████████▏| 10837/11788 [1:31:52<07:50, 2.02it/s] 92%|█████████▏| 10838/11788 [1:31:52<07:49, 2.02it/s] 92%|█████████▏| 10839/11788 [1:31:53<07:48, 2.02it/s] 92%|█████████▏| 10840/11788 [1:31:53<07:48, 2.02it/s] 92%|█████████▏| 10841/11788 [1:31:54<07:48, 2.02it/s] 92%|█████████▏| 10842/11788 [1:31:54<07:48, 2.02it/s] 92%|█████████▏| 10843/11788 [1:31:55<07:47, 2.02it/s] 92%|█████████▏| 10844/11788 [1:31:55<07:47, 2.02it/s] 92%|█████████▏| 10845/11788 [1:31:56<07:46, 2.02it/s] 92%|█████████▏| 10846/11788 [1:31:56<07:45, 2.02it/s] 92%|█████████▏| 10847/11788 [1:31:57<07:44, 2.02it/s] 92%|█████████▏| 10848/11788 [1:31:57<07:44, 2.02it/s] 92%|█████████▏| 10849/11788 [1:31:58<07:44, 2.02it/s] 92%|█████████▏| 10850/11788 [1:31:58<07:44, 2.02it/s]{'loss': 2.5678, 'grad_norm': 0.25530749559402466, 'learning_rate': 1.9164718763674648e-05, 'epoch': 12.89} + 92%|█████████▏| 10850/11788 [1:31:58<07:44, 2.02it/s] 92%|█████████▏| 10851/11788 [1:31:59<07:43, 2.02it/s] 92%|█████████▏| 10852/11788 [1:31:59<07:42, 2.02it/s] 92%|█████████▏| 10853/11788 [1:32:00<07:41, 2.02it/s] 92%|█████████▏| 10854/11788 [1:32:00<07:41, 2.02it/s] 92%|█████████▏| 10855/11788 [1:32:01<07:41, 2.02it/s] 92%|█████████▏| 10856/11788 [1:32:01<07:40, 2.02it/s] 92%|█████████▏| 10857/11788 [1:32:02<07:39, 2.02it/s] 92%|█████████▏| 10858/11788 [1:32:02<07:39, 2.02it/s] 92%|█████████▏| 10859/11788 [1:32:03<07:38, 2.02it/s] 92%|█████████▏| 10860/11788 [1:32:03<07:38, 2.02it/s] 92%|█████████▏| 10861/11788 [1:32:04<07:38, 2.02it/s] 92%|█████████▏| 10862/11788 [1:32:04<07:37, 2.02it/s] 92%|█████████▏| 10863/11788 [1:32:05<07:36, 2.02it/s] 92%|█████████▏| 10864/11788 [1:32:05<07:36, 2.02it/s] 92%|█████████▏| 10865/11788 [1:32:06<07:35, 2.02it/s] 92%|█████████▏| 10866/11788 [1:32:06<07:35, 2.03it/s] 92%|█████████▏| 10867/11788 [1:32:07<07:34, 2.03it/s] 92%|█████████▏| 10868/11788 [1:32:07<07:34, 2.03it/s] 92%|█████████▏| 10869/11788 [1:32:08<07:33, 2.02it/s] 92%|█████████▏| 10870/11788 [1:32:08<07:33, 2.03it/s] 92%|█████████▏| 10871/11788 [1:32:09<07:33, 2.02it/s] 92%|█████████▏| 10872/11788 [1:32:09<07:32, 2.03it/s] 92%|█████████▏| 10873/11788 [1:32:10<07:32, 2.02it/s] 92%|█████████▏| 10874/11788 [1:32:10<07:31, 2.02it/s] 92%|█████████▏| 10875/11788 [1:32:11<07:31, 2.02it/s]{'loss': 2.5737, 'grad_norm': 0.2549803555011749, 'learning_rate': 1.8162907532081717e-05, 'epoch': 12.92} + 92%|█████████▏| 10875/11788 [1:32:11<07:31, 2.02it/s] 92%|█████████▏| 10876/11788 [1:32:11<07:31, 2.02it/s] 92%|█████████▏| 10877/11788 [1:32:12<07:31, 2.02it/s] 92%|█████████▏| 10878/11788 [1:32:12<07:29, 2.02it/s] 92%|█████████▏| 10879/11788 [1:32:13<07:29, 2.02it/s] 92%|█████████▏| 10880/11788 [1:32:13<07:28, 2.02it/s] 92%|█████████▏| 10881/11788 [1:32:14<07:28, 2.02it/s] 92%|█████████▏| 10882/11788 [1:32:14<07:27, 2.02it/s] 92%|█████████▏| 10883/11788 [1:32:15<07:27, 2.02it/s] 92%|█████████▏| 10884/11788 [1:32:15<07:26, 2.02it/s] 92%|█████████▏| 10885/11788 [1:32:16<07:26, 2.02it/s] 92%|█████████▏| 10886/11788 [1:32:16<07:26, 2.02it/s] 92%|█████████▏| 10887/11788 [1:32:17<07:25, 2.02it/s] 92%|█████████▏| 10888/11788 [1:32:17<07:24, 2.02it/s] 92%|█████████▏| 10889/11788 [1:32:18<07:24, 2.02it/s] 92%|█████████▏| 10890/11788 [1:32:18<07:24, 2.02it/s] 92%|█████████▏| 10891/11788 [1:32:19<07:24, 2.02it/s] 92%|█████████▏| 10892/11788 [1:32:19<07:23, 2.02it/s] 92%|█████████▏| 10893/11788 [1:32:20<07:23, 2.02it/s] 92%|█████████▏| 10894/11788 [1:32:20<07:22, 2.02it/s] 92%|█████████▏| 10895/11788 [1:32:21<07:21, 2.02it/s] 92%|█████████▏| 10896/11788 [1:32:21<07:58, 1.86it/s] 92%|█████████▏| 10897/11788 [1:32:22<07:47, 1.91it/s] 92%|█████████▏| 10898/11788 [1:32:22<07:38, 1.94it/s] 92%|█████████▏| 10899/11788 [1:32:23<07:32, 1.97it/s] 92%|█████████▏| 10900/11788 [1:32:23<07:28, 1.98it/s]{'loss': 2.5668, 'grad_norm': 0.2555125951766968, 'learning_rate': 1.7187503911244183e-05, 'epoch': 12.95} + 92%|█████████▏| 10900/11788 [1:32:23<07:28, 1.98it/s] 92%|█████████▏| 10901/11788 [1:32:24<07:24, 1.99it/s] 92%|█████████▏| 10902/11788 [1:32:24<07:22, 2.00it/s] 92%|█████████▏| 10903/11788 [1:32:25<07:21, 2.01it/s] 93%|█████████▎| 10904/11788 [1:32:25<07:19, 2.01it/s] 93%|█████████▎| 10905/11788 [1:32:26<07:17, 2.02it/s] 93%|█████████▎| 10906/11788 [1:32:26<07:17, 2.02it/s] 93%|█████████▎| 10907/11788 [1:32:27<07:16, 2.02it/s] 93%|█████████▎| 10908/11788 [1:32:27<07:15, 2.02it/s] 93%|█████████▎| 10909/11788 [1:32:28<07:14, 2.02it/s] 93%|█████████▎| 10910/11788 [1:32:28<07:14, 2.02it/s] 93%|█████████▎| 10911/11788 [1:32:29<07:13, 2.02it/s] 93%|█████████▎| 10912/11788 [1:32:29<07:13, 2.02it/s] 93%|█████████▎| 10913/11788 [1:32:30<07:12, 2.02it/s] 93%|█████████▎| 10914/11788 [1:32:30<07:12, 2.02it/s] 93%|█████████▎| 10915/11788 [1:32:31<07:11, 2.02it/s] 93%|█████████▎| 10916/11788 [1:32:31<07:10, 2.02it/s] 93%|█████████▎| 10917/11788 [1:32:32<07:10, 2.02it/s] 93%|█████████▎| 10918/11788 [1:32:32<07:10, 2.02it/s] 93%|█████████▎| 10919/11788 [1:32:33<07:09, 2.02it/s] 93%|█████████▎| 10920/11788 [1:32:33<07:09, 2.02it/s] 93%|█████████▎| 10921/11788 [1:32:34<07:08, 2.02it/s] 93%|█████████▎| 10922/11788 [1:32:34<07:08, 2.02it/s] 93%|█████████▎| 10923/11788 [1:32:34<07:07, 2.02it/s] 93%|█████████▎| 10924/11788 [1:32:35<07:06, 2.02it/s] 93%|█████████▎| 10925/11788 [1:32:35<07:06, 2.02it/s]{'loss': 2.5665, 'grad_norm': 0.251896470785141, 'learning_rate': 1.6238561359228022e-05, 'epoch': 12.98} + 93%|█████████▎| 10925/11788 [1:32:35<07:06, 2.02it/s] 93%|█████████▎| 10926/11788 [1:32:36<07:08, 2.01it/s] 93%|█████████▎| 10927/11788 [1:32:36<07:06, 2.02it/s] 93%|█████████▎| 10928/11788 [1:32:37<07:06, 2.02it/s] 93%|█████████▎| 10929/11788 [1:32:37<07:05, 2.02it/s] 93%|█████████▎| 10930/11788 [1:32:38<07:04, 2.02it/s] 93%|█████████▎| 10931/11788 [1:32:38<07:03, 2.02it/s] 93%|█████████▎| 10932/11788 [1:32:39<07:03, 2.02it/s] 93%|█████████▎| 10933/11788 [1:32:39<07:02, 2.02it/s] 93%|█████████▎| 10934/11788 [1:32:40<07:02, 2.02it/s] 93%|█████████▎| 10935/11788 [1:32:40<07:01, 2.02it/s] 93%|█████████▎| 10936/11788 [1:32:41<07:01, 2.02it/s] 93%|█████████▎| 10937/11788 [1:32:41<07:00, 2.02it/s] 93%|█████████▎| 10938/11788 [1:32:42<07:00, 2.02it/s] 93%|█████████▎| 10939/11788 [1:32:42<07:00, 2.02it/s] 93%|█████████▎| 10940/11788 [1:32:43<06:59, 2.02it/s] 93%|█████████▎| 10941/11788 [1:32:43<06:59, 2.02it/s] 93%|█████████▎| 10942/11788 [1:32:44<06:58, 2.02it/s] 93%|█████████▎| 10943/11788 [1:32:44<06:58, 2.02it/s] 93%|█████████▎| 10944/11788 [1:32:45<06:57, 2.02it/s] 93%|█████████▎| 10945/11788 [1:32:45<06:57, 2.02it/s] 93%|█████████▎| 10946/11788 [1:32:46<06:53, 2.04it/s] 93%|█████████▎| 10947/11788 [1:32:58<55:00, 3.92s/it] 93%|█████████▎| 10948/11788 [1:32:58<40:32, 2.90s/it] 93%|█████████▎| 10949/11788 [1:32:59<30:24, 2.17s/it] 93%|█████████▎| 10950/11788 [1:32:59<23:21, 1.67s/it]{'loss': 2.5569, 'grad_norm': 0.2526967525482178, 'learning_rate': 1.5316131883871432e-05, 'epoch': 13.0} + 93%|█████████▎| 10950/11788 [1:32:59<23:21, 1.67s/it] 93%|█████████▎| 10951/11788 [1:33:00<18:24, 1.32s/it] 93%|█████████▎| 10952/11788 [1:33:00<14:56, 1.07s/it] 93%|█████████▎| 10953/11788 [1:33:01<12:31, 1.11it/s] 93%|█████████▎| 10954/11788 [1:33:01<10:48, 1.29it/s] 93%|█████████▎| 10955/11788 [1:33:02<09:37, 1.44it/s] 93%|█████████▎| 10956/11788 [1:33:02<08:47, 1.58it/s] 93%|█████████▎| 10957/11788 [1:33:03<08:12, 1.69it/s] 93%|█████████▎| 10958/11788 [1:33:03<07:46, 1.78it/s] 93%|███���█████▎| 10959/11788 [1:33:04<07:28, 1.85it/s] 93%|█████████▎| 10960/11788 [1:33:04<07:16, 1.90it/s] 93%|█████████▎| 10961/11788 [1:33:05<07:07, 1.93it/s] 93%|█████████▎| 10962/11788 [1:33:05<07:02, 1.96it/s] 93%|█████████▎| 10963/11788 [1:33:06<06:58, 1.97it/s] 93%|█████████▎| 10964/11788 [1:33:06<06:54, 1.99it/s] 93%|█████████▎| 10965/11788 [1:33:07<06:51, 2.00it/s] 93%|█████████▎| 10966/11788 [1:33:07<06:49, 2.01it/s] 93%|█████████▎| 10967/11788 [1:33:08<06:47, 2.01it/s] 93%|█████████▎| 10968/11788 [1:33:08<06:46, 2.02it/s] 93%|█████████▎| 10969/11788 [1:33:09<06:45, 2.02it/s] 93%|█████████▎| 10970/11788 [1:33:09<06:44, 2.02it/s] 93%|█████████▎| 10971/11788 [1:33:10<06:44, 2.02it/s] 93%|█████████▎| 10972/11788 [1:33:10<06:43, 2.02it/s] 93%|█████████▎| 10973/11788 [1:33:11<06:42, 2.02it/s] 93%|█████████▎| 10974/11788 [1:33:11<06:42, 2.02it/s] 93%|█████████▎| 10975/11788 [1:33:12<06:42, 2.02it/s]{'loss': 2.5423, 'grad_norm': 0.2564471960067749, 'learning_rate': 1.442026603993435e-05, 'epoch': 13.03} + 93%|█████████▎| 10975/11788 [1:33:12<06:42, 2.02it/s] 93%|█████████▎| 10976/11788 [1:33:12<06:41, 2.02it/s] 93%|█████████▎| 10977/11788 [1:33:13<06:41, 2.02it/s] 93%|█████████▎| 10978/11788 [1:33:13<06:40, 2.02it/s] 93%|█████████▎| 10979/11788 [1:33:14<06:40, 2.02it/s] 93%|█████████▎| 10980/11788 [1:33:14<06:39, 2.02it/s] 93%|█████████▎| 10981/11788 [1:33:15<06:38, 2.03it/s] 93%|█████████▎| 10982/11788 [1:33:15<06:37, 2.03it/s] 93%|█████████▎| 10983/11788 [1:33:16<06:37, 2.03it/s] 93%|█████████▎| 10984/11788 [1:33:16<06:37, 2.02it/s] 93%|█████████▎| 10985/11788 [1:33:17<06:36, 2.02it/s] 93%|█████████▎| 10986/11788 [1:33:17<06:36, 2.02it/s] 93%|█████████▎| 10987/11788 [1:33:18<06:35, 2.03it/s] 93%|█████████▎| 10988/11788 [1:33:18<06:35, 2.02it/s] 93%|█████████▎| 10989/11788 [1:33:19<06:34, 2.02it/s] 93%|█████████▎| 10990/11788 [1:33:19<06:34, 2.02it/s] 93%|█████████▎| 10991/11788 [1:33:20<06:33, 2.02it/s] 93%|█████████▎| 10992/11788 [1:33:20<06:33, 2.02it/s] 93%|█████████▎| 10993/11788 [1:33:21<06:32, 2.02it/s] 93%|█████████▎| 10994/11788 [1:33:21<06:32, 2.02it/s] 93%|█████████▎| 10995/11788 [1:33:22<06:31, 2.03it/s] 93%|█████████▎| 10996/11788 [1:33:22<06:31, 2.02it/s] 93%|█████████▎| 10997/11788 [1:33:23<06:30, 2.03it/s] 93%|█████████▎| 10998/11788 [1:33:23<06:30, 2.03it/s] 93%|█████████▎| 10999/11788 [1:33:24<06:29, 2.02it/s] 93%|█████████▎| 11000/11788 [1:33:24<06:29, 2.02it/s]{'loss': 2.5509, 'grad_norm': 0.2568507790565491, 'learning_rate': 1.3551012926327822e-05, 'epoch': 13.06} + 93%|█████████▎| 11000/11788 [1:33:24<06:29, 2.02it/s] 93%|█████████▎| 11001/11788 [1:33:25<06:30, 2.02it/s] 93%|█████████▎| 11002/11788 [1:33:25<06:30, 2.01it/s] 93%|█████████▎| 11003/11788 [1:33:25<06:29, 2.02it/s] 93%|█████████▎| 11004/11788 [1:33:26<06:28, 2.02it/s] 93%|█████████▎| 11005/11788 [1:33:26<06:27, 2.02it/s] 93%|█████████▎| 11006/11788 [1:33:27<06:26, 2.02it/s] 93%|█████████▎| 11007/11788 [1:33:27<06:26, 2.02it/s] 93%|█████████▎| 11008/11788 [1:33:28<06:25, 2.02it/s] 93%|█████████▎| 11009/11788 [1:33:28<06:25, 2.02it/s] 93%|█████████▎| 11010/11788 [1:33:29<06:24, 2.02it/s] 93%|█████████▎| 11011/11788 [1:33:29<06:24, 2.02it/s] 93%|█████████▎| 11012/11788 [1:33:30<06:23, 2.02it/s] 93%|█████████▎| 11013/11788 [1:33:30<06:23, 2.02it/s] 93%|█████████▎| 11014/11788 [1:33:31<06:22, 2.02it/s] 93%|█████████▎| 11015/11788 [1:33:31<06:22, 2.02it/s] 93%|█████████▎| 11016/11788 [1:33:32<06:21, 2.02it/s] 93%|█████████▎| 11017/11788 [1:33:32<06:21, 2.02it/s] 93%|█████████▎| 11018/11788 [1:33:33<06:20, 2.02it/s] 93%|█████████▎| 11019/11788 [1:33:33<06:20, 2.02it/s] 93%|████���████▎| 11020/11788 [1:33:34<06:19, 2.02it/s] 93%|█████████▎| 11021/11788 [1:33:34<06:19, 2.02it/s] 94%|█████████▎| 11022/11788 [1:33:35<06:18, 2.02it/s] 94%|█████████▎| 11023/11788 [1:33:35<06:18, 2.02it/s] 94%|█████████▎| 11024/11788 [1:33:36<06:17, 2.02it/s] 94%|█████████▎| 11025/11788 [1:33:36<06:17, 2.02it/s]{'loss': 2.5285, 'grad_norm': 0.25689539313316345, 'learning_rate': 1.2708420183422776e-05, 'epoch': 13.09} + 94%|█████████▎| 11025/11788 [1:33:36<06:17, 2.02it/s] 94%|█████████▎| 11026/11788 [1:33:37<06:17, 2.02it/s] 94%|█████████▎| 11027/11788 [1:33:37<06:16, 2.02it/s] 94%|█████████▎| 11028/11788 [1:33:38<06:16, 2.02it/s] 94%|█████████▎| 11029/11788 [1:33:38<06:15, 2.02it/s] 94%|█████████▎| 11030/11788 [1:33:39<06:15, 2.02it/s] 94%|█████████▎| 11031/11788 [1:33:39<06:14, 2.02it/s] 94%|█████████▎| 11032/11788 [1:33:40<06:13, 2.02it/s] 94%|█████████▎| 11033/11788 [1:33:40<06:13, 2.02it/s] 94%|█████████▎| 11034/11788 [1:33:41<06:12, 2.02it/s] 94%|█████████▎| 11035/11788 [1:33:41<06:12, 2.02it/s] 94%|█████████▎| 11036/11788 [1:33:42<06:11, 2.02it/s] 94%|█████████▎| 11037/11788 [1:33:42<06:11, 2.02it/s] 94%|█████████▎| 11038/11788 [1:33:43<06:10, 2.02it/s] 94%|█████████▎| 11039/11788 [1:33:43<06:10, 2.02it/s] 94%|█████████▎| 11040/11788 [1:33:44<06:09, 2.02it/s] 94%|█████████▎| 11041/11788 [1:33:44<06:08, 2.03it/s] 94%|█████████▎| 11042/11788 [1:33:45<06:08, 2.02it/s] 94%|█████████▎| 11043/11788 [1:33:45<06:07, 2.03it/s] 94%|█████████▎| 11044/11788 [1:33:46<06:07, 2.02it/s] 94%|█████████▎| 11045/11788 [1:33:46<06:07, 2.02it/s] 94%|█████████▎| 11046/11788 [1:33:47<06:07, 2.02it/s] 94%|█████████▎| 11047/11788 [1:33:47<06:06, 2.02it/s] 94%|█████████▎| 11048/11788 [1:33:48<06:06, 2.02it/s] 94%|█████████▎| 11049/11788 [1:33:48<06:05, 2.02it/s] 94%|█████████▎| 11050/11788 [1:33:49<06:05, 2.02it/s]{'loss': 2.5429, 'grad_norm': 0.25668925046920776, 'learning_rate': 1.1892533990439491e-05, 'epoch': 13.12} + 94%|█████████▎| 11050/11788 [1:33:49<06:05, 2.02it/s] 94%|█████████▎| 11051/11788 [1:33:49<06:04, 2.02it/s] 94%|█████████▍| 11052/11788 [1:33:50<06:04, 2.02it/s] 94%|█████████▍| 11053/11788 [1:33:50<06:03, 2.02it/s] 94%|█████████▍| 11054/11788 [1:33:51<06:03, 2.02it/s] 94%|█████████▍| 11055/11788 [1:33:51<06:02, 2.02it/s] 94%|█████████▍| 11056/11788 [1:33:52<06:02, 2.02it/s] 94%|█████████▍| 11057/11788 [1:33:52<06:01, 2.02it/s] 94%|█████████▍| 11058/11788 [1:33:53<06:01, 2.02it/s] 94%|█████████▍| 11059/11788 [1:33:53<06:00, 2.02it/s] 94%|█████████▍| 11060/11788 [1:33:54<06:00, 2.02it/s] 94%|█████████▍| 11061/11788 [1:33:54<05:59, 2.02it/s] 94%|█████████▍| 11062/11788 [1:33:55<05:58, 2.02it/s] 94%|█████████▍| 11063/11788 [1:33:55<05:58, 2.02it/s] 94%|█████████▍| 11064/11788 [1:33:56<05:58, 2.02it/s] 94%|█████████▍| 11065/11788 [1:33:56<05:57, 2.02it/s] 94%|█████████▍| 11066/11788 [1:33:57<05:57, 2.02it/s] 94%|█████████▍| 11067/11788 [1:33:57<05:55, 2.03it/s] 94%|█████████▍| 11068/11788 [1:33:58<05:55, 2.03it/s] 94%|█████████▍| 11069/11788 [1:33:58<05:54, 2.03it/s] 94%|█████████▍| 11070/11788 [1:33:59<05:54, 2.02it/s] 94%|█████████▍| 11071/11788 [1:33:59<05:54, 2.02it/s] 94%|█████████▍| 11072/11788 [1:34:00<05:54, 2.02it/s] 94%|█████████▍| 11073/11788 [1:34:00<05:53, 2.02it/s] 94%|█████████▍| 11074/11788 [1:34:01<05:52, 2.02it/s] 94%|█████████▍| 11075/11788 [1:34:01<05:52, 2.02it/s]{'loss': 2.554, 'grad_norm': 0.25547388195991516, 'learning_rate': 1.1103399062916298e-05, 'epoch': 13.15} + 94%|█████████▍| 11075/11788 [1:34:01<05:52, 2.02it/s] 94%|█████████▍| 11076/11788 [1:34:02<05:52, 2.02it/s] 94%|█████████▍| 11077/11788 [1:34:02<05:52, 2.02it/s] 94%|█████████▍| 11078/11788 [1:34:03<05:51, 2.02it/s] 94%|█████████▍| 11079/11788 [1:34:03<05:51, 2.02it/s] 94%|█████████▍| 11080/11788 [1:34:04<05:51, 2.02it/s] 94%|█████████▍| 11081/11788 [1:34:04<05:50, 2.02it/s] 94%|█████████▍| 11082/11788 [1:34:05<05:49, 2.02it/s] 94%|█████████▍| 11083/11788 [1:34:05<05:48, 2.02it/s] 94%|█████████▍| 11084/11788 [1:34:06<05:48, 2.02it/s] 94%|█████████▍| 11085/11788 [1:34:06<05:47, 2.02it/s] 94%|█████████▍| 11086/11788 [1:34:07<05:47, 2.02it/s] 94%|█████████▍| 11087/11788 [1:34:07<05:46, 2.02it/s] 94%|█████████▍| 11088/11788 [1:34:08<05:46, 2.02it/s] 94%|█████████▍| 11089/11788 [1:34:08<05:45, 2.02it/s] 94%|█████████▍| 11090/11788 [1:34:09<05:45, 2.02it/s] 94%|█████████▍| 11091/11788 [1:34:09<05:45, 2.02it/s] 94%|█████████▍| 11092/11788 [1:34:10<05:44, 2.02it/s] 94%|█████████▍| 11093/11788 [1:34:10<05:44, 2.02it/s] 94%|█████████▍| 11094/11788 [1:34:11<05:43, 2.02it/s] 94%|█████████▍| 11095/11788 [1:34:11<05:42, 2.02it/s] 94%|█████████▍| 11096/11788 [1:34:11<05:42, 2.02it/s] 94%|█████████▍| 11097/11788 [1:34:12<05:41, 2.02it/s] 94%|█████████▍| 11098/11788 [1:34:12<05:41, 2.02it/s] 94%|█████████▍| 11099/11788 [1:34:13<05:40, 2.02it/s] 94%|█████████▍| 11100/11788 [1:34:13<05:40, 2.02it/s]{'loss': 2.5411, 'grad_norm': 0.2547819912433624, 'learning_rate': 1.0341058650259083e-05, 'epoch': 13.18} + 94%|█████████▍| 11100/11788 [1:34:13<05:40, 2.02it/s] 94%|█████████▍| 11101/11788 [1:34:14<05:39, 2.02it/s] 94%|█████████▍| 11102/11788 [1:34:14<05:39, 2.02it/s] 94%|█████████▍| 11103/11788 [1:34:15<05:38, 2.02it/s] 94%|█████████▍| 11104/11788 [1:34:15<05:38, 2.02it/s] 94%|█████████▍| 11105/11788 [1:34:16<05:37, 2.02it/s] 94%|█████████▍| 11106/11788 [1:34:16<05:37, 2.02it/s] 94%|█████████▍| 11107/11788 [1:34:17<05:36, 2.02it/s] 94%|█████████▍| 11108/11788 [1:34:17<05:36, 2.02it/s] 94%|█████████▍| 11109/11788 [1:34:18<05:35, 2.02it/s] 94%|█████████▍| 11110/11788 [1:34:18<05:35, 2.02it/s] 94%|█████████▍| 11111/11788 [1:34:19<05:34, 2.02it/s] 94%|█████████▍| 11112/11788 [1:34:19<05:34, 2.02it/s] 94%|█████████▍| 11113/11788 [1:34:20<05:33, 2.02it/s] 94%|█████████▍| 11114/11788 [1:34:20<05:33, 2.02it/s] 94%|█████████▍| 11115/11788 [1:34:21<05:32, 2.02it/s] 94%|█████████▍| 11116/11788 [1:34:21<05:32, 2.02it/s] 94%|█████████▍| 11117/11788 [1:34:22<05:31, 2.02it/s] 94%|█████████▍| 11118/11788 [1:34:22<05:31, 2.02it/s] 94%|█████████▍| 11119/11788 [1:34:23<05:31, 2.02it/s] 94%|█████████▍| 11120/11788 [1:34:23<05:30, 2.02it/s] 94%|█████████▍| 11121/11788 [1:34:24<05:30, 2.02it/s] 94%|█████████▍| 11122/11788 [1:34:24<05:29, 2.02it/s] 94%|█████████▍| 11123/11788 [1:34:25<05:29, 2.02it/s] 94%|█████████▍| 11124/11788 [1:34:25<05:28, 2.02it/s] 94%|█████████▍| 11125/11788 [1:34:26<05:28, 2.02it/s]{'loss': 2.5358, 'grad_norm': 0.2540516257286072, 'learning_rate': 9.605554533371086e-06, 'epoch': 13.21} + 94%|█████████▍| 11125/11788 [1:34:26<05:28, 2.02it/s] 94%|█████████▍| 11126/11788 [1:34:26<05:28, 2.02it/s] 94%|█████████▍| 11127/11788 [1:34:27<05:27, 2.02it/s] 94%|█████████▍| 11128/11788 [1:34:27<05:26, 2.02it/s] 94%|█████████▍| 11129/11788 [1:34:28<05:26, 2.02it/s] 94%|█████████▍| 11130/11788 [1:34:28<05:25, 2.02it/s] 94%|█████████▍| 11131/11788 [1:34:29<05:24, 2.02it/s] 94%|█████████▍| 11132/11788 [1:34:29<05:24, 2.02it/s] 94%|█████████▍| 11133/11788 [1:34:30<05:24, 2.02it/s] 94%|█████████▍| 11134/11788 [1:34:30<05:23, 2.02it/s] 94%|█████████▍| 11135/11788 [1:34:31<05:23, 2.02it/s] 94%|█████████▍| 11136/11788 [1:34:31<05:22, 2.02it/s] 94%|█████████▍| 11137/11788 [1:34:32<05:22, 2.02it/s] 94%|█████████▍| 11138/11788 [1:34:32<05:21, 2.02it/s] 94%|█████████▍| 11139/11788 [1:34:33<05:21, 2.02it/s] 95%|█████████▍| 11140/11788 [1:34:33<05:20, 2.02it/s] 95%|█████████▍| 11141/11788 [1:34:34<05:20, 2.02it/s] 95%|█████████▍| 11142/11788 [1:34:34<05:19, 2.02it/s] 95%|█████████▍| 11143/11788 [1:34:35<05:19, 2.02it/s] 95%|█████████▍| 11144/11788 [1:34:35<05:18, 2.02it/s] 95%|█████████▍| 11145/11788 [1:34:36<05:17, 2.02it/s] 95%|█████████▍| 11146/11788 [1:34:36<05:17, 2.02it/s] 95%|█████████▍| 11147/11788 [1:34:37<05:16, 2.02it/s] 95%|█████████▍| 11148/11788 [1:34:37<05:16, 2.02it/s] 95%|█████████▍| 11149/11788 [1:34:38<05:15, 2.02it/s] 95%|█████████▍| 11150/11788 [1:34:38<05:15, 2.02it/s]{'loss': 2.5512, 'grad_norm': 0.2550664246082306, 'learning_rate': 8.896927022362612e-06, 'epoch': 13.24} + 95%|█████████▍| 11150/11788 [1:34:38<05:15, 2.02it/s] 95%|█████████▍| 11151/11788 [1:34:39<05:15, 2.02it/s] 95%|█████████▍| 11152/11788 [1:34:39<05:15, 2.01it/s] 95%|█████████▍| 11153/11788 [1:34:40<05:14, 2.02it/s] 95%|█████████▍| 11154/11788 [1:34:40<05:14, 2.02it/s] 95%|█████████▍| 11155/11788 [1:34:41<05:13, 2.02it/s] 95%|█████████▍| 11156/11788 [1:34:41<05:12, 2.02it/s] 95%|█████████▍| 11157/11788 [1:34:42<05:11, 2.02it/s] 95%|█████████▍| 11158/11788 [1:34:42<05:11, 2.02it/s] 95%|█████████▍| 11159/11788 [1:34:43<05:10, 2.02it/s] 95%|█████████▍| 11160/11788 [1:34:43<05:10, 2.02it/s] 95%|█████████▍| 11161/11788 [1:34:44<05:09, 2.02it/s] 95%|█████████▍| 11162/11788 [1:34:44<05:09, 2.02it/s] 95%|█████████▍| 11163/11788 [1:34:45<05:09, 2.02it/s] 95%|█████████▍| 11164/11788 [1:34:45<05:08, 2.02it/s] 95%|█████████▍| 11165/11788 [1:34:46<05:08, 2.02it/s] 95%|█████████▍| 11166/11788 [1:34:46<05:08, 2.02it/s] 95%|█████████▍| 11167/11788 [1:34:47<05:07, 2.02it/s] 95%|█████████▍| 11168/11788 [1:34:47<05:07, 2.02it/s] 95%|█████████▍| 11169/11788 [1:34:48<05:06, 2.02it/s] 95%|█████████▍| 11170/11788 [1:34:48<05:05, 2.02it/s] 95%|█████████▍| 11171/11788 [1:34:49<05:04, 2.02it/s] 95%|█████████▍| 11172/11788 [1:34:49<05:04, 2.02it/s] 95%|█████████▍| 11173/11788 [1:34:50<05:04, 2.02it/s] 95%|█████████▍| 11174/11788 [1:34:50<05:03, 2.02it/s] 95%|█████████▍| 11175/11788 [1:34:51<05:02, 2.02it/s]{'loss': 2.539, 'grad_norm': 0.2571260631084442, 'learning_rate': 8.21521495434241e-06, 'epoch': 13.27} + 95%|█████████▍| 11175/11788 [1:34:51<05:02, 2.02it/s] 95%|█████████▍| 11176/11788 [1:34:51<05:02, 2.02it/s] 95%|█████████▍| 11177/11788 [1:34:52<05:01, 2.02it/s] 95%|█████████▍| 11178/11788 [1:34:52<05:01, 2.02it/s] 95%|█████████▍| 11179/11788 [1:34:53<05:01, 2.02it/s] 95%|█████████▍| 11180/11788 [1:34:53<05:00, 2.02it/s] 95%|█████████▍| 11181/11788 [1:34:54<05:00, 2.02it/s] 95%|█████████▍| 11182/11788 [1:34:54<04:59, 2.02it/s] 95%|█████████▍| 11183/11788 [1:34:55<04:58, 2.02it/s] 95%|█████████▍| 11184/11788 [1:34:55<04:58, 2.02it/s] 95%|█████████▍| 11185/11788 [1:34:56<04:58, 2.02it/s] 95%|█████████▍| 11186/11788 [1:34:56<04:57, 2.02it/s] 95%|█████████▍| 11187/11788 [1:34:57<04:57, 2.02it/s] 95%|█████████▍| 11188/11788 [1:34:57<04:56, 2.02it/s] 95%|█████████▍| 11189/11788 [1:34:58<04:56, 2.02it/s] 95%|█████████▍| 11190/11788 [1:34:58<04:55, 2.02it/s] 95%|█████████▍| 11191/11788 [1:34:58<04:55, 2.02it/s] 95%|█████████▍| 11192/11788 [1:34:59<04:55, 2.02it/s] 95%|█████████▍| 11193/11788 [1:34:59<04:54, 2.02it/s] 95%|█████████▍| 11194/11788 [1:35:00<04:54, 2.02it/s] 95%|█████████▍| 11195/11788 [1:35:00<04:53, 2.02it/s] 95%|█████████▍| 11196/11788 [1:35:01<04:53, 2.02it/s] 95%|█████████▍| 11197/11788 [1:35:01<04:52, 2.02it/s] 95%|█████████▍| 11198/11788 [1:35:02<04:52, 2.02it/s] 95%|█████████▌| 11199/11788 [1:35:02<04:51, 2.02it/s] 95%|█████████▌| 11200/11788 [1:35:03<04:51, 2.02it/s]{'loss': 2.5374, 'grad_norm': 0.2579394280910492, 'learning_rate': 7.560455691288548e-06, 'epoch': 13.3} + 95%|█████████▌| 11200/11788 [1:35:03<04:51, 2.02it/s] 95%|█████████▌| 11201/11788 [1:35:03<04:51, 2.02it/s] 95%|█████████▌| 11202/11788 [1:35:04<04:50, 2.02it/s] 95%|█████████▌| 11203/11788 [1:35:04<04:49, 2.02it/s] 95%|█████████▌| 11204/11788 [1:35:05<04:49, 2.02it/s] 95%|█████████▌| 11205/11788 [1:35:05<04:48, 2.02it/s] 95%|█████████▌| 11206/11788 [1:35:06<04:48, 2.02it/s] 95%|█████████▌| 11207/11788 [1:35:06<04:47, 2.02it/s] 95%|█████████▌| 11208/11788 [1:35:07<04:47, 2.02it/s] 95%|█████████▌| 11209/11788 [1:35:07<04:46, 2.02it/s] 95%|█████████▌| 11210/11788 [1:35:08<04:45, 2.02it/s] 95%|█████████▌| 11211/11788 [1:35:08<04:45, 2.02it/s] 95%|█████████▌| 11212/11788 [1:35:09<04:45, 2.02it/s] 95%|█████████▌| 11213/11788 [1:35:09<04:44, 2.02it/s] 95%|█████████▌| 11214/11788 [1:35:10<04:43, 2.02it/s] 95%|█████████▌| 11215/11788 [1:35:10<04:43, 2.02it/s] 95%|█████████▌| 11216/11788 [1:35:11<04:43, 2.02it/s] 95%|█████████▌| 11217/11788 [1:35:11<04:42, 2.02it/s] 95%|█████████▌| 11218/11788 [1:35:12<04:42, 2.02it/s] 95%|█████████▌| 11219/11788 [1:35:12<04:41, 2.02it/s] 95%|█████████▌| 11220/11788 [1:35:13<04:40, 2.02it/s] 95%|█████████▌| 11221/11788 [1:35:13<04:40, 2.02it/s] 95%|█████████▌| 11222/11788 [1:35:14<04:39, 2.02it/s] 95%|█████████▌| 11223/11788 [1:35:14<04:39, 2.02it/s] 95%|█████████▌| 11224/11788 [1:35:15<04:38, 2.02it/s] 95%|█████████▌| 11225/11788 [1:35:15<04:38, 2.02it/s]{'loss': 2.5371, 'grad_norm': 0.2554675042629242, 'learning_rate': 6.932685118001159e-06, 'epoch': 13.33} + 95%|█████████▌| 11225/11788 [1:35:15<04:38, 2.02it/s] 95%|█████████▌| 11226/11788 [1:35:16<04:38, 2.02it/s] 95%|█████████▌| 11227/11788 [1:35:16<04:38, 2.02it/s] 95%|█████████▌| 11228/11788 [1:35:17<04:37, 2.02it/s] 95%|█████████▌| 11229/11788 [1:35:17<04:36, 2.02it/s] 95%|█████████▌| 11230/11788 [1:35:18<04:36, 2.02it/s] 95%|█████████▌| 11231/11788 [1:35:18<04:35, 2.02it/s] 95%|█████████▌| 11232/11788 [1:35:19<04:35, 2.02it/s] 95%|█████████▌| 11233/11788 [1:35:19<04:35, 2.02it/s] 95%|█████████▌| 11234/11788 [1:35:20<04:34, 2.02it/s] 95%|█████████▌| 11235/11788 [1:35:20<04:33, 2.02it/s] 95%|█████████▌| 11236/11788 [1:35:21<04:33, 2.02it/s] 95%|█████████▌| 11237/11788 [1:35:21<04:32, 2.02it/s] 95%|█████████▌| 11238/11788 [1:35:22<04:32, 2.02it/s] 95%|█████████▌| 11239/11788 [1:35:22<04:32, 2.02it/s] 95%|█████████▌| 11240/11788 [1:35:23<04:31, 2.02it/s] 95%|█████████▌| 11241/11788 [1:35:23<04:31, 2.02it/s] 95%|█████████▌| 11242/11788 [1:35:24<04:30, 2.02it/s] 95%|█████████▌| 11243/11788 [1:35:24<04:29, 2.02it/s] 95%|█████████▌| 11244/11788 [1:35:25<04:29, 2.02it/s] 95%|█████████▌| 11245/11788 [1:35:25<04:28, 2.02it/s] 95%|█████████▌| 11246/11788 [1:35:26<04:28, 2.02it/s] 95%|█████████▌| 11247/11788 [1:35:26<04:27, 2.02it/s] 95%|█████████▌| 11248/11788 [1:35:27<04:27, 2.02it/s] 95%|█████████▌| 11249/11788 [1:35:27<04:26, 2.02it/s] 95%|█████████▌| 11250/11788 [1:35:28<04:26, 2.02it/s]{'loss': 2.5438, 'grad_norm': 0.2596590518951416, 'learning_rate': 6.3319376401357875e-06, 'epoch': 13.36} + 95%|█████████▌| 11250/11788 [1:35:28<04:26, 2.02it/s] 95%|█████████▌| 11251/11788 [1:35:28<04:26, 2.02it/s] 95%|█████████▌| 11252/11788 [1:35:29<04:25, 2.02it/s] 95%|█████████▌| 11253/11788 [1:35:29<04:24, 2.02it/s] 95%|█████████▌| 11254/11788 [1:35:30<04:24, 2.02it/s] 95%|█████████▌| 11255/11788 [1:35:30<04:23, 2.02it/s] 95%|█████████▌| 11256/11788 [1:35:31<04:23, 2.02it/s] 95%|█████████▌| 11257/11788 [1:35:31<04:22, 2.02it/s] 96%|���████████▌| 11258/11788 [1:35:32<04:22, 2.02it/s] 96%|█████████▌| 11259/11788 [1:35:32<04:22, 2.02it/s] 96%|█████████▌| 11260/11788 [1:35:33<04:21, 2.02it/s] 96%|█████████▌| 11261/11788 [1:35:33<04:20, 2.02it/s] 96%|█████████▌| 11262/11788 [1:35:34<04:20, 2.02it/s] 96%|█████████▌| 11263/11788 [1:35:34<04:19, 2.02it/s] 96%|█████████▌| 11264/11788 [1:35:35<04:19, 2.02it/s] 96%|█████████▌| 11265/11788 [1:35:35<04:18, 2.02it/s] 96%|█████████▌| 11266/11788 [1:35:36<04:18, 2.02it/s] 96%|█████████▌| 11267/11788 [1:35:36<04:17, 2.02it/s] 96%|█████████▌| 11268/11788 [1:35:37<04:17, 2.02it/s] 96%|█████████▌| 11269/11788 [1:35:37<04:16, 2.02it/s] 96%|█████████▌| 11270/11788 [1:35:38<04:16, 2.02it/s] 96%|█████████▌| 11271/11788 [1:35:38<04:15, 2.02it/s] 96%|█████████▌| 11272/11788 [1:35:39<04:15, 2.02it/s] 96%|█████████▌| 11273/11788 [1:35:39<04:14, 2.02it/s] 96%|█████████▌| 11274/11788 [1:35:40<04:14, 2.02it/s] 96%|█████████▌| 11275/11788 [1:35:40<04:13, 2.02it/s]{'loss': 2.5393, 'grad_norm': 0.2557508051395416, 'learning_rate': 5.7582461823171285e-06, 'epoch': 13.39} + 96%|█████████▌| 11275/11788 [1:35:40<04:13, 2.02it/s] 96%|█████████▌| 11276/11788 [1:35:41<04:13, 2.02it/s] 96%|█████████▌| 11277/11788 [1:35:41<04:12, 2.02it/s] 96%|█████████▌| 11278/11788 [1:35:42<04:12, 2.02it/s] 96%|█████████▌| 11279/11788 [1:35:42<04:11, 2.02it/s] 96%|█████████▌| 11280/11788 [1:35:43<04:11, 2.02it/s] 96%|█████████▌| 11281/11788 [1:35:43<04:10, 2.02it/s] 96%|█████████▌| 11282/11788 [1:35:44<04:10, 2.02it/s] 96%|█████████▌| 11283/11788 [1:35:44<04:10, 2.02it/s] 96%|█████████▌| 11284/11788 [1:35:45<04:09, 2.02it/s] 96%|█████████▌| 11285/11788 [1:35:45<04:09, 2.02it/s] 96%|█████████▌| 11286/11788 [1:35:46<04:08, 2.02it/s] 96%|█████████▌| 11287/11788 [1:35:46<04:07, 2.02it/s] 96%|█████████▌| 11288/11788 [1:35:47<04:07, 2.02it/s] 96%|█████████▌| 11289/11788 [1:35:47<04:07, 2.02it/s] 96%|█████████▌| 11290/11788 [1:35:48<04:06, 2.02it/s] 96%|█████████▌| 11291/11788 [1:35:48<04:05, 2.02it/s] 96%|█████████▌| 11292/11788 [1:35:48<04:05, 2.02it/s] 96%|█████████▌| 11293/11788 [1:35:49<04:04, 2.02it/s] 96%|█████████▌| 11294/11788 [1:35:49<04:04, 2.02it/s] 96%|█████████▌| 11295/11788 [1:35:50<04:03, 2.02it/s] 96%|█████████▌| 11296/11788 [1:35:50<04:03, 2.02it/s] 96%|█████████▌| 11297/11788 [1:35:51<04:03, 2.02it/s] 96%|█████████▌| 11298/11788 [1:35:51<04:02, 2.02it/s] 96%|█████████▌| 11299/11788 [1:35:52<04:02, 2.02it/s] 96%|█████████▌| 11300/11788 [1:35:52<04:01, 2.02it/s]{'loss': 2.5464, 'grad_norm': 0.25831231474876404, 'learning_rate': 5.211642186335408e-06, 'epoch': 13.42} + 96%|█████████▌| 11300/11788 [1:35:52<04:01, 2.02it/s] 96%|█████████▌| 11301/11788 [1:35:53<04:01, 2.01it/s] 96%|█████████▌| 11302/11788 [1:35:53<04:01, 2.01it/s] 96%|█████████▌| 11303/11788 [1:35:54<04:00, 2.01it/s] 96%|█████████▌| 11304/11788 [1:35:54<04:00, 2.02it/s] 96%|█████████▌| 11305/11788 [1:35:55<03:59, 2.02it/s] 96%|█████████▌| 11306/11788 [1:35:55<03:58, 2.02it/s] 96%|█████████▌| 11307/11788 [1:35:56<03:58, 2.02it/s] 96%|█████████▌| 11308/11788 [1:35:56<03:57, 2.02it/s] 96%|█████████▌| 11309/11788 [1:35:57<03:57, 2.02it/s] 96%|█████████▌| 11310/11788 [1:35:57<03:56, 2.02it/s] 96%|█████████▌| 11311/11788 [1:35:58<03:56, 2.02it/s] 96%|█████████▌| 11312/11788 [1:35:58<03:55, 2.02it/s] 96%|█████████▌| 11313/11788 [1:35:59<03:54, 2.02it/s] 96%|█████████▌| 11314/11788 [1:35:59<03:54, 2.02it/s] 96%|█████████▌| 11315/11788 [1:36:00<03:53, 2.02it/s] 96%|█████████▌| 11316/11788 [1:36:00<03:53, 2.02it/s] 96%|█████████▌| 11317/11788 [1:36:01<03:52, 2.02it/s] 96%|█████████▌| 11318/11788 [1:36:01<03:52, 2.02it/s] 96%|█████████▌| 11319/11788 [1:36:02<03:51, 2.02it/s] 96%|█████████▌| 11320/11788 [1:36:02<03:51, 2.02it/s] 96%|█████████▌| 11321/11788 [1:36:03<03:50, 2.02it/s] 96%|█████████▌| 11322/11788 [1:36:03<03:50, 2.02it/s] 96%|█████████▌| 11323/11788 [1:36:04<03:49, 2.02it/s] 96%|█████████▌| 11324/11788 [1:36:04<03:49, 2.02it/s] 96%|█████████▌| 11325/11788 [1:36:05<03:48, 2.02it/s] {'loss': 2.5476, 'grad_norm': 0.25492116808891296, 'learning_rate': 4.692155609422599e-06, 'epoch': 13.45} + 96%|█████████▌| 11325/11788 [1:36:05<03:48, 2.02it/s] 96%|█████████▌| 11326/11788 [1:36:05<03:48, 2.02it/s] 96%|█████████▌| 11327/11788 [1:36:06<03:48, 2.02it/s] 96%|█████████▌| 11328/11788 [1:36:06<03:47, 2.02it/s] 96%|█████████▌| 11329/11788 [1:36:07<03:47, 2.02it/s] 96%|█████████▌| 11330/11788 [1:36:07<03:46, 2.02it/s] 96%|█████████▌| 11331/11788 [1:36:08<03:46, 2.02it/s] 96%|█████████▌| 11332/11788 [1:36:08<03:46, 2.02it/s] 96%|█████████▌| 11333/11788 [1:36:09<03:45, 2.02it/s] 96%|█████████▌| 11334/11788 [1:36:09<03:44, 2.02it/s] 96%|█████████▌| 11335/11788 [1:36:10<03:44, 2.02it/s] 96%|█████████▌| 11336/11788 [1:36:10<03:43, 2.02it/s] 96%|█████████▌| 11337/11788 [1:36:11<03:43, 2.02it/s] 96%|█████████▌| 11338/11788 [1:36:11<03:42, 2.02it/s] 96%|█████████▌| 11339/11788 [1:36:12<03:42, 2.02it/s] 96%|█████████▌| 11340/11788 [1:36:12<03:41, 2.02it/s] 96%|█████████▌| 11341/11788 [1:36:13<03:41, 2.02it/s] 96%|█████████▌| 11342/11788 [1:36:13<03:40, 2.02it/s] 96%|█████████▌| 11343/11788 [1:36:14<03:39, 2.02it/s] 96%|█████████▌| 11344/11788 [1:36:14<03:39, 2.02it/s] 96%|█████████▌| 11345/11788 [1:36:15<03:38, 2.02it/s] 96%|█████████▋| 11346/11788 [1:36:15<03:39, 2.02it/s] 96%|█████████▋| 11347/11788 [1:36:16<03:38, 2.01it/s] 96%|█████████▋| 11348/11788 [1:36:16<03:38, 2.02it/s] 96%|█████████▋| 11349/11788 [1:36:17<03:37, 2.02it/s] 96%|█████████▋| 11350/11788 [1:36:17<03:36, 2.02it/s]{'loss': 2.548, 'grad_norm': 0.2551753520965576, 'learning_rate': 4.199814922610734e-06, 'epoch': 13.48} + 96%|█████████▋| 11350/11788 [1:36:17<03:36, 2.02it/s] 96%|█████████▋| 11351/11788 [1:36:18<03:36, 2.02it/s] 96%|█████████▋| 11352/11788 [1:36:18<03:36, 2.02it/s] 96%|█████████▋| 11353/11788 [1:36:19<03:35, 2.02it/s] 96%|█████████▋| 11354/11788 [1:36:19<03:34, 2.02it/s] 96%|█████████▋| 11355/11788 [1:36:20<03:34, 2.02it/s] 96%|█████████▋| 11356/11788 [1:36:20<03:33, 2.02it/s] 96%|█████████▋| 11357/11788 [1:36:21<03:33, 2.02it/s] 96%|█████████▋| 11358/11788 [1:36:21<03:32, 2.02it/s] 96%|█████████▋| 11359/11788 [1:36:22<03:32, 2.02it/s] 96%|█████████▋| 11360/11788 [1:36:22<03:31, 2.02it/s] 96%|█████████▋| 11361/11788 [1:36:23<03:31, 2.02it/s] 96%|█████████▋| 11362/11788 [1:36:23<03:30, 2.02it/s] 96%|█████████▋| 11363/11788 [1:36:24<03:30, 2.02it/s] 96%|█████████▋| 11364/11788 [1:36:24<03:29, 2.02it/s] 96%|█████████▋| 11365/11788 [1:36:25<03:29, 2.02it/s] 96%|█████████▋| 11366/11788 [1:36:25<03:28, 2.02it/s] 96%|█████████▋| 11367/11788 [1:36:26<03:28, 2.02it/s] 96%|█████████▋| 11368/11788 [1:36:26<03:27, 2.02it/s] 96%|█████████▋| 11369/11788 [1:36:27<03:27, 2.02it/s] 96%|█████████▋| 11370/11788 [1:36:27<03:26, 2.02it/s] 96%|█████████▋| 11371/11788 [1:36:28<03:26, 2.02it/s] 96%|█████████▋| 11372/11788 [1:36:28<03:25, 2.02it/s] 96%|█████████▋| 11373/11788 [1:36:29<03:25, 2.02it/s] 96%|█████████▋| 11374/11788 [1:36:29<03:24, 2.02it/s] 96%|█████████▋| 11375/11788 [1:36:30<03:24, 2.02it/s]{'loss': 2.5419, 'grad_norm': 0.25586843490600586, 'learning_rate': 3.734647109171596e-06, 'epoch': 13.51} + 96%|█████████▋| 11375/11788 [1:36:30<03:24, 2.02it/s] 97%|█████████▋| 11376/11788 [1:36:30<03:24, 2.02it/s] 97%|█████████▋| 11377/11788 [1:36:31<03:23, 2.02it/s] 97%|█████████▋| 11378/11788 [1:36:31<03:23, 2.02it/s] 97%|█████████▋| 11379/11788 [1:36:32<03:22, 2.02it/s] 97%|█████████▋| 11380/11788 [1:36:32<03:22, 2.02it/s] 97%|█████████▋| 11381/11788 [1:36:33<03:21, 2.02it/s] 97%|█████████▋| 11382/11788 [1:36:33<03:21, 2.02it/s] 97%|█████████▋| 11383/11788 [1:36:34<03:20, 2.02it/s] 97%|█████████▋| 11384/11788 [1:36:34<03:20, 2.02it/s] 97%|█████████▋| 11385/11788 [1:36:35<03:19, 2.02it/s] 97%|█████████▋| 11386/11788 [1:36:35<03:19, 2.02it/s] 97%|█████████▋| 11387/11788 [1:36:36<03:18, 2.02it/s] 97%|█████████▋| 11388/11788 [1:36:36<03:18, 2.02it/s] 97%|█████████▋| 11389/11788 [1:36:37<03:18, 2.01it/s] 97%|█████████▋| 11390/11788 [1:36:37<03:17, 2.01it/s] 97%|█████████▋| 11391/11788 [1:36:38<03:16, 2.02it/s] 97%|█████████▋| 11392/11788 [1:36:38<03:16, 2.02it/s] 97%|█████████▋| 11393/11788 [1:36:39<03:15, 2.02it/s] 97%|█████████▋| 11394/11788 [1:36:39<03:15, 2.02it/s] 97%|█████████▋| 11395/11788 [1:36:39<03:14, 2.02it/s] 97%|█████████▋| 11396/11788 [1:36:40<03:14, 2.02it/s] 97%|█████████▋| 11397/11788 [1:36:40<03:13, 2.02it/s] 97%|█████████▋| 11398/11788 [1:36:41<03:12, 2.02it/s] 97%|█████████▋| 11399/11788 [1:36:41<03:12, 2.02it/s] 97%|█████████▋| 11400/11788 [1:36:42<03:12, 2.02it/s]{'loss': 2.5486, 'grad_norm': 0.2575015723705292, 'learning_rate': 3.2966776631379034e-06, 'epoch': 13.54} + 97%|█████████▋| 11400/11788 [1:36:42<03:12, 2.02it/s] 97%|█████████▋| 11401/11788 [1:36:42<03:11, 2.02it/s] 97%|█████████▋| 11402/11788 [1:36:43<03:11, 2.02it/s] 97%|█████████▋| 11403/11788 [1:36:43<03:10, 2.02it/s] 97%|█████████▋| 11404/11788 [1:36:44<03:09, 2.02it/s] 97%|█████████▋| 11405/11788 [1:36:44<03:09, 2.02it/s] 97%|█████████▋| 11406/11788 [1:36:45<03:08, 2.02it/s] 97%|█████████▋| 11407/11788 [1:36:45<03:08, 2.02it/s] 97%|█████████▋| 11408/11788 [1:36:46<03:07, 2.02it/s] 97%|█████████▋| 11409/11788 [1:36:46<03:07, 2.02it/s] 97%|█████████▋| 11410/11788 [1:36:47<03:06, 2.02it/s] 97%|█████████▋| 11411/11788 [1:36:47<03:06, 2.02it/s] 97%|█████████▋| 11412/11788 [1:36:48<03:05, 2.02it/s] 97%|█████████▋| 11413/11788 [1:36:48<03:05, 2.02it/s] 97%|█████████▋| 11414/11788 [1:36:49<03:05, 2.02it/s] 97%|█████████▋| 11415/11788 [1:36:49<03:04, 2.02it/s] 97%|█████████▋| 11416/11788 [1:36:50<03:04, 2.02it/s] 97%|█████████▋| 11417/11788 [1:36:50<03:03, 2.02it/s] 97%|█████████▋| 11418/11788 [1:36:51<03:03, 2.02it/s] 97%|█████████▋| 11419/11788 [1:36:51<03:02, 2.02it/s] 97%|█████████▋| 11420/11788 [1:36:52<03:02, 2.02it/s] 97%|█████████▋| 11421/11788 [1:36:52<03:01, 2.02it/s] 97%|█████████▋| 11422/11788 [1:36:53<03:01, 2.02it/s] 97%|█████████▋| 11423/11788 [1:36:53<03:00, 2.02it/s] 97%|█████████▋| 11424/11788 [1:36:54<03:00, 2.02it/s] 97%|█████████▋| 11425/11788 [1:36:54<02:59, 2.02it/s] {'loss': 2.5477, 'grad_norm': 0.25529131293296814, 'learning_rate': 2.8859305879057606e-06, 'epoch': 13.57} + 97%|█████████▋| 11425/11788 [1:36:54<02:59, 2.02it/s] 97%|█████████▋| 11426/11788 [1:36:55<02:59, 2.02it/s] 97%|█████████▋| 11427/11788 [1:36:55<02:58, 2.02it/s] 97%|█████████▋| 11428/11788 [1:36:56<02:57, 2.02it/s] 97%|█████████▋| 11429/11788 [1:36:56<02:57, 2.02it/s] 97%|█████████▋| 11430/11788 [1:36:57<02:56, 2.02it/s] 97%|█████████▋| 11431/11788 [1:36:57<02:56, 2.02it/s] 97%|█████████▋| 11432/11788 [1:36:58<02:55, 2.02it/s] 97%|█████████▋| 11433/11788 [1:36:58<02:55, 2.02it/s] 97%|█████████▋| 11434/11788 [1:36:59<02:55, 2.02it/s] 97%|█████████▋| 11435/11788 [1:36:59<02:54, 2.02it/s] 97%|█████████▋| 11436/11788 [1:37:00<02:53, 2.02it/s] 97%|█████████▋| 11437/11788 [1:37:00<02:53, 2.02it/s] 97%|█████████▋| 11438/11788 [1:37:01<02:52, 2.02it/s] 97%|█████████▋| 11439/11788 [1:37:01<02:52, 2.02it/s] 97%|█████████▋| 11440/11788 [1:37:02<02:51, 2.02it/s] 97%|█████████▋| 11441/11788 [1:37:02<02:51, 2.02it/s] 97%|█████████▋| 11442/11788 [1:37:03<02:51, 2.02it/s] 97%|█████████▋| 11443/11788 [1:37:03<02:50, 2.02it/s] 97%|█████████▋| 11444/11788 [1:37:04<02:50, 2.02it/s] 97%|█████████▋| 11445/11788 [1:37:04<03:04, 1.86it/s] 97%|█████████▋| 11446/11788 [1:37:05<02:59, 1.91it/s] 97%|█████████▋| 11447/11788 [1:37:05<02:55, 1.94it/s] 97%|█████████▋| 11448/11788 [1:37:06<02:53, 1.96it/s] 97%|█████████▋| 11449/11788 [1:37:06<02:50, 1.98it/s] 97%|█████████▋| 11450/11788 [1:37:07<02:49, 1.99it/s]{'loss': 2.5302, 'grad_norm': 0.25606557726860046, 'learning_rate': 2.502428394919598e-06, 'epoch': 13.6} + 97%|█████████▋| 11450/11788 [1:37:07<02:49, 1.99it/s] 97%|█████████▋| 11451/11788 [1:37:07<02:48, 2.00it/s] 97%|█████████▋| 11452/11788 [1:37:08<02:47, 2.01it/s] 97%|█████████▋| 11453/11788 [1:37:08<02:46, 2.01it/s] 97%|█████████▋| 11454/11788 [1:37:09<02:45, 2.01it/s] 97%|█████████▋| 11455/11788 [1:37:09<02:45, 2.02it/s] 97%|█████████▋| 11456/11788 [1:37:10<02:44, 2.02it/s] 97%|█████████▋| 11457/11788 [1:37:10<02:43, 2.02it/s] 97%|█████████▋| 11458/11788 [1:37:11<02:42, 2.02it/s] 97%|█████████▋| 11459/11788 [1:37:11<02:42, 2.02it/s] 97%|█████████▋| 11460/11788 [1:37:12<02:42, 2.02it/s] 97%|█████████▋| 11461/11788 [1:37:12<02:41, 2.02it/s] 97%|█████████▋| 11462/11788 [1:37:13<02:41, 2.02it/s] 97%|█████████▋| 11463/11788 [1:37:13<02:40, 2.03it/s] 97%|█████████▋| 11464/11788 [1:37:14<02:39, 2.03it/s] 97%|█████████▋| 11465/11788 [1:37:14<02:39, 2.02it/s] 97%|█████████▋| 11466/11788 [1:37:15<02:38, 2.03it/s] 97%|█████████▋| 11467/11788 [1:37:15<02:38, 2.03it/s] 97%|█████████▋| 11468/11788 [1:37:16<02:37, 2.03it/s] 97%|█████████▋| 11469/11788 [1:37:16<02:37, 2.03it/s] 97%|█████████▋| 11470/11788 [1:37:17<02:36, 2.03it/s] 97%|█████████▋| 11471/11788 [1:37:17<02:36, 2.03it/s] 97%|█████████▋| 11472/11788 [1:37:18<02:35, 2.03it/s] 97%|█████████▋| 11473/11788 [1:37:18<02:35, 2.03it/s] 97%|█████████▋| 11474/11788 [1:37:19<02:34, 2.03it/s] 97%|█████████▋| 11475/11788 [1:37:19<02:34, 2.02it/s]{'loss': 2.5446, 'grad_norm': 0.2536528706550598, 'learning_rate': 2.1461921024379938e-06, 'epoch': 13.63} + 97%|█████████▋| 11475/11788 [1:37:19<02:34, 2.02it/s] 97%|█████████▋| 11476/11788 [1:37:20<02:34, 2.02it/s] 97%|█████████▋| 11477/11788 [1:37:20<02:33, 2.02it/s] 97%|█████████▋| 11478/11788 [1:37:21<02:33, 2.03it/s] 97%|█████████▋| 11479/11788 [1:37:21<02:32, 2.02it/s] 97%|█████████▋| 11480/11788 [1:37:22<02:32, 2.03it/s] 97%|█████████▋| 11481/11788 [1:37:22<02:31, 2.03it/s] 97%|█████████▋| 11482/11788 [1:37:23<02:30, 2.03it/s] 97%|█████████▋| 11483/11788 [1:37:23<02:30, 2.03it/s] 97%|█████████▋| 11484/11788 [1:37:24<02:30, 2.03it/s] 97%|█████████▋| 11485/11788 [1:37:24<02:29, 2.03it/s] 97%|█████████▋| 11486/11788 [1:37:25<02:29, 2.02it/s] 97%|█████████▋| 11487/11788 [1:37:25<02:28, 2.02it/s] 97%|█████████▋| 11488/11788 [1:37:26<02:28, 2.02it/s] 97%|█████████▋| 11489/11788 [1:37:26<02:27, 2.03it/s] 97%|█████████▋| 11490/11788 [1:37:27<02:27, 2.02it/s] 97%|█████████▋| 11491/11788 [1:37:27<02:26, 2.03it/s] 97%|█████████▋| 11492/11788 [1:37:28<02:26, 2.02it/s] 97%|█████████▋| 11493/11788 [1:37:28<02:25, 2.03it/s] 98%|█████████▊| 11494/11788 [1:37:29<02:25, 2.03it/s] 98%|█████████▊| 11495/11788 [1:37:29<02:24, 2.03it/s] 98%|█████████▊| 11496/11788 [1:37:30<02:23, 2.03it/s] 98%|█████████▊| 11497/11788 [1:37:30<02:23, 2.02it/s] 98%|█████████▊| 11498/11788 [1:37:31<02:23, 2.03it/s] 98%|█████████▊| 11499/11788 [1:37:31<02:22, 2.03it/s] 98%|█████████▊| 11500/11788 [1:37:32<02:22, 2.03it/s]{'loss': 2.538, 'grad_norm': 0.2540939748287201, 'learning_rate': 1.8172412343819833e-06, 'epoch': 13.66} + 98%|█████████▊| 11500/11788 [1:37:32<02:22, 2.03it/s] 98%|█████████▊| 11501/11788 [1:37:32<02:21, 2.02it/s] 98%|█████████▊| 11502/11788 [1:37:33<02:21, 2.02it/s] 98%|█████████▊| 11503/11788 [1:37:33<02:20, 2.02it/s] 98%|█████████▊| 11504/11788 [1:37:34<02:20, 2.03it/s] 98%|█████████▊| 11505/11788 [1:37:34<02:19, 2.03it/s] 98%|█████████▊| 11506/11788 [1:37:34<02:19, 2.03it/s] 98%|█████████▊| 11507/11788 [1:37:35<02:18, 2.03it/s] 98%|█████████▊| 11508/11788 [1:37:35<02:18, 2.02it/s] 98%|█████████▊| 11509/11788 [1:37:36<02:17, 2.03it/s] 98%|█████████▊| 11510/11788 [1:37:36<02:17, 2.02it/s] 98%|█████████▊| 11511/11788 [1:37:37<02:16, 2.03it/s] 98%|█████████▊| 11512/11788 [1:37:37<02:16, 2.02it/s] 98%|█████████▊| 11513/11788 [1:37:38<02:15, 2.03it/s] 98%|█████████▊| 11514/11788 [1:37:38<02:15, 2.03it/s] 98%|█████████▊| 11515/11788 [1:37:39<02:14, 2.03it/s] 98%|█████████▊| 11516/11788 [1:37:39<02:14, 2.03it/s] 98%|█████████▊| 11517/11788 [1:37:40<02:13, 2.03it/s] 98%|█████████▊| 11518/11788 [1:37:40<02:13, 2.02it/s] 98%|█████████▊| 11519/11788 [1:37:41<02:13, 2.02it/s] 98%|█████████▊| 11520/11788 [1:37:41<02:12, 2.03it/s] 98%|█████████▊| 11521/11788 [1:37:42<02:11, 2.02it/s] 98%|█████████▊| 11522/11788 [1:37:42<02:11, 2.03it/s] 98%|█████████▊| 11523/11788 [1:37:43<02:10, 2.03it/s] 98%|█████████▊| 11524/11788 [1:37:43<02:10, 2.03it/s] 98%|█████████▊| 11525/11788 [1:37:44<02:09, 2.03it/s]{'loss': 2.54, 'grad_norm': 0.25606706738471985, 'learning_rate': 1.515593819265082e-06, 'epoch': 13.69} + 98%|█████████▊| 11525/11788 [1:37:44<02:09, 2.03it/s] 98%|█████████▊| 11526/11788 [1:37:44<02:09, 2.02it/s] 98%|█████████▊| 11527/11788 [1:37:45<02:09, 2.02it/s] 98%|█████████▊| 11528/11788 [1:37:45<02:08, 2.02it/s] 98%|█████████▊| 11529/11788 [1:37:46<02:08, 2.02it/s] 98%|█████████▊| 11530/11788 [1:37:46<02:07, 2.02it/s] 98%|█████████▊| 11531/11788 [1:37:47<02:07, 2.02it/s] 98%|█████████▊| 11532/11788 [1:37:47<02:06, 2.02it/s] 98%|█████████▊| 11533/11788 [1:37:48<02:06, 2.02it/s] 98%|█████████▊| 11534/11788 [1:37:48<02:05, 2.02it/s] 98%|█████████▊| 11535/11788 [1:37:49<02:04, 2.02it/s] 98%|█████████▊| 11536/11788 [1:37:49<02:04, 2.02it/s] 98%|█████████▊| 11537/11788 [1:37:50<02:03, 2.03it/s] 98%|█████████▊| 11538/11788 [1:37:50<02:13, 1.87it/s] 98%|█████████▊| 11539/11788 [1:37:51<02:10, 1.91it/s] 98%|█████████▊| 11540/11788 [1:37:51<02:07, 1.94it/s] 98%|█████████▊| 11541/11788 [1:37:52<02:05, 1.97it/s] 98%|█████████▊| 11542/11788 [1:37:52<02:04, 1.98it/s] 98%|█████████▊| 11543/11788 [1:37:53<02:02, 1.99it/s] 98%|█████████▊| 11544/11788 [1:37:53<02:01, 2.00it/s] 98%|█████████▊| 11545/11788 [1:37:54<02:01, 2.01it/s] 98%|█████████▊| 11546/11788 [1:37:54<02:00, 2.01it/s] 98%|█████████▊| 11547/11788 [1:37:55<01:59, 2.02it/s] 98%|█████████▊| 11548/11788 [1:37:55<01:58, 2.02it/s] 98%|█████████▊| 11549/11788 [1:37:56<01:58, 2.02it/s] 98%|█████████▊| 11550/11788 [1:37:56<01:57, 2.02it/s]{'loss': 2.5434, 'grad_norm': 0.25352612137794495, 'learning_rate': 1.2412663892048537e-06, 'epoch': 13.72} + 98%|█████████▊| 11550/11788 [1:37:56<01:57, 2.02it/s] 98%|█████████▊| 11551/11788 [1:37:57<01:57, 2.02it/s] 98%|█████████▊| 11552/11788 [1:37:57<01:56, 2.02it/s] 98%|█████████▊| 11553/11788 [1:37:58<01:56, 2.02it/s] 98%|█████████▊| 11554/11788 [1:37:58<01:55, 2.02it/s] 98%|█████████▊| 11555/11788 [1:37:59<01:55, 2.02it/s] 98%|█████████▊| 11556/11788 [1:37:59<01:54, 2.02it/s] 98%|█████████▊| 11557/11788 [1:38:00<01:54, 2.02it/s] 98%|█████████▊| 11558/11788 [1:38:00<01:53, 2.02it/s] 98%|█████████▊| 11559/11788 [1:38:01<01:53, 2.02it/s] 98%|█████████▊| 11560/11788 [1:38:01<01:52, 2.02it/s] 98%|█████████▊| 11561/11788 [1:38:02<01:52, 2.03it/s] 98%|█████████▊| 11562/11788 [1:38:02<01:51, 2.02it/s] 98%|█████████▊| 11563/11788 [1:38:03<01:51, 2.02it/s] 98%|█████████▊| 11564/11788 [1:38:03<01:50, 2.02it/s] 98%|█████████▊| 11565/11788 [1:38:04<01:50, 2.02it/s] 98%|█████████▊| 11566/11788 [1:38:04<01:49, 2.02it/s] 98%|█████████▊| 11567/11788 [1:38:05<01:49, 2.02it/s] 98%|█████████▊| 11568/11788 [1:38:05<01:49, 2.02it/s] 98%|█████████▊| 11569/11788 [1:38:06<01:48, 2.02it/s] 98%|█████████▊| 11570/11788 [1:38:06<01:47, 2.02it/s] 98%|█████████▊| 11571/11788 [1:38:07<01:47, 2.02it/s] 98%|█████████▊| 11572/11788 [1:38:07<01:47, 2.02it/s] 98%|█████████▊| 11573/11788 [1:38:08<01:46, 2.02it/s] 98%|█████████▊| 11574/11788 [1:38:08<01:45, 2.02it/s] 98%|█████████▊| 11575/11788 [1:38:09<01:45, 2.02it/s]{'loss': 2.5338, 'grad_norm': 0.2542591094970703, 'learning_rate': 9.942739790173016e-07, 'epoch': 13.75} + 98%|█████████▊| 11575/11788 [1:38:09<01:45, 2.02it/s] 98%|█████████▊| 11576/11788 [1:38:09<01:44, 2.02it/s] 98%|█████████▊| 11577/11788 [1:38:10<01:44, 2.02it/s] 98%|█████████▊| 11578/11788 [1:38:10<01:43, 2.02it/s] 98%|█████████▊| 11579/11788 [1:38:11<01:43, 2.02it/s] 98%|█████████▊| 11580/11788 [1:38:11<01:42, 2.02it/s] 98%|█████████▊| 11581/11788 [1:38:12<01:42, 2.02it/s] 98%|█████████▊| 11582/11788 [1:38:12<01:41, 2.02it/s] 98%|█████████▊| 11583/11788 [1:38:13<01:41, 2.02it/s] 98%|█████████▊| 11584/11788 [1:38:13<01:40, 2.02it/s] 98%|█████████▊| 11585/11788 [1:38:14<01:40, 2.02it/s] 98%|█████████▊| 11586/11788 [1:38:14<01:39, 2.02it/s] 98%|█████████▊| 11587/11788 [1:38:15<01:39, 2.02it/s] 98%|█████████▊| 11588/11788 [1:38:15<01:38, 2.02it/s] 98%|█████████▊| 11589/11788 [1:38:16<01:38, 2.02it/s] 98%|█████████▊| 11590/11788 [1:38:16<01:38, 2.02it/s] 98%|█████████▊| 11591/11788 [1:38:17<01:37, 2.02it/s] 98%|█████████▊| 11592/11788 [1:38:17<01:37, 2.02it/s] 98%|█████████▊| 11593/11788 [1:38:18<01:36, 2.02it/s] 98%|█████████▊| 11594/11788 [1:38:18<01:36, 2.02it/s] 98%|█████████▊| 11595/11788 [1:38:19<01:35, 2.02it/s] 98%|█████████▊| 11596/11788 [1:38:19<01:35, 2.02it/s] 98%|█████████▊| 11597/11788 [1:38:20<01:34, 2.02it/s] 98%|█████████▊| 11598/11788 [1:38:20<01:34, 2.02it/s] 98%|█████████▊| 11599/11788 [1:38:21<01:33, 2.02it/s] 98%|█████████▊| 11600/11788 [1:38:21<01:33, 2.02it/s]{'loss': 2.5526, 'grad_norm': 0.257892370223999, 'learning_rate': 7.746301253925835e-07, 'epoch': 13.78} + 98%|█████████▊| 11600/11788 [1:38:21<01:33, 2.02it/s] 98%|█████████▊| 11601/11788 [1:38:22<01:32, 2.02it/s] 98%|█████████▊| 11602/11788 [1:38:22<01:31, 2.02it/s] 98%|█████████▊| 11603/11788 [1:38:23<01:31, 2.02it/s] 98%|█████████▊| 11604/11788 [1:38:23<01:31, 2.02it/s] 98%|█████████▊| 11605/11788 [1:38:24<01:30, 2.02it/s] 98%|█████████▊| 11606/11788 [1:38:24<01:30, 2.02it/s] 98%|█████████▊| 11607/11788 [1:38:25<01:29, 2.02it/s] 98%|█████████▊| 11608/11788 [1:38:25<01:29, 2.02it/s] 98%|█████████▊| 11609/11788 [1:38:26<01:28, 2.02it/s] 98%|█████████▊| 11610/11788 [1:38:26<01:28, 2.02it/s] 98%|█████████▊| 11611/11788 [1:38:27<01:27, 2.02it/s] 99%|█████████▊| 11612/11788 [1:38:27<01:27, 2.02it/s] 99%|█████████▊| 11613/11788 [1:38:28<01:26, 2.02it/s] 99%|█████████▊| 11614/11788 [1:38:28<01:26, 2.02it/s] 99%|█████████▊| 11615/11788 [1:38:29<01:25, 2.02it/s] 99%|█████████▊| 11616/11788 [1:38:29<01:25, 2.02it/s] 99%|█████████▊| 11617/11788 [1:38:30<01:24, 2.02it/s] 99%|█████████▊| 11618/11788 [1:38:30<01:24, 2.02it/s] 99%|█████████▊| 11619/11788 [1:38:31<01:23, 2.02it/s] 99%|█████████▊| 11620/11788 [1:38:31<01:23, 2.02it/s] 99%|█████████▊| 11621/11788 [1:38:32<01:22, 2.02it/s] 99%|█████████▊| 11622/11788 [1:38:32<01:22, 2.02it/s] 99%|█████████▊| 11623/11788 [1:38:33<01:21, 2.02it/s] 99%|█████████▊| 11624/11788 [1:38:33<01:21, 2.02it/s] 99%|█████████▊| 11625/11788 [1:38:33<01:20, 2.02it/s]{'loss': 2.5584, 'grad_norm': 0.2570687532424927, 'learning_rate': 5.823468661531606e-07, 'epoch': 13.81} + 99%|█████████▊| 11625/11788 [1:38:33<01:20, 2.02it/s] 99%|█████████▊| 11626/11788 [1:38:34<01:20, 2.02it/s] 99%|█████████▊| 11627/11788 [1:38:34<01:19, 2.02it/s] 99%|█████████▊| 11628/11788 [1:38:35<01:19, 2.02it/s] 99%|█████████▊| 11629/11788 [1:38:35<01:18, 2.02it/s] 99%|█████████▊| 11630/11788 [1:38:36<01:18, 2.02it/s] 99%|█████████▊| 11631/11788 [1:38:36<01:17, 2.02it/s] 99%|█████████▊| 11632/11788 [1:38:37<01:17, 2.02it/s] 99%|█████████▊| 11633/11788 [1:38:37<01:16, 2.02it/s] 99%|█████████▊| 11634/11788 [1:38:38<01:16, 2.02it/s] 99%|█████████▊| 11635/11788 [1:38:38<01:15, 2.02it/s] 99%|█████████▊| 11636/11788 [1:38:39<01:15, 2.02it/s] 99%|█████████▊| 11637/11788 [1:38:39<01:14, 2.02it/s] 99%|█████████▊| 11638/11788 [1:38:40<01:14, 2.02it/s] 99%|█████████▊| 11639/11788 [1:38:40<01:13, 2.02it/s] 99%|█████████▊| 11640/11788 [1:38:41<01:13, 2.02it/s] 99%|█████████▉| 11641/11788 [1:38:41<01:12, 2.02it/s] 99%|█████████▉| 11642/11788 [1:38:42<01:12, 2.02it/s] 99%|█████████▉| 11643/11788 [1:38:42<01:11, 2.02it/s] 99%|█████████▉| 11644/11788 [1:38:43<01:11, 2.02it/s] 99%|█████████▉| 11645/11788 [1:38:43<01:10, 2.02it/s] 99%|█████████▉| 11646/11788 [1:38:44<01:10, 2.02it/s] 99%|█████████▉| 11647/11788 [1:38:44<01:09, 2.02it/s] 99%|█████████▉| 11648/11788 [1:38:45<01:09, 2.02it/s] 99%|█████████▉| 11649/11788 [1:38:45<01:08, 2.02it/s] 99%|█████████▉| 11650/11788 [1:38:46<01:08, 2.02it/s]{'loss': 2.5429, 'grad_norm': 0.25447869300842285, 'learning_rate': 4.17434739594158e-07, 'epoch': 13.84} + 99%|█████████▉| 11650/11788 [1:38:46<01:08, 2.02it/s] 99%|█████████▉| 11651/11788 [1:38:46<01:07, 2.02it/s] 99%|█████████▉| 11652/11788 [1:38:47<01:07, 2.02it/s] 99%|█████████▉| 11653/11788 [1:38:47<01:06, 2.02it/s] 99%|█████████▉| 11654/11788 [1:38:48<01:06, 2.02it/s] 99%|█████████▉| 11655/11788 [1:38:48<01:05, 2.02it/s] 99%|█████████▉| 11656/11788 [1:38:49<01:05, 2.02it/s] 99%|█████████▉| 11657/11788 [1:38:49<01:04, 2.02it/s] 99%|█████████▉| 11658/11788 [1:38:50<01:04, 2.02it/s] 99%|█████████▉| 11659/11788 [1:38:50<01:03, 2.02it/s] 99%|█████████▉| 11660/11788 [1:38:51<01:03, 2.02it/s] 99%|█████████▉| 11661/11788 [1:38:51<01:02, 2.02it/s] 99%|█████████▉| 11662/11788 [1:38:52<01:02, 2.02it/s] 99%|█████████▉| 11663/11788 [1:38:52<01:01, 2.02it/s] 99%|█████████▉| 11664/11788 [1:38:53<01:01, 2.02it/s] 99%|█████████▉| 11665/11788 [1:38:53<01:00, 2.02it/s] 99%|█████████▉| 11666/11788 [1:38:54<01:00, 2.02it/s] 99%|█████████▉| 11667/11788 [1:38:54<00:59, 2.02it/s] 99%|█████████▉| 11668/11788 [1:38:55<00:59, 2.02it/s] 99%|█████████▉| 11669/11788 [1:38:55<00:58, 2.02it/s] 99%|█████████▉| 11670/11788 [1:38:56<00:58, 2.02it/s] 99%|█████████▉| 11671/11788 [1:38:56<00:57, 2.02it/s] 99%|█████████▉| 11672/11788 [1:38:57<00:57, 2.02it/s] 99%|█████████▉| 11673/11788 [1:38:57<00:57, 2.02it/s] 99%|█████████▉| 11674/11788 [1:38:58<00:56, 2.02it/s] 99%|█████████▉| 11675/11788 [1:38:58<00:56, 2.02it/s]{'loss': 2.5355, 'grad_norm': 0.2554433345794678, 'learning_rate': 2.799027839056612e-07, 'epoch': 13.87} + 99%|█████████▉| 11675/11788 [1:38:58<00:56, 2.02it/s] 99%|█████████▉| 11676/11788 [1:38:59<00:55, 2.02it/s] 99%|█████████▉| 11677/11788 [1:38:59<00:55, 2.02it/s] 99%|█████████▉| 11678/11788 [1:39:00<00:54, 2.02it/s] 99%|█████████▉| 11679/11788 [1:39:00<00:53, 2.02it/s] 99%|█████████▉| 11680/11788 [1:39:01<00:53, 2.02it/s] 99%|█████████▉| 11681/11788 [1:39:01<00:52, 2.02it/s] 99%|█████████▉| 11682/11788 [1:39:02<00:52, 2.02it/s] 99%|█████████▉| 11683/11788 [1:39:02<00:51, 2.02it/s] 99%|█████████▉| 11684/11788 [1:39:03<00:51, 2.02it/s] 99%|█████████▉| 11685/11788 [1:39:03<00:51, 2.02it/s] 99%|█████████▉| 11686/11788 [1:39:04<00:50, 2.02it/s] 99%|█████████▉| 11687/11788 [1:39:04<00:50, 2.02it/s] 99%|█████████▉| 11688/11788 [1:39:05<00:49, 2.02it/s] 99%|█████████▉| 11689/11788 [1:39:05<00:49, 2.02it/s] 99%|█████████▉| 11690/11788 [1:39:06<00:48, 2.02it/s] 99%|█████████▉| 11691/11788 [1:39:06<00:48, 2.02it/s] 99%|█████████▉| 11692/11788 [1:39:07<00:47, 2.02it/s] 99%|█████████▉| 11693/11788 [1:39:07<00:46, 2.02it/s] 99%|█████████▉| 11694/11788 [1:39:08<00:46, 2.02it/s] 99%|█████████▉| 11695/11788 [1:39:08<00:46, 2.02it/s] 99%|█████████▉| 11696/11788 [1:39:09<00:45, 2.02it/s] 99%|█████████▉| 11697/11788 [1:39:09<00:45, 2.02it/s] 99%|█████████▉| 11698/11788 [1:39:10<00:44, 2.02it/s] 99%|█████████▉| 11699/11788 [1:39:10<00:44, 2.02it/s] 99%|█████████▉| 11700/11788 [1:39:11<00:43, 2.02it/s]{'loss': 2.5307, 'grad_norm': 0.2563607692718506, 'learning_rate': 1.6975853667750009e-07, 'epoch': 13.9} + 99%|█████████▉| 11700/11788 [1:39:11<00:43, 2.02it/s] 99%|█████████▉| 11701/11788 [1:39:11<00:43, 2.01it/s] 99%|█████████▉| 11702/11788 [1:39:12<00:42, 2.01it/s] 99%|█████████▉| 11703/11788 [1:39:12<00:42, 2.01it/s] 99%|█████████▉| 11704/11788 [1:39:13<00:41, 2.02it/s] 99%|█████████▉| 11705/11788 [1:39:13<00:41, 2.02it/s] 99%|█████████▉| 11706/11788 [1:39:14<00:40, 2.02it/s] 99%|█████████▉| 11707/11788 [1:39:14<00:40, 2.02it/s] 99%|█████████▉| 11708/11788 [1:39:15<00:39, 2.02it/s] 99%|█████████▉| 11709/11788 [1:39:15<00:39, 2.02it/s] 99%|█████████▉| 11710/11788 [1:39:16<00:38, 2.02it/s] 99%|█████████▉| 11711/11788 [1:39:16<00:38, 2.02it/s] 99%|█████████▉| 11712/11788 [1:39:17<00:37, 2.02it/s] 99%|█████████▉| 11713/11788 [1:39:17<00:37, 2.02it/s] 99%|█████████▉| 11714/11788 [1:39:18<00:36, 2.02it/s] 99%|█████████▉| 11715/11788 [1:39:18<00:36, 2.02it/s] 99%|█████████▉| 11716/11788 [1:39:19<00:35, 2.02it/s] 99%|█████████▉| 11717/11788 [1:39:19<00:35, 2.02it/s] 99%|█████████▉| 11718/11788 [1:39:20<00:34, 2.02it/s] 99%|█████████▉| 11719/11788 [1:39:20<00:34, 2.02it/s] 99%|█████████▉| 11720/11788 [1:39:21<00:33, 2.02it/s] 99%|█████████▉| 11721/11788 [1:39:21<00:33, 2.02it/s] 99%|█████████▉| 11722/11788 [1:39:21<00:32, 2.02it/s] 99%|█████████▉| 11723/11788 [1:39:22<00:32, 2.02it/s] 99%|█████████▉| 11724/11788 [1:39:22<00:31, 2.02it/s] 99%|█████████▉| 11725/11788 [1:39:23<00:31, 2.02it/s]{'loss': 2.5371, 'grad_norm': 0.25308409333229065, 'learning_rate': 8.700803448596916e-08, 'epoch': 13.93} + 99%|█████████▉| 11725/11788 [1:39:23<00:31, 2.02it/s] 99%|█████████▉| 11726/11788 [1:39:23<00:30, 2.02it/s] 99%|█████████▉| 11727/11788 [1:39:24<00:30, 2.02it/s] 99%|█████████▉| 11728/11788 [1:39:24<00:29, 2.02it/s] 99%|█████████▉| 11729/11788 [1:39:25<00:29, 2.02it/s] 100%|█████████▉| 11730/11788 [1:39:25<00:28, 2.02it/s] 100%|█████████▉| 11731/11788 [1:39:26<00:28, 2.02it/s] 100%|█████████▉| 11732/11788 [1:39:26<00:27, 2.02it/s] 100%|█████████▉| 11733/11788 [1:39:27<00:27, 2.02it/s] 100%|█████████▉| 11734/11788 [1:39:27<00:26, 2.02it/s] 100%|█████████▉| 11735/11788 [1:39:28<00:26, 2.02it/s] 100%|█████████▉| 11736/11788 [1:39:28<00:25, 2.02it/s] 100%|█████████▉| 11737/11788 [1:39:29<00:25, 2.02it/s] 100%|█████████▉| 11738/11788 [1:39:29<00:24, 2.02it/s] 100%|█████████▉| 11739/11788 [1:39:30<00:24, 2.02it/s] 100%|█████████▉| 11740/11788 [1:39:30<00:23, 2.02it/s] 100%|█████████▉| 11741/11788 [1:39:31<00:23, 2.02it/s] 100%|█████████▉| 11742/11788 [1:39:31<00:22, 2.02it/s] 100%|█████████▉| 11743/11788 [1:39:32<00:22, 2.02it/s] 100%|█████████▉| 11744/11788 [1:39:32<00:21, 2.02it/s] 100%|█████████▉| 11745/11788 [1:39:33<00:21, 2.02it/s] 100%|█████████▉| 11746/11788 [1:39:33<00:20, 2.02it/s] 100%|█████████▉| 11747/11788 [1:39:34<00:20, 2.02it/s] 100%|█████████▉| 11748/11788 [1:39:34<00:19, 2.02it/s] 100%|█████████▉| 11749/11788 [1:39:35<00:19, 2.02it/s] 100%|█████████▉| 11750/11788 [1:39:35<00:18, 2.02it/s]{'loss': 2.5636, 'grad_norm': 0.2561865746974945, 'learning_rate': 3.165581256331374e-08, 'epoch': 13.95} + 100%|█████████▉| 11750/11788 [1:39:35<00:18, 2.02it/s] 100%|█████████▉| 11751/11788 [1:39:36<00:18, 2.02it/s] 100%|█████████▉| 11752/11788 [1:39:36<00:17, 2.02it/s] 100%|█████████▉| 11753/11788 [1:39:37<00:17, 2.02it/s] 100%|█████████▉| 11754/11788 [1:39:37<00:16, 2.02it/s] 100%|█████████▉| 11755/11788 [1:39:38<00:16, 2.02it/s] 100%|█████████▉| 11756/11788 [1:39:38<00:15, 2.02it/s] 100%|█████████▉| 11757/11788 [1:39:39<00:15, 2.02it/s] 100%|█████████▉| 11758/11788 [1:39:39<00:14, 2.02it/s] 100%|█████████▉| 11759/11788 [1:39:40<00:14, 2.02it/s] 100%|█████████▉| 11760/11788 [1:39:40<00:13, 2.02it/s] 100%|█████████▉| 11761/11788 [1:39:41<00:13, 2.02it/s] 100%|█████████▉| 11762/11788 [1:39:41<00:12, 2.01it/s] 100%|█████████▉| 11763/11788 [1:39:42<00:12, 2.01it/s] 100%|█████████▉| 11764/11788 [1:39:42<00:11, 2.02it/s] 100%|█████████▉| 11765/11788 [1:39:43<00:11, 2.02it/s] 100%|█████████▉| 11766/11788 [1:39:43<00:10, 2.02it/s] 100%|█████████▉| 11767/11788 [1:39:44<00:10, 2.02it/s] 100%|█████████▉| 11768/11788 [1:39:44<00:09, 2.02it/s] 100%|█████████▉| 11769/11788 [1:39:45<00:09, 2.02it/s] 100%|█████████▉| 11770/11788 [1:39:45<00:08, 2.02it/s] 100%|█████████▉| 11771/11788 [1:39:46<00:08, 2.02it/s] 100%|█████████▉| 11772/11788 [1:39:46<00:07, 2.02it/s] 100%|█████████▉| 11773/11788 [1:39:47<00:07, 2.02it/s] 100%|█████████▉| 11774/11788 [1:39:47<00:06, 2.02it/s] 100%|█████████▉| 11775/11788 [1:39:48<00:06, 2.02it/s]{'loss': 2.5378, 'grad_norm': 0.259833961725235, 'learning_rate': 3.704904548706978e-09, 'epoch': 13.98} + 100%|█████████▉| 11775/11788 [1:39:48<00:06, 2.02it/s] 100%|█████████▉| 11776/11788 [1:39:48<00:05, 2.02it/s] 100%|█████████▉| 11777/11788 [1:39:49<00:05, 2.02it/s] 100%|█████████▉| 11778/11788 [1:39:49<00:04, 2.02it/s] 100%|█████████▉| 11779/11788 [1:39:50<00:04, 2.02it/s] 100%|█████████▉| 11780/11788 [1:39:50<00:03, 2.02it/s] 100%|█████████▉| 11781/11788 [1:39:51<00:03, 2.02it/s] 100%|█████████▉| 11782/11788 [1:39:51<00:02, 2.02it/s] 100%|█████████▉| 11783/11788 [1:39:52<00:02, 2.02it/s] 100%|█████████▉| 11784/11788 [1:39:52<00:01, 2.02it/s] 100%|█████████▉| 11785/11788 [1:39:53<00:01, 2.02it/s] 100%|█████████▉| 11786/11788 [1:39:53<00:00, 2.02it/s] 100%|█████████▉| 11787/11788 [1:39:54<00:00, 2.02it/s] 100%|██████████| 11788/11788 [1:39:54<00:00, 2.04it/s]{'train_runtime': 6006.0595, 'train_samples_per_second': 2009.05, 'train_steps_per_second': 1.963, 'train_loss': 3.164302644383353, 'epoch': 14.0} + 100%|██████████| 11788/11788 [1:40:05<00:00, 2.04it/s] 100%|██████████| 11788/11788 [1:40:06<00:00, 1.96it/s] +Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.