diff --git "a/checkpoint-2109/trainer_state.json" "b/checkpoint-2109/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2109/trainer_state.json" @@ -0,0 +1,13513 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 20, + "global_step": 2109, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 5.8403, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4e-05, + "loss": 5.8714, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 6e-05, + "loss": 5.7358, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 8e-05, + "loss": 5.368, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 4.4602, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.00012, + "loss": 3.3684, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 0.00014, + "loss": 2.4129, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 0.00016, + "loss": 1.4315, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 0.00018, + "loss": 0.529, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 0.2216, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001999998879930964, + "loss": 0.2577, + "step": 11 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999955197263648, + "loss": 0.1995, + "step": 12 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999899193937299, + "loss": 0.0976, + "step": 13 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999820789456046, + "loss": 0.2842, + "step": 14 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999719983995528, + "loss": 0.121, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999596777781563, + "loss": 0.0472, + "step": 16 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999451171090149, + "loss": 0.0736, + "step": 17 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999283164247466, + "loss": 0.1005, + "step": 18 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999092757629872, + "loss": 0.087, + "step": 19 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019998879951663902, + "loss": 0.0682, + "step": 20 + }, + { + "epoch": 0.03, + "eval_loss": 0.09823722392320633, + "eval_runtime": 23.24, + "eval_samples_per_second": 43.029, + "eval_steps_per_second": 10.757, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019998644746826275, + "loss": 0.1509, + "step": 21 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001999838714364388, + "loss": 0.0885, + "step": 22 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019998107142693786, + "loss": 0.0734, + "step": 23 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019997804744603225, + "loss": 0.0817, + "step": 24 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019997479950049622, + "loss": 0.0609, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019997132759760558, + "loss": 0.0923, + "step": 26 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019996763174513787, + "loss": 0.1019, + "step": 27 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019996371195137226, + "loss": 0.0931, + "step": 28 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999595682250897, + "loss": 0.073, + "step": 29 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019995520057557262, + "loss": 0.0877, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019995060901260527, + "loss": 0.0846, + "step": 31 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999457935464733, + "loss": 0.0853, + "step": 32 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999407541879641, + "loss": 0.0783, + "step": 33 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019993549094836642, + "loss": 0.0503, + "step": 34 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019993000383947073, + "loss": 0.1231, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999242928735689, + "loss": 0.0707, + "step": 36 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019991835806345423, + "loss": 0.0699, + "step": 37 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019991219942242156, + "loss": 0.0747, + "step": 38 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001999058169642671, + "loss": 0.0722, + "step": 39 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019989921070328844, + "loss": 0.0895, + "step": 40 + }, + { + "epoch": 0.06, + "eval_loss": 0.07923433184623718, + "eval_runtime": 23.2627, + "eval_samples_per_second": 42.987, + "eval_steps_per_second": 10.747, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019989238065428445, + "loss": 0.0726, + "step": 41 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019988532683255547, + "loss": 0.0697, + "step": 42 + }, + { + "epoch": 0.06, + "learning_rate": 0.000199878049253903, + "loss": 0.0525, + "step": 43 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001998705479346298, + "loss": 0.101, + "step": 44 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001998628228915399, + "loss": 0.0805, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019985487414193845, + "loss": 0.055, + "step": 46 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019984670170363172, + "loss": 0.0556, + "step": 47 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001998383055949271, + "loss": 0.0735, + "step": 48 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001998296858346331, + "loss": 0.0539, + "step": 49 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019982084244205909, + "loss": 0.0525, + "step": 50 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019981177543701556, + "loss": 0.0426, + "step": 51 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019980248483981376, + "loss": 0.0343, + "step": 52 + }, + { + "epoch": 0.08, + "learning_rate": 0.000199792970671266, + "loss": 0.0268, + "step": 53 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019978323295268533, + "loss": 0.0287, + "step": 54 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001997732717058855, + "loss": 0.024, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001997630869531812, + "loss": 0.0164, + "step": 56 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019975267871738756, + "loss": 0.0272, + "step": 57 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019974204702182056, + "loss": 0.0233, + "step": 58 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001997311918902966, + "loss": 0.0211, + "step": 59 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001997201133471327, + "loss": 0.015, + "step": 60 + }, + { + "epoch": 0.09, + "eval_loss": 0.040460266172885895, + "eval_runtime": 23.2455, + "eval_samples_per_second": 43.019, + "eval_steps_per_second": 10.755, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019970881141714636, + "loss": 0.0275, + "step": 61 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001996972861256554, + "loss": 0.0264, + "step": 62 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019968553749847808, + "loss": 0.0314, + "step": 63 + }, + { + "epoch": 0.09, + "learning_rate": 0.000199673565561933, + "loss": 0.033, + "step": 64 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001996613703428389, + "loss": 0.0379, + "step": 65 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019964895186851475, + "loss": 0.0156, + "step": 66 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019963631016677968, + "loss": 0.0147, + "step": 67 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019962344526595283, + "loss": 0.0255, + "step": 68 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019961035719485336, + "loss": 0.0269, + "step": 69 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019959704598280034, + "loss": 0.0273, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019958351165961274, + "loss": 0.0342, + "step": 71 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001995697542556093, + "loss": 0.0194, + "step": 72 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019955577380160853, + "loss": 0.0256, + "step": 73 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019954157032892855, + "loss": 0.0126, + "step": 74 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001995271438693871, + "loss": 0.0141, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019951249445530146, + "loss": 0.0283, + "step": 76 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019949762211948833, + "loss": 0.0486, + "step": 77 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019948252689526381, + "loss": 0.0211, + "step": 78 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019946720881644324, + "loss": 0.0336, + "step": 79 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019945166791734132, + "loss": 0.0376, + "step": 80 + }, + { + "epoch": 0.11, + "eval_loss": 0.035690754652023315, + "eval_runtime": 23.2365, + "eval_samples_per_second": 43.036, + "eval_steps_per_second": 10.759, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001994359042327717, + "loss": 0.0414, + "step": 81 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019941991779804727, + "loss": 0.0199, + "step": 82 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019940370864897987, + "loss": 0.0214, + "step": 83 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001993872768218802, + "loss": 0.0112, + "step": 84 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001993706223535578, + "loss": 0.0512, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019935374528132105, + "loss": 0.0236, + "step": 86 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019933664564297687, + "loss": 0.0254, + "step": 87 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001993193234768308, + "loss": 0.0257, + "step": 88 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019930177882168692, + "loss": 0.0311, + "step": 89 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019928401171684766, + "loss": 0.0294, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001992660222021138, + "loss": 0.0207, + "step": 91 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001992478103177843, + "loss": 0.0151, + "step": 92 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019922937610465638, + "loss": 0.0315, + "step": 93 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019921071960402512, + "loss": 0.0228, + "step": 94 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001991918408576837, + "loss": 0.0126, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019917273990792316, + "loss": 0.0126, + "step": 96 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019915341679753218, + "loss": 0.0282, + "step": 97 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019913387156979728, + "loss": 0.0155, + "step": 98 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019911410426850243, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001990941149379291, + "loss": 0.0196, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 0.03417763113975525, + "eval_runtime": 23.1746, + "eval_samples_per_second": 43.151, + "eval_steps_per_second": 10.788, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019907390362285617, + "loss": 0.0175, + "step": 101 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019905347036855978, + "loss": 0.0185, + "step": 102 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019903281522081322, + "loss": 0.0159, + "step": 103 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001990119382258869, + "loss": 0.0164, + "step": 104 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019899083943054815, + "loss": 0.0224, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001989695188820612, + "loss": 0.0211, + "step": 106 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019894797662818703, + "loss": 0.0525, + "step": 107 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001989262127171832, + "loss": 0.0146, + "step": 108 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019890422719780396, + "loss": 0.0181, + "step": 109 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001988820201192999, + "loss": 0.0141, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019885959153141783, + "loss": 0.0185, + "step": 111 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019883694148440105, + "loss": 0.0297, + "step": 112 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019881407002898867, + "loss": 0.0169, + "step": 113 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019879097721641595, + "loss": 0.0241, + "step": 114 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019876766309841396, + "loss": 0.0136, + "step": 115 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001987441277272096, + "loss": 0.0223, + "step": 116 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001987203711555253, + "loss": 0.0186, + "step": 117 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019869639343657907, + "loss": 0.0145, + "step": 118 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019867219462408432, + "loss": 0.0164, + "step": 119 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001986477747722497, + "loss": 0.0219, + "step": 120 + }, + { + "epoch": 0.17, + "eval_loss": 0.0333634614944458, + "eval_runtime": 23.4378, + "eval_samples_per_second": 42.666, + "eval_steps_per_second": 10.667, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001986231339357791, + "loss": 0.0233, + "step": 121 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019859827216987136, + "loss": 0.0269, + "step": 122 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019857318953022028, + "loss": 0.0201, + "step": 123 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019854788607301445, + "loss": 0.0241, + "step": 124 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019852236185493707, + "loss": 0.0147, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019849661693316595, + "loss": 0.0257, + "step": 126 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019847065136537325, + "loss": 0.0144, + "step": 127 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019844446520972544, + "loss": 0.0214, + "step": 128 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019841805852488308, + "loss": 0.0156, + "step": 129 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019839143137000088, + "loss": 0.0228, + "step": 130 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019836458380472724, + "loss": 0.0257, + "step": 131 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019833751588920452, + "loss": 0.022, + "step": 132 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019831022768406845, + "loss": 0.0202, + "step": 133 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019828271925044852, + "loss": 0.0118, + "step": 134 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019825499064996733, + "loss": 0.0279, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019822704194474082, + "loss": 0.0207, + "step": 136 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019819887319737795, + "loss": 0.0272, + "step": 137 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019817048447098054, + "loss": 0.0112, + "step": 138 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001981418758291433, + "loss": 0.0255, + "step": 139 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019811304733595355, + "loss": 0.0188, + "step": 140 + }, + { + "epoch": 0.2, + "eval_loss": 0.031686607748270035, + "eval_runtime": 23.3537, + "eval_samples_per_second": 42.82, + "eval_steps_per_second": 10.705, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019808399905599111, + "loss": 0.0224, + "step": 141 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019805473105432805, + "loss": 0.0155, + "step": 142 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019802524339652885, + "loss": 0.0163, + "step": 143 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019799553614864985, + "loss": 0.0272, + "step": 144 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019796560937723946, + "loss": 0.0145, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001979354631493377, + "loss": 0.0232, + "step": 146 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019790509753247633, + "loss": 0.0182, + "step": 147 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019787451259467852, + "loss": 0.0241, + "step": 148 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019784370840445875, + "loss": 0.0237, + "step": 149 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019781268503082263, + "loss": 0.0197, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019778144254326683, + "loss": 0.009, + "step": 151 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019774998101177886, + "loss": 0.0216, + "step": 152 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019771830050683686, + "loss": 0.0155, + "step": 153 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019768640109940955, + "loss": 0.0226, + "step": 154 + }, + { + "epoch": 0.22, + "learning_rate": 0.000197654282860956, + "loss": 0.0207, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019762194586342546, + "loss": 0.0229, + "step": 156 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019758939017925736, + "loss": 0.021, + "step": 157 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019755661588138085, + "loss": 0.0213, + "step": 158 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019752362304321493, + "loss": 0.0099, + "step": 159 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019749041173866807, + "loss": 0.0147, + "step": 160 + }, + { + "epoch": 0.23, + "eval_loss": 0.036477215588092804, + "eval_runtime": 23.6345, + "eval_samples_per_second": 42.311, + "eval_steps_per_second": 10.578, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019745698204213822, + "loss": 0.0183, + "step": 161 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019742333402851246, + "loss": 0.0124, + "step": 162 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019738946777316706, + "loss": 0.0066, + "step": 163 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019735538335196706, + "loss": 0.0175, + "step": 164 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019732108084126625, + "loss": 0.0215, + "step": 165 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019728656031790706, + "loss": 0.026, + "step": 166 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019725182185922018, + "loss": 0.0094, + "step": 167 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019721686554302457, + "loss": 0.0301, + "step": 168 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019718169144762718, + "loss": 0.0131, + "step": 169 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019714629965182282, + "loss": 0.0206, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019711069023489408, + "loss": 0.0147, + "step": 171 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019707486327661094, + "loss": 0.0219, + "step": 172 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001970388188572307, + "loss": 0.0253, + "step": 173 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019700255705749786, + "loss": 0.0188, + "step": 174 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019696607795864382, + "loss": 0.0149, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001969293816423869, + "loss": 0.0211, + "step": 176 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001968924681909318, + "loss": 0.015, + "step": 177 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019685533768696982, + "loss": 0.0201, + "step": 178 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019681799021367837, + "loss": 0.0253, + "step": 179 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019678042585472098, + "loss": 0.0224, + "step": 180 + }, + { + "epoch": 0.26, + "eval_loss": 0.0387922078371048, + "eval_runtime": 23.0013, + "eval_samples_per_second": 43.476, + "eval_steps_per_second": 10.869, + "step": 180 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019674264469424698, + "loss": 0.0069, + "step": 181 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019670464681689144, + "loss": 0.0082, + "step": 182 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019666643230777475, + "loss": 0.0199, + "step": 183 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019662800125250276, + "loss": 0.0101, + "step": 184 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019658935373716632, + "loss": 0.0157, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019655048984834118, + "loss": 0.0268, + "step": 186 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001965114096730879, + "loss": 0.0135, + "step": 187 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019647211329895136, + "loss": 0.0144, + "step": 188 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019643260081396094, + "loss": 0.0116, + "step": 189 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019639287230663004, + "loss": 0.0191, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019635292786595598, + "loss": 0.0216, + "step": 191 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019631276758141987, + "loss": 0.0114, + "step": 192 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019627239154298623, + "loss": 0.0367, + "step": 193 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019623179984110296, + "loss": 0.0071, + "step": 194 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019619099256670114, + "loss": 0.0171, + "step": 195 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019614996981119468, + "loss": 0.0124, + "step": 196 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001961087316664802, + "loss": 0.024, + "step": 197 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019606727822493687, + "loss": 0.0121, + "step": 198 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019602560957942606, + "loss": 0.0079, + "step": 199 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019598372582329133, + "loss": 0.0116, + "step": 200 + }, + { + "epoch": 0.28, + "eval_loss": 0.05043657124042511, + "eval_runtime": 23.2026, + "eval_samples_per_second": 43.099, + "eval_steps_per_second": 10.775, + "step": 200 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019594162705035808, + "loss": 0.0163, + "step": 201 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019589931335493334, + "loss": 0.0158, + "step": 202 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001958567848318057, + "loss": 0.0055, + "step": 203 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019581404157624483, + "loss": 0.011, + "step": 204 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019577108368400162, + "loss": 0.0075, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001957279112513076, + "loss": 0.0219, + "step": 206 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019568452437487504, + "loss": 0.0054, + "step": 207 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019564092315189647, + "loss": 0.0111, + "step": 208 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019559710768004472, + "loss": 0.0115, + "step": 209 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019555307805747246, + "loss": 0.0114, + "step": 210 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001955088343828121, + "loss": 0.0055, + "step": 211 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019546437675517563, + "loss": 0.0084, + "step": 212 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019541970527415424, + "loss": 0.0189, + "step": 213 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019537482003981825, + "loss": 0.0172, + "step": 214 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019532972115271675, + "loss": 0.0054, + "step": 215 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019528440871387746, + "loss": 0.0149, + "step": 216 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019523888282480656, + "loss": 0.0264, + "step": 217 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019519314358748825, + "loss": 0.0094, + "step": 218 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019514719110438482, + "loss": 0.0221, + "step": 219 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001951010254784361, + "loss": 0.0158, + "step": 220 + }, + { + "epoch": 0.31, + "eval_loss": 0.06923255324363708, + "eval_runtime": 23.5744, + "eval_samples_per_second": 42.419, + "eval_steps_per_second": 10.605, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019505464681305953, + "loss": 0.0207, + "step": 221 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001950080552121497, + "loss": 0.0301, + "step": 222 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019496125078007816, + "loss": 0.021, + "step": 223 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001949142336216934, + "loss": 0.0166, + "step": 224 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019486700384232033, + "loss": 0.0037, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019481956154776016, + "loss": 0.0083, + "step": 226 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019477190684429014, + "loss": 0.0085, + "step": 227 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019472403983866344, + "loss": 0.0103, + "step": 228 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019467596063810872, + "loss": 0.0169, + "step": 229 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001946276693503301, + "loss": 0.008, + "step": 230 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019457916608350666, + "loss": 0.0243, + "step": 231 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019453045094629246, + "loss": 0.0211, + "step": 232 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019448152404781608, + "loss": 0.0148, + "step": 233 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019443238549768057, + "loss": 0.0178, + "step": 234 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019438303540596309, + "loss": 0.0132, + "step": 235 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019433347388321458, + "loss": 0.0207, + "step": 236 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019428370104045977, + "loss": 0.0058, + "step": 237 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019423371698919665, + "loss": 0.0182, + "step": 238 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001941835218413964, + "loss": 0.0164, + "step": 239 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019413311570950316, + "loss": 0.0193, + "step": 240 + }, + { + "epoch": 0.34, + "eval_loss": 0.04074312746524811, + "eval_runtime": 23.6367, + "eval_samples_per_second": 42.307, + "eval_steps_per_second": 10.577, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019408249870643353, + "loss": 0.0135, + "step": 241 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001940316709455766, + "loss": 0.0185, + "step": 242 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019398063254079362, + "loss": 0.0125, + "step": 243 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019392938360641765, + "loss": 0.0099, + "step": 244 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019387792425725332, + "loss": 0.0124, + "step": 245 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019382625460857676, + "loss": 0.0116, + "step": 246 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019377437477613506, + "loss": 0.0067, + "step": 247 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019372228487614623, + "loss": 0.0135, + "step": 248 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019366998502529884, + "loss": 0.0096, + "step": 249 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019361747534075177, + "loss": 0.0258, + "step": 250 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019356475594013397, + "loss": 0.0061, + "step": 251 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019351182694154418, + "loss": 0.0065, + "step": 252 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019345868846355063, + "loss": 0.0167, + "step": 253 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019340534062519088, + "loss": 0.009, + "step": 254 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019335178354597146, + "loss": 0.0079, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001932980173458676, + "loss": 0.028, + "step": 256 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019324404214532303, + "loss": 0.0204, + "step": 257 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019318985806524966, + "loss": 0.0174, + "step": 258 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019313546522702727, + "loss": 0.017, + "step": 259 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019308086375250335, + "loss": 0.0181, + "step": 260 + }, + { + "epoch": 0.37, + "eval_loss": 0.044319622218608856, + "eval_runtime": 23.1398, + "eval_samples_per_second": 43.216, + "eval_steps_per_second": 10.804, + "step": 260 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019302605376399277, + "loss": 0.0072, + "step": 261 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019297103538427744, + "loss": 0.0227, + "step": 262 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019291580873660613, + "loss": 0.0113, + "step": 263 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019286037394469413, + "loss": 0.0084, + "step": 264 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001928047311327231, + "loss": 0.0131, + "step": 265 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019274888042534056, + "loss": 0.0124, + "step": 266 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019269282194765984, + "loss": 0.011, + "step": 267 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019263655582525964, + "loss": 0.0062, + "step": 268 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019258008218418383, + "loss": 0.012, + "step": 269 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019252340115094124, + "loss": 0.011, + "step": 270 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019246651285250514, + "loss": 0.0039, + "step": 271 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019240941741631323, + "loss": 0.0261, + "step": 272 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019235211497026708, + "loss": 0.0098, + "step": 273 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019229460564273217, + "loss": 0.023, + "step": 274 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019223688956253727, + "loss": 0.0198, + "step": 275 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019217896685897444, + "loss": 0.0037, + "step": 276 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019212083766179846, + "loss": 0.0067, + "step": 277 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019206250210122684, + "loss": 0.0131, + "step": 278 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019200396030793918, + "loss": 0.0087, + "step": 279 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019194521241307726, + "loss": 0.0124, + "step": 280 + }, + { + "epoch": 0.4, + "eval_loss": 0.048221979290246964, + "eval_runtime": 23.4398, + "eval_samples_per_second": 42.663, + "eval_steps_per_second": 10.666, + "step": 280 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019188625854824447, + "loss": 0.009, + "step": 281 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019182709884550558, + "loss": 0.0134, + "step": 282 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019176773343738653, + "loss": 0.012, + "step": 283 + }, + { + "epoch": 0.4, + "learning_rate": 0.000191708162456874, + "loss": 0.0084, + "step": 284 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001916483860374152, + "loss": 0.008, + "step": 285 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019158840431291761, + "loss": 0.0069, + "step": 286 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001915282174177485, + "loss": 0.0065, + "step": 287 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001914678254867349, + "loss": 0.0161, + "step": 288 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019140722865516305, + "loss": 0.0156, + "step": 289 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001913464270587782, + "loss": 0.0127, + "step": 290 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019128542083378435, + "loss": 0.003, + "step": 291 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019122421011684386, + "loss": 0.0114, + "step": 292 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019116279504507717, + "loss": 0.0057, + "step": 293 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019110117575606253, + "loss": 0.0053, + "step": 294 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019103935238783563, + "loss": 0.0077, + "step": 295 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019097732507888942, + "loss": 0.0173, + "step": 296 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019091509396817357, + "loss": 0.0117, + "step": 297 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001908526591950944, + "loss": 0.0152, + "step": 298 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001907900208995144, + "loss": 0.0162, + "step": 299 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019072717922175201, + "loss": 0.0094, + "step": 300 + }, + { + "epoch": 0.43, + "eval_loss": 0.054927486926317215, + "eval_runtime": 23.3503, + "eval_samples_per_second": 42.826, + "eval_steps_per_second": 10.707, + "step": 300 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019066413430258127, + "loss": 0.0107, + "step": 301 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019060088628323145, + "loss": 0.0178, + "step": 302 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019053743530538693, + "loss": 0.0031, + "step": 303 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019047378151118663, + "loss": 0.0072, + "step": 304 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019040992504322382, + "loss": 0.0185, + "step": 305 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001903458660445458, + "loss": 0.0125, + "step": 306 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019028160465865362, + "loss": 0.0047, + "step": 307 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001902171410295016, + "loss": 0.0325, + "step": 308 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001901524753014972, + "loss": 0.011, + "step": 309 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019008760761950058, + "loss": 0.003, + "step": 310 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019002253812882427, + "loss": 0.0067, + "step": 311 + }, + { + "epoch": 0.44, + "learning_rate": 0.00018995726697523297, + "loss": 0.0092, + "step": 312 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018989179430494303, + "loss": 0.0224, + "step": 313 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001898261202646223, + "loss": 0.0255, + "step": 314 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001897602450013897, + "loss": 0.0097, + "step": 315 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018969416866281485, + "loss": 0.0074, + "step": 316 + }, + { + "epoch": 0.45, + "learning_rate": 0.000189627891396918, + "loss": 0.0121, + "step": 317 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018956141335216926, + "loss": 0.0123, + "step": 318 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018949473467748867, + "loss": 0.0042, + "step": 319 + }, + { + "epoch": 0.46, + "learning_rate": 0.00018942785552224564, + "loss": 0.0081, + "step": 320 + }, + { + "epoch": 0.46, + "eval_loss": 0.03408419340848923, + "eval_runtime": 23.1715, + "eval_samples_per_second": 43.156, + "eval_steps_per_second": 10.789, + "step": 320 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001893607760362588, + "loss": 0.0142, + "step": 321 + }, + { + "epoch": 0.46, + "learning_rate": 0.00018929349636979536, + "loss": 0.0097, + "step": 322 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001892260166735711, + "loss": 0.0164, + "step": 323 + }, + { + "epoch": 0.46, + "learning_rate": 0.00018915833709874988, + "loss": 0.0148, + "step": 324 + }, + { + "epoch": 0.46, + "learning_rate": 0.00018909045779694325, + "loss": 0.0212, + "step": 325 + }, + { + "epoch": 0.46, + "learning_rate": 0.00018902237892021024, + "loss": 0.0203, + "step": 326 + }, + { + "epoch": 0.47, + "learning_rate": 0.00018895410062105694, + "loss": 0.0116, + "step": 327 + }, + { + "epoch": 0.47, + "learning_rate": 0.00018888562305243616, + "loss": 0.0169, + "step": 328 + }, + { + "epoch": 0.47, + "learning_rate": 0.00018881694636774712, + "loss": 0.0192, + "step": 329 + }, + { + "epoch": 0.47, + "learning_rate": 0.00018874807072083503, + "loss": 0.0079, + "step": 330 + }, + { + "epoch": 0.47, + "learning_rate": 0.00018867899626599094, + "loss": 0.0138, + "step": 331 + }, + { + "epoch": 0.47, + "learning_rate": 0.00018860972315795107, + "loss": 0.0148, + "step": 332 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001885402515518968, + "loss": 0.0146, + "step": 333 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001884705816034541, + "loss": 0.0094, + "step": 334 + }, + { + "epoch": 0.48, + "learning_rate": 0.00018840071346869328, + "loss": 0.023, + "step": 335 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001883306473041286, + "loss": 0.0092, + "step": 336 + }, + { + "epoch": 0.48, + "learning_rate": 0.00018826038326671797, + "loss": 0.0192, + "step": 337 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001881899215138625, + "loss": 0.0116, + "step": 338 + }, + { + "epoch": 0.48, + "learning_rate": 0.00018811926220340628, + "loss": 0.0245, + "step": 339 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001880484054936359, + "loss": 0.0188, + "step": 340 + }, + { + "epoch": 0.48, + "eval_loss": 0.0401308573782444, + "eval_runtime": 23.3351, + "eval_samples_per_second": 42.854, + "eval_steps_per_second": 10.713, + "step": 340 + }, + { + "epoch": 0.49, + "learning_rate": 0.00018797735154328014, + "loss": 0.0094, + "step": 341 + }, + { + "epoch": 0.49, + "learning_rate": 0.00018790610051150973, + "loss": 0.0048, + "step": 342 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001878346525579368, + "loss": 0.0168, + "step": 343 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001877630078426146, + "loss": 0.0134, + "step": 344 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001876911665260372, + "loss": 0.0209, + "step": 345 + }, + { + "epoch": 0.49, + "learning_rate": 0.00018761912876913908, + "loss": 0.0158, + "step": 346 + }, + { + "epoch": 0.49, + "learning_rate": 0.00018754689473329475, + "loss": 0.0085, + "step": 347 + }, + { + "epoch": 0.5, + "learning_rate": 0.00018747446458031842, + "loss": 0.0014, + "step": 348 + }, + { + "epoch": 0.5, + "learning_rate": 0.00018740183847246362, + "loss": 0.0198, + "step": 349 + }, + { + "epoch": 0.5, + "learning_rate": 0.00018732901657242287, + "loss": 0.0301, + "step": 350 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001872559990433273, + "loss": 0.0069, + "step": 351 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001871827860487463, + "loss": 0.0082, + "step": 352 + }, + { + "epoch": 0.5, + "learning_rate": 0.00018710937775268696, + "loss": 0.0092, + "step": 353 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001870357743195941, + "loss": 0.0065, + "step": 354 + }, + { + "epoch": 0.5, + "learning_rate": 0.00018696197591434955, + "loss": 0.015, + "step": 355 + }, + { + "epoch": 0.51, + "learning_rate": 0.00018688798270227188, + "loss": 0.013, + "step": 356 + }, + { + "epoch": 0.51, + "learning_rate": 0.00018681379484911616, + "loss": 0.0238, + "step": 357 + }, + { + "epoch": 0.51, + "learning_rate": 0.00018673941252107343, + "loss": 0.0123, + "step": 358 + }, + { + "epoch": 0.51, + "learning_rate": 0.00018666483588477032, + "loss": 0.0067, + "step": 359 + }, + { + "epoch": 0.51, + "learning_rate": 0.00018659006510726887, + "loss": 0.021, + "step": 360 + }, + { + "epoch": 0.51, + "eval_loss": 0.050777580589056015, + "eval_runtime": 23.3931, + "eval_samples_per_second": 42.748, + "eval_steps_per_second": 10.687, + "step": 360 + }, + { + "epoch": 0.51, + "learning_rate": 0.00018651510035606585, + "loss": 0.0186, + "step": 361 + }, + { + "epoch": 0.51, + "learning_rate": 0.00018643994179909276, + "loss": 0.0159, + "step": 362 + }, + { + "epoch": 0.52, + "learning_rate": 0.00018636458960471505, + "loss": 0.0146, + "step": 363 + }, + { + "epoch": 0.52, + "learning_rate": 0.00018628904394173205, + "loss": 0.0193, + "step": 364 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001862133049793765, + "loss": 0.0156, + "step": 365 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001861373728873142, + "loss": 0.0079, + "step": 366 + }, + { + "epoch": 0.52, + "learning_rate": 0.00018606124783564337, + "loss": 0.0092, + "step": 367 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001859849299948948, + "loss": 0.0141, + "step": 368 + }, + { + "epoch": 0.52, + "learning_rate": 0.00018590841953603087, + "loss": 0.0108, + "step": 369 + }, + { + "epoch": 0.53, + "learning_rate": 0.00018583171663044565, + "loss": 0.0352, + "step": 370 + }, + { + "epoch": 0.53, + "learning_rate": 0.00018575482144996417, + "loss": 0.013, + "step": 371 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001856777341668423, + "loss": 0.0087, + "step": 372 + }, + { + "epoch": 0.53, + "learning_rate": 0.00018560045495376616, + "loss": 0.0135, + "step": 373 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001855229839838519, + "loss": 0.0073, + "step": 374 + }, + { + "epoch": 0.53, + "learning_rate": 0.00018544532143064516, + "loss": 0.0135, + "step": 375 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001853674674681208, + "loss": 0.0109, + "step": 376 + }, + { + "epoch": 0.54, + "learning_rate": 0.00018528942227068247, + "loss": 0.0123, + "step": 377 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001852111860131621, + "loss": 0.0101, + "step": 378 + }, + { + "epoch": 0.54, + "learning_rate": 0.00018513275887081977, + "loss": 0.0153, + "step": 379 + }, + { + "epoch": 0.54, + "learning_rate": 0.00018505414101934316, + "loss": 0.0125, + "step": 380 + }, + { + "epoch": 0.54, + "eval_loss": 0.040892839431762695, + "eval_runtime": 23.0536, + "eval_samples_per_second": 43.377, + "eval_steps_per_second": 10.844, + "step": 380 + }, + { + "epoch": 0.54, + "learning_rate": 0.00018497533263484698, + "loss": 0.007, + "step": 381 + }, + { + "epoch": 0.54, + "learning_rate": 0.00018489633389387299, + "loss": 0.0122, + "step": 382 + }, + { + "epoch": 0.54, + "learning_rate": 0.00018481714497338927, + "loss": 0.0091, + "step": 383 + }, + { + "epoch": 0.55, + "learning_rate": 0.00018473776605078992, + "loss": 0.0086, + "step": 384 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001846581973038947, + "loss": 0.0134, + "step": 385 + }, + { + "epoch": 0.55, + "learning_rate": 0.00018457843891094851, + "loss": 0.0088, + "step": 386 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001844984910506213, + "loss": 0.005, + "step": 387 + }, + { + "epoch": 0.55, + "learning_rate": 0.00018441835390200722, + "loss": 0.0081, + "step": 388 + }, + { + "epoch": 0.55, + "learning_rate": 0.00018433802764462455, + "loss": 0.0123, + "step": 389 + }, + { + "epoch": 0.55, + "learning_rate": 0.00018425751245841526, + "loss": 0.0101, + "step": 390 + }, + { + "epoch": 0.56, + "learning_rate": 0.00018417680852374438, + "loss": 0.0148, + "step": 391 + }, + { + "epoch": 0.56, + "learning_rate": 0.00018409591602139996, + "loss": 0.0067, + "step": 392 + }, + { + "epoch": 0.56, + "learning_rate": 0.00018401483513259237, + "loss": 0.0145, + "step": 393 + }, + { + "epoch": 0.56, + "learning_rate": 0.00018393356603895396, + "loss": 0.0181, + "step": 394 + }, + { + "epoch": 0.56, + "learning_rate": 0.00018385210892253871, + "loss": 0.0094, + "step": 395 + }, + { + "epoch": 0.56, + "learning_rate": 0.00018377046396582185, + "loss": 0.0212, + "step": 396 + }, + { + "epoch": 0.56, + "learning_rate": 0.00018368863135169932, + "loss": 0.0062, + "step": 397 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001836066112634875, + "loss": 0.0176, + "step": 398 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001835244038849227, + "loss": 0.0144, + "step": 399 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001834420094001608, + "loss": 0.0071, + "step": 400 + }, + { + "epoch": 0.57, + "eval_loss": 0.04236825183033943, + "eval_runtime": 23.1651, + "eval_samples_per_second": 43.168, + "eval_steps_per_second": 10.792, + "step": 400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00018335942799377678, + "loss": 0.0126, + "step": 401 + }, + { + "epoch": 0.57, + "learning_rate": 0.00018327665985076448, + "loss": 0.0192, + "step": 402 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001831937051565359, + "loss": 0.0266, + "step": 403 + }, + { + "epoch": 0.57, + "learning_rate": 0.00018311056409692106, + "loss": 0.0056, + "step": 404 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018302723685816735, + "loss": 0.0069, + "step": 405 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018294372362693935, + "loss": 0.0046, + "step": 406 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001828600245903182, + "loss": 0.01, + "step": 407 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018277613993580128, + "loss": 0.0124, + "step": 408 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018269206985130186, + "loss": 0.0037, + "step": 409 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018260781452514847, + "loss": 0.024, + "step": 410 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001825233741460847, + "loss": 0.0127, + "step": 411 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018243874890326865, + "loss": 0.0159, + "step": 412 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018235393898627256, + "loss": 0.0328, + "step": 413 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018226894458508235, + "loss": 0.0187, + "step": 414 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018218376589009723, + "loss": 0.0102, + "step": 415 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018209840309212923, + "loss": 0.0117, + "step": 416 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001820128563824028, + "loss": 0.0108, + "step": 417 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018192712595255434, + "loss": 0.0105, + "step": 418 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018184121199463191, + "loss": 0.0052, + "step": 419 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018175511470109462, + "loss": 0.0165, + "step": 420 + }, + { + "epoch": 0.6, + "eval_loss": 0.056613512337207794, + "eval_runtime": 23.1787, + "eval_samples_per_second": 43.143, + "eval_steps_per_second": 10.786, + "step": 420 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018166883426481227, + "loss": 0.0064, + "step": 421 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018158237087906496, + "loss": 0.0081, + "step": 422 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018149572473754268, + "loss": 0.0086, + "step": 423 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018140889603434466, + "loss": 0.0086, + "step": 424 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001813218849639792, + "loss": 0.012, + "step": 425 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018123469172136317, + "loss": 0.0061, + "step": 426 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001811473165018214, + "loss": 0.012, + "step": 427 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001810597595010865, + "loss": 0.0072, + "step": 428 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018097202091529822, + "loss": 0.0067, + "step": 429 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018088410094100309, + "loss": 0.0134, + "step": 430 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018079599977515397, + "loss": 0.0009, + "step": 431 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018070771761510973, + "loss": 0.0177, + "step": 432 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018061925465863448, + "loss": 0.0124, + "step": 433 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018053061110389758, + "loss": 0.0053, + "step": 434 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018044178714947276, + "loss": 0.0058, + "step": 435 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001803527829943379, + "loss": 0.0168, + "step": 436 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001802635988378747, + "loss": 0.0121, + "step": 437 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001801742348798679, + "loss": 0.0132, + "step": 438 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018008469132050516, + "loss": 0.003, + "step": 439 + }, + { + "epoch": 0.63, + "learning_rate": 0.00017999496836037637, + "loss": 0.0075, + "step": 440 + }, + { + "epoch": 0.63, + "eval_loss": 0.053666725754737854, + "eval_runtime": 23.2275, + "eval_samples_per_second": 43.052, + "eval_steps_per_second": 10.763, + "step": 440 + }, + { + "epoch": 0.63, + "learning_rate": 0.00017990506620047339, + "loss": 0.01, + "step": 441 + }, + { + "epoch": 0.63, + "learning_rate": 0.00017981498504218943, + "loss": 0.036, + "step": 442 + }, + { + "epoch": 0.63, + "learning_rate": 0.00017972472508731878, + "loss": 0.0158, + "step": 443 + }, + { + "epoch": 0.63, + "learning_rate": 0.00017963428653805614, + "loss": 0.0085, + "step": 444 + }, + { + "epoch": 0.63, + "learning_rate": 0.00017954366959699637, + "loss": 0.0137, + "step": 445 + }, + { + "epoch": 0.63, + "learning_rate": 0.00017945287446713393, + "loss": 0.0236, + "step": 446 + }, + { + "epoch": 0.64, + "learning_rate": 0.00017936190135186246, + "loss": 0.0163, + "step": 447 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001792707504549743, + "loss": 0.0066, + "step": 448 + }, + { + "epoch": 0.64, + "learning_rate": 0.00017917942198066, + "loss": 0.0067, + "step": 449 + }, + { + "epoch": 0.64, + "learning_rate": 0.00017908791613350803, + "loss": 0.0085, + "step": 450 + }, + { + "epoch": 0.64, + "learning_rate": 0.00017899623311850405, + "loss": 0.0193, + "step": 451 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001789043731410307, + "loss": 0.0127, + "step": 452 + }, + { + "epoch": 0.64, + "learning_rate": 0.00017881233640686705, + "loss": 0.0104, + "step": 453 + }, + { + "epoch": 0.65, + "learning_rate": 0.000178720123122188, + "loss": 0.0138, + "step": 454 + }, + { + "epoch": 0.65, + "learning_rate": 0.00017862773349356414, + "loss": 0.0056, + "step": 455 + }, + { + "epoch": 0.65, + "learning_rate": 0.00017853516772796093, + "loss": 0.0126, + "step": 456 + }, + { + "epoch": 0.65, + "learning_rate": 0.00017844242603273848, + "loss": 0.0055, + "step": 457 + }, + { + "epoch": 0.65, + "learning_rate": 0.00017834950861565102, + "loss": 0.0143, + "step": 458 + }, + { + "epoch": 0.65, + "learning_rate": 0.00017825641568484634, + "loss": 0.0047, + "step": 459 + }, + { + "epoch": 0.65, + "learning_rate": 0.00017816314744886552, + "loss": 0.0096, + "step": 460 + }, + { + "epoch": 0.65, + "eval_loss": 0.03381817042827606, + "eval_runtime": 23.3676, + "eval_samples_per_second": 42.794, + "eval_steps_per_second": 10.699, + "step": 460 + }, + { + "epoch": 0.66, + "learning_rate": 0.00017806970411664224, + "loss": 0.0041, + "step": 461 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001779760858975025, + "loss": 0.0116, + "step": 462 + }, + { + "epoch": 0.66, + "learning_rate": 0.00017788229300116402, + "loss": 0.0107, + "step": 463 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001777883256377358, + "loss": 0.0034, + "step": 464 + }, + { + "epoch": 0.66, + "learning_rate": 0.00017769418401771778, + "loss": 0.0104, + "step": 465 + }, + { + "epoch": 0.66, + "learning_rate": 0.00017759986835200016, + "loss": 0.012, + "step": 466 + }, + { + "epoch": 0.66, + "learning_rate": 0.00017750537885186302, + "loss": 0.0083, + "step": 467 + }, + { + "epoch": 0.67, + "learning_rate": 0.00017741071572897592, + "loss": 0.0076, + "step": 468 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001773158791953973, + "loss": 0.0176, + "step": 469 + }, + { + "epoch": 0.67, + "learning_rate": 0.00017722086946357415, + "loss": 0.0024, + "step": 470 + }, + { + "epoch": 0.67, + "learning_rate": 0.00017712568674634134, + "loss": 0.0112, + "step": 471 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001770303312569213, + "loss": 0.0172, + "step": 472 + }, + { + "epoch": 0.67, + "learning_rate": 0.00017693480320892348, + "loss": 0.0071, + "step": 473 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001768391028163439, + "loss": 0.0055, + "step": 474 + }, + { + "epoch": 0.68, + "learning_rate": 0.00017674323029356472, + "loss": 0.0022, + "step": 475 + }, + { + "epoch": 0.68, + "learning_rate": 0.00017664718585535353, + "loss": 0.0087, + "step": 476 + }, + { + "epoch": 0.68, + "learning_rate": 0.00017655096971686321, + "loss": 0.0158, + "step": 477 + }, + { + "epoch": 0.68, + "learning_rate": 0.00017645458209363115, + "loss": 0.0121, + "step": 478 + }, + { + "epoch": 0.68, + "learning_rate": 0.00017635802320157894, + "loss": 0.0146, + "step": 479 + }, + { + "epoch": 0.68, + "learning_rate": 0.00017626129325701184, + "loss": 0.012, + "step": 480 + }, + { + "epoch": 0.68, + "eval_loss": 0.048898741602897644, + "eval_runtime": 23.3056, + "eval_samples_per_second": 42.908, + "eval_steps_per_second": 10.727, + "step": 480 + }, + { + "epoch": 0.68, + "learning_rate": 0.00017616439247661826, + "loss": 0.0067, + "step": 481 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001760673210774694, + "loss": 0.0096, + "step": 482 + }, + { + "epoch": 0.69, + "learning_rate": 0.00017597007927701853, + "loss": 0.0056, + "step": 483 + }, + { + "epoch": 0.69, + "learning_rate": 0.00017587266729310067, + "loss": 0.0078, + "step": 484 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001757750853439322, + "loss": 0.0043, + "step": 485 + }, + { + "epoch": 0.69, + "learning_rate": 0.00017567733364811015, + "loss": 0.0075, + "step": 486 + }, + { + "epoch": 0.69, + "learning_rate": 0.00017557941242461178, + "loss": 0.0095, + "step": 487 + }, + { + "epoch": 0.69, + "learning_rate": 0.00017548132189279417, + "loss": 0.0083, + "step": 488 + }, + { + "epoch": 0.7, + "learning_rate": 0.00017538306227239363, + "loss": 0.0029, + "step": 489 + }, + { + "epoch": 0.7, + "learning_rate": 0.00017528463378352528, + "loss": 0.0038, + "step": 490 + }, + { + "epoch": 0.7, + "learning_rate": 0.00017518603664668257, + "loss": 0.005, + "step": 491 + }, + { + "epoch": 0.7, + "learning_rate": 0.00017508727108273665, + "loss": 0.0191, + "step": 492 + }, + { + "epoch": 0.7, + "learning_rate": 0.00017498833731293605, + "loss": 0.0092, + "step": 493 + }, + { + "epoch": 0.7, + "learning_rate": 0.00017488923555890605, + "loss": 0.0255, + "step": 494 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001747899660426483, + "loss": 0.0262, + "step": 495 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001746905289865402, + "loss": 0.0106, + "step": 496 + }, + { + "epoch": 0.71, + "learning_rate": 0.00017459092461333446, + "loss": 0.013, + "step": 497 + }, + { + "epoch": 0.71, + "learning_rate": 0.00017449115314615866, + "loss": 0.0077, + "step": 498 + }, + { + "epoch": 0.71, + "learning_rate": 0.00017439121480851465, + "loss": 0.0135, + "step": 499 + }, + { + "epoch": 0.71, + "learning_rate": 0.00017429110982427815, + "loss": 0.0041, + "step": 500 + }, + { + "epoch": 0.71, + "eval_loss": 0.044191259890794754, + "eval_runtime": 23.2484, + "eval_samples_per_second": 43.014, + "eval_steps_per_second": 10.753, + "step": 500 + }, + { + "epoch": 0.71, + "learning_rate": 0.00017419083841769804, + "loss": 0.0108, + "step": 501 + }, + { + "epoch": 0.71, + "learning_rate": 0.00017409040081339623, + "loss": 0.0149, + "step": 502 + }, + { + "epoch": 0.72, + "learning_rate": 0.00017398979723636676, + "loss": 0.0073, + "step": 503 + }, + { + "epoch": 0.72, + "learning_rate": 0.00017388902791197553, + "loss": 0.0121, + "step": 504 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001737880930659598, + "loss": 0.0131, + "step": 505 + }, + { + "epoch": 0.72, + "learning_rate": 0.00017368699292442748, + "loss": 0.0114, + "step": 506 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001735857277138569, + "loss": 0.0072, + "step": 507 + }, + { + "epoch": 0.72, + "learning_rate": 0.00017348429766109608, + "loss": 0.0047, + "step": 508 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001733827029933624, + "loss": 0.0033, + "step": 509 + }, + { + "epoch": 0.73, + "learning_rate": 0.00017328094393824186, + "loss": 0.0025, + "step": 510 + }, + { + "epoch": 0.73, + "learning_rate": 0.00017317902072368885, + "loss": 0.0051, + "step": 511 + }, + { + "epoch": 0.73, + "learning_rate": 0.00017307693357802544, + "loss": 0.0161, + "step": 512 + }, + { + "epoch": 0.73, + "learning_rate": 0.00017297468272994092, + "loss": 0.0056, + "step": 513 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001728722684084913, + "loss": 0.0166, + "step": 514 + }, + { + "epoch": 0.73, + "learning_rate": 0.00017276969084309882, + "loss": 0.0122, + "step": 515 + }, + { + "epoch": 0.73, + "learning_rate": 0.00017266695026355136, + "loss": 0.0028, + "step": 516 + }, + { + "epoch": 0.74, + "learning_rate": 0.00017256404690000205, + "loss": 0.0058, + "step": 517 + }, + { + "epoch": 0.74, + "learning_rate": 0.00017246098098296862, + "loss": 0.0174, + "step": 518 + }, + { + "epoch": 0.74, + "learning_rate": 0.00017235775274333288, + "loss": 0.0172, + "step": 519 + }, + { + "epoch": 0.74, + "learning_rate": 0.00017225436241234045, + "loss": 0.0012, + "step": 520 + }, + { + "epoch": 0.74, + "eval_loss": 0.04391771927475929, + "eval_runtime": 23.2947, + "eval_samples_per_second": 42.928, + "eval_steps_per_second": 10.732, + "step": 520 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001721508102215999, + "loss": 0.0177, + "step": 521 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001720470964030824, + "loss": 0.0142, + "step": 522 + }, + { + "epoch": 0.74, + "learning_rate": 0.00017194322118912128, + "loss": 0.0123, + "step": 523 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017183918481241133, + "loss": 0.0049, + "step": 524 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001717349875060084, + "loss": 0.0118, + "step": 525 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017163062950332884, + "loss": 0.016, + "step": 526 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017152611103814902, + "loss": 0.0057, + "step": 527 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001714214323446047, + "loss": 0.0062, + "step": 528 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001713165936571906, + "loss": 0.0195, + "step": 529 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017121159521075988, + "loss": 0.0088, + "step": 530 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017110643724052354, + "loss": 0.0048, + "step": 531 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017100111998204996, + "loss": 0.0074, + "step": 532 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017089564367126433, + "loss": 0.0102, + "step": 533 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017079000854444817, + "loss": 0.011, + "step": 534 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017068421483823872, + "loss": 0.017, + "step": 535 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017057826278962855, + "loss": 0.0103, + "step": 536 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001704721526359648, + "loss": 0.0069, + "step": 537 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001703658846149489, + "loss": 0.0199, + "step": 538 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017025945896463593, + "loss": 0.0076, + "step": 539 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017015287592343396, + "loss": 0.0096, + "step": 540 + }, + { + "epoch": 0.77, + "eval_loss": 0.03808086737990379, + "eval_runtime": 23.0584, + "eval_samples_per_second": 43.368, + "eval_steps_per_second": 10.842, + "step": 540 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017004613573010378, + "loss": 0.0056, + "step": 541 + }, + { + "epoch": 0.77, + "learning_rate": 0.00016993923862375812, + "loss": 0.0045, + "step": 542 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001698321848438613, + "loss": 0.0084, + "step": 543 + }, + { + "epoch": 0.77, + "learning_rate": 0.00016972497463022852, + "loss": 0.014, + "step": 544 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001696176082230255, + "loss": 0.0159, + "step": 545 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001695100858627678, + "loss": 0.0061, + "step": 546 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016940240779032037, + "loss": 0.0215, + "step": 547 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016929457424689695, + "loss": 0.0134, + "step": 548 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016918658547405955, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016907844171371793, + "loss": 0.0017, + "step": 550 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016897014320812906, + "loss": 0.0116, + "step": 551 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016886169019989658, + "loss": 0.0058, + "step": 552 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016875308293197013, + "loss": 0.0119, + "step": 553 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016864432164764506, + "loss": 0.0126, + "step": 554 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001685354065905616, + "loss": 0.0124, + "step": 555 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016842633800470455, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001683171161344026, + "loss": 0.0081, + "step": 557 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001682077412243278, + "loss": 0.0043, + "step": 558 + }, + { + "epoch": 0.8, + "learning_rate": 0.00016809821351949507, + "loss": 0.004, + "step": 559 + }, + { + "epoch": 0.8, + "learning_rate": 0.00016798853326526158, + "loss": 0.005, + "step": 560 + }, + { + "epoch": 0.8, + "eval_loss": 0.04487919807434082, + "eval_runtime": 23.3744, + "eval_samples_per_second": 42.782, + "eval_steps_per_second": 10.695, + "step": 560 + }, + { + "epoch": 0.8, + "learning_rate": 0.00016787870070732625, + "loss": 0.0051, + "step": 561 + }, + { + "epoch": 0.8, + "learning_rate": 0.00016776871609172918, + "loss": 0.004, + "step": 562 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001676585796648511, + "loss": 0.0174, + "step": 563 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001675482916734128, + "loss": 0.0152, + "step": 564 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001674378523644746, + "loss": 0.0178, + "step": 565 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001673272619854358, + "loss": 0.0144, + "step": 566 + }, + { + "epoch": 0.81, + "learning_rate": 0.00016721652078403412, + "loss": 0.0103, + "step": 567 + }, + { + "epoch": 0.81, + "learning_rate": 0.00016710562900834519, + "loss": 0.0094, + "step": 568 + }, + { + "epoch": 0.81, + "learning_rate": 0.00016699458690678184, + "loss": 0.0191, + "step": 569 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001668833947280937, + "loss": 0.0189, + "step": 570 + }, + { + "epoch": 0.81, + "learning_rate": 0.00016677205272136667, + "loss": 0.0102, + "step": 571 + }, + { + "epoch": 0.81, + "learning_rate": 0.00016666056113602218, + "loss": 0.012, + "step": 572 + }, + { + "epoch": 0.82, + "learning_rate": 0.00016654892022181678, + "loss": 0.0064, + "step": 573 + }, + { + "epoch": 0.82, + "learning_rate": 0.00016643713022884148, + "loss": 0.0066, + "step": 574 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001663251914075214, + "loss": 0.0179, + "step": 575 + }, + { + "epoch": 0.82, + "learning_rate": 0.00016621310400861486, + "loss": 0.004, + "step": 576 + }, + { + "epoch": 0.82, + "learning_rate": 0.00016610086828321315, + "loss": 0.0062, + "step": 577 + }, + { + "epoch": 0.82, + "learning_rate": 0.00016598848448273984, + "loss": 0.0049, + "step": 578 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001658759528589501, + "loss": 0.0213, + "step": 579 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001657632736639303, + "loss": 0.0239, + "step": 580 + }, + { + "epoch": 0.83, + "eval_loss": 0.04524041712284088, + "eval_runtime": 23.2755, + "eval_samples_per_second": 42.964, + "eval_steps_per_second": 10.741, + "step": 580 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001656504471500974, + "loss": 0.0164, + "step": 581 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001655374735701984, + "loss": 0.007, + "step": 582 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001654243531773097, + "loss": 0.0106, + "step": 583 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001653110862248366, + "loss": 0.0066, + "step": 584 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001651976729665127, + "loss": 0.009, + "step": 585 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001650841136563994, + "loss": 0.0033, + "step": 586 + }, + { + "epoch": 0.83, + "learning_rate": 0.00016497040854888517, + "loss": 0.0079, + "step": 587 + }, + { + "epoch": 0.84, + "learning_rate": 0.00016485655789868518, + "loss": 0.004, + "step": 588 + }, + { + "epoch": 0.84, + "learning_rate": 0.00016474256196084063, + "loss": 0.0111, + "step": 589 + }, + { + "epoch": 0.84, + "learning_rate": 0.00016462842099071817, + "loss": 0.0094, + "step": 590 + }, + { + "epoch": 0.84, + "learning_rate": 0.00016451413524400923, + "loss": 0.0143, + "step": 591 + }, + { + "epoch": 0.84, + "learning_rate": 0.00016439970497672977, + "loss": 0.0051, + "step": 592 + }, + { + "epoch": 0.84, + "learning_rate": 0.00016428513044521937, + "loss": 0.0116, + "step": 593 + }, + { + "epoch": 0.84, + "learning_rate": 0.00016417041190614076, + "loss": 0.0064, + "step": 594 + }, + { + "epoch": 0.85, + "learning_rate": 0.00016405554961647934, + "loss": 0.0068, + "step": 595 + }, + { + "epoch": 0.85, + "learning_rate": 0.00016394054383354247, + "loss": 0.0033, + "step": 596 + }, + { + "epoch": 0.85, + "learning_rate": 0.00016382539481495903, + "loss": 0.0017, + "step": 597 + }, + { + "epoch": 0.85, + "learning_rate": 0.00016371010281867866, + "loss": 0.0167, + "step": 598 + }, + { + "epoch": 0.85, + "learning_rate": 0.00016359466810297136, + "loss": 0.0029, + "step": 599 + }, + { + "epoch": 0.85, + "learning_rate": 0.00016347909092642694, + "loss": 0.0166, + "step": 600 + }, + { + "epoch": 0.85, + "eval_loss": 0.038260262459516525, + "eval_runtime": 23.3542, + "eval_samples_per_second": 42.819, + "eval_steps_per_second": 10.705, + "step": 600 + }, + { + "epoch": 0.85, + "learning_rate": 0.00016336337154795408, + "loss": 0.0065, + "step": 601 + }, + { + "epoch": 0.86, + "learning_rate": 0.00016324751022678028, + "loss": 0.0157, + "step": 602 + }, + { + "epoch": 0.86, + "learning_rate": 0.00016313150722245082, + "loss": 0.0046, + "step": 603 + }, + { + "epoch": 0.86, + "learning_rate": 0.00016301536279482846, + "loss": 0.0033, + "step": 604 + }, + { + "epoch": 0.86, + "learning_rate": 0.00016289907720409277, + "loss": 0.006, + "step": 605 + }, + { + "epoch": 0.86, + "learning_rate": 0.00016278265071073954, + "loss": 0.0199, + "step": 606 + }, + { + "epoch": 0.86, + "learning_rate": 0.00016266608357558016, + "loss": 0.0096, + "step": 607 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001625493760597412, + "loss": 0.0048, + "step": 608 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016243252842466346, + "loss": 0.0157, + "step": 609 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016231554093210188, + "loss": 0.0149, + "step": 610 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016219841384412456, + "loss": 0.0103, + "step": 611 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016208114742311236, + "loss": 0.007, + "step": 612 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016196374193175824, + "loss": 0.0293, + "step": 613 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016184619763306675, + "loss": 0.0025, + "step": 614 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016172851479035328, + "loss": 0.0149, + "step": 615 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016161069366724375, + "loss": 0.0039, + "step": 616 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016149273452767363, + "loss": 0.0145, + "step": 617 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016137463763588777, + "loss": 0.0168, + "step": 618 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016125640325643942, + "loss": 0.0131, + "step": 619 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016113803165419003, + "loss": 0.0081, + "step": 620 + }, + { + "epoch": 0.88, + "eval_loss": 0.024927595630288124, + "eval_runtime": 22.9947, + "eval_samples_per_second": 43.488, + "eval_steps_per_second": 10.872, + "step": 620 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016101952309430822, + "loss": 0.0065, + "step": 621 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016090087784226959, + "loss": 0.0083, + "step": 622 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001607820961638559, + "loss": 0.0173, + "step": 623 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016066317832515446, + "loss": 0.0125, + "step": 624 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001605441245925577, + "loss": 0.0053, + "step": 625 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016042493523276238, + "loss": 0.0241, + "step": 626 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001603056105127691, + "loss": 0.0089, + "step": 627 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001601861506998818, + "loss": 0.003, + "step": 628 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016006655606170687, + "loss": 0.0141, + "step": 629 + }, + { + "epoch": 0.9, + "learning_rate": 0.00015994682686615286, + "loss": 0.0139, + "step": 630 + }, + { + "epoch": 0.9, + "learning_rate": 0.00015982696338142963, + "loss": 0.0096, + "step": 631 + }, + { + "epoch": 0.9, + "learning_rate": 0.00015970696587604803, + "loss": 0.0136, + "step": 632 + }, + { + "epoch": 0.9, + "learning_rate": 0.000159586834618819, + "loss": 0.0055, + "step": 633 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001594665698788531, + "loss": 0.0128, + "step": 634 + }, + { + "epoch": 0.9, + "learning_rate": 0.00015934617192556, + "loss": 0.0066, + "step": 635 + }, + { + "epoch": 0.9, + "learning_rate": 0.00015922564102864773, + "loss": 0.0086, + "step": 636 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015910497745812217, + "loss": 0.0068, + "step": 637 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015898418148428632, + "loss": 0.016, + "step": 638 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015886325337773988, + "loss": 0.0137, + "step": 639 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001587421934093785, + "loss": 0.0166, + "step": 640 + }, + { + "epoch": 0.91, + "eval_loss": 0.04424288496375084, + "eval_runtime": 23.3621, + "eval_samples_per_second": 42.804, + "eval_steps_per_second": 10.701, + "step": 640 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015862100185039322, + "loss": 0.0116, + "step": 641 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015849967897226986, + "loss": 0.0092, + "step": 642 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015837822504678842, + "loss": 0.0061, + "step": 643 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015825664034602245, + "loss": 0.0141, + "step": 644 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001581349251423385, + "loss": 0.0075, + "step": 645 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001580130797083954, + "loss": 0.002, + "step": 646 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015789110431714377, + "loss": 0.0139, + "step": 647 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015776899924182532, + "loss": 0.0092, + "step": 648 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015764676475597228, + "loss": 0.0056, + "step": 649 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015752440113340677, + "loss": 0.016, + "step": 650 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001574019086482402, + "loss": 0.0152, + "step": 651 + }, + { + "epoch": 0.93, + "learning_rate": 0.00015727928757487266, + "loss": 0.0028, + "step": 652 + }, + { + "epoch": 0.93, + "learning_rate": 0.00015715653818799226, + "loss": 0.0105, + "step": 653 + }, + { + "epoch": 0.93, + "learning_rate": 0.00015703366076257456, + "loss": 0.0082, + "step": 654 + }, + { + "epoch": 0.93, + "learning_rate": 0.00015691065557388206, + "loss": 0.0134, + "step": 655 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001567875228974632, + "loss": 0.0052, + "step": 656 + }, + { + "epoch": 0.93, + "learning_rate": 0.00015666426300915237, + "loss": 0.0089, + "step": 657 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015654087618506858, + "loss": 0.0094, + "step": 658 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015641736270161544, + "loss": 0.013, + "step": 659 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015629372283548017, + "loss": 0.0106, + "step": 660 + }, + { + "epoch": 0.94, + "eval_loss": 0.03268582001328468, + "eval_runtime": 23.2536, + "eval_samples_per_second": 43.004, + "eval_steps_per_second": 10.751, + "step": 660 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015616995686363314, + "loss": 0.0145, + "step": 661 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015604606506332722, + "loss": 0.0081, + "step": 662 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015592204771209715, + "loss": 0.0115, + "step": 663 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015579790508775894, + "loss": 0.0128, + "step": 664 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015567363746840922, + "loss": 0.0006, + "step": 665 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015554924513242456, + "loss": 0.0071, + "step": 666 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015542472835846098, + "loss": 0.0076, + "step": 667 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015530008742545328, + "loss": 0.0083, + "step": 668 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015517532261261435, + "loss": 0.007, + "step": 669 + }, + { + "epoch": 0.95, + "learning_rate": 0.0001550504341994346, + "loss": 0.0052, + "step": 670 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015492542246568126, + "loss": 0.0162, + "step": 671 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015480028769139796, + "loss": 0.0155, + "step": 672 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015467503015690386, + "loss": 0.001, + "step": 673 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001545496501427931, + "loss": 0.0089, + "step": 674 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015442414792993416, + "loss": 0.0121, + "step": 675 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015429852379946946, + "loss": 0.012, + "step": 676 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001541727780328143, + "loss": 0.0076, + "step": 677 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015404691091165662, + "loss": 0.0006, + "step": 678 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001539209227179561, + "loss": 0.0081, + "step": 679 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001537948137339437, + "loss": 0.0161, + "step": 680 + }, + { + "epoch": 0.97, + "eval_loss": 0.03857529163360596, + "eval_runtime": 23.3772, + "eval_samples_per_second": 42.777, + "eval_steps_per_second": 10.694, + "step": 680 + }, + { + "epoch": 0.97, + "learning_rate": 0.000153668584242121, + "loss": 0.0015, + "step": 681 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015354223452525943, + "loss": 0.0105, + "step": 682 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015341576486639985, + "loss": 0.0162, + "step": 683 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015328917554885174, + "loss": 0.0062, + "step": 684 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015316246685619263, + "loss": 0.008, + "step": 685 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015303563907226753, + "loss": 0.0146, + "step": 686 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015290869248118813, + "loss": 0.0079, + "step": 687 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015278162736733237, + "loss": 0.0097, + "step": 688 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015265444401534362, + "loss": 0.006, + "step": 689 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015252714271013016, + "loss": 0.0074, + "step": 690 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015239972373686452, + "loss": 0.0015, + "step": 691 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015227218738098273, + "loss": 0.0067, + "step": 692 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001521445339281839, + "loss": 0.0111, + "step": 693 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015201676366442932, + "loss": 0.016, + "step": 694 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001518888768759421, + "loss": 0.01, + "step": 695 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015176087384920624, + "loss": 0.0103, + "step": 696 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015163275487096623, + "loss": 0.0143, + "step": 697 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015150452022822625, + "loss": 0.0079, + "step": 698 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015137617020824964, + "loss": 0.0082, + "step": 699 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015124770509855812, + "loss": 0.0038, + "step": 700 + }, + { + "epoch": 1.0, + "eval_loss": 0.037697676569223404, + "eval_runtime": 23.1666, + "eval_samples_per_second": 43.166, + "eval_steps_per_second": 10.791, + "step": 700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001511191251869313, + "loss": 0.0111, + "step": 701 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015099043076140595, + "loss": 0.0028, + "step": 702 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001508616221102753, + "loss": 0.0068, + "step": 703 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015073269952208858, + "loss": 0.0009, + "step": 704 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015060366328565014, + "loss": 0.005, + "step": 705 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015047451369001896, + "loss": 0.0035, + "step": 706 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015034525102450804, + "loss": 0.0118, + "step": 707 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015021587557868352, + "loss": 0.0069, + "step": 708 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015008638764236427, + "loss": 0.0053, + "step": 709 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014995678750562118, + "loss": 0.0099, + "step": 710 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001498270754587764, + "loss": 0.0033, + "step": 711 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014969725179240291, + "loss": 0.0065, + "step": 712 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001495673167973236, + "loss": 0.0172, + "step": 713 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014943727076461075, + "loss": 0.0231, + "step": 714 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014930711398558548, + "loss": 0.0194, + "step": 715 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014917684675181696, + "loss": 0.0092, + "step": 716 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014904646935512175, + "loss": 0.0034, + "step": 717 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014891598208756324, + "loss": 0.0096, + "step": 718 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001487853852414509, + "loss": 0.0103, + "step": 719 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014865467910933974, + "loss": 0.0029, + "step": 720 + }, + { + "epoch": 1.02, + "eval_loss": 0.03674602881073952, + "eval_runtime": 23.3777, + "eval_samples_per_second": 42.776, + "eval_steps_per_second": 10.694, + "step": 720 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001485238639840295, + "loss": 0.0061, + "step": 721 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014839294015856413, + "loss": 0.0048, + "step": 722 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001482619079262311, + "loss": 0.0037, + "step": 723 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014813076758056067, + "loss": 0.0053, + "step": 724 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014799951941532534, + "loss": 0.0142, + "step": 725 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014786816372453916, + "loss": 0.0012, + "step": 726 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014773670080245693, + "loss": 0.0094, + "step": 727 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014760513094357384, + "loss": 0.0035, + "step": 728 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014747345444262442, + "loss": 0.0097, + "step": 729 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014734167159458235, + "loss": 0.007, + "step": 730 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014720978269465928, + "loss": 0.0057, + "step": 731 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014707778803830466, + "loss": 0.0107, + "step": 732 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001469456879212047, + "loss": 0.0035, + "step": 733 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014681348263928187, + "loss": 0.0085, + "step": 734 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001466811724886943, + "loss": 0.0158, + "step": 735 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014654875776583498, + "loss": 0.0089, + "step": 736 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014641623876733123, + "loss": 0.0024, + "step": 737 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001462836157900438, + "loss": 0.0086, + "step": 738 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001461508891310665, + "loss": 0.0053, + "step": 739 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014601805908772542, + "loss": 0.0164, + "step": 740 + }, + { + "epoch": 1.05, + "eval_loss": 0.027637075632810593, + "eval_runtime": 23.2539, + "eval_samples_per_second": 43.004, + "eval_steps_per_second": 10.751, + "step": 740 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014588512595757814, + "loss": 0.0171, + "step": 741 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001457520900384133, + "loss": 0.0099, + "step": 742 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014561895162824964, + "loss": 0.0144, + "step": 743 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014548571102533566, + "loss": 0.0188, + "step": 744 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014535236852814863, + "loss": 0.0075, + "step": 745 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014521892443539425, + "loss": 0.008, + "step": 746 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001450853790460056, + "loss": 0.0135, + "step": 747 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014495173265914287, + "loss": 0.0165, + "step": 748 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014481798557419243, + "loss": 0.0069, + "step": 749 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014468413809076616, + "loss": 0.0055, + "step": 750 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001445501905087009, + "loss": 0.0193, + "step": 751 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014441614312805782, + "loss": 0.0132, + "step": 752 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014428199624912143, + "loss": 0.0062, + "step": 753 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001441477501723994, + "loss": 0.0092, + "step": 754 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014401340519862137, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014387896162873867, + "loss": 0.0071, + "step": 756 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014374441976392342, + "loss": 0.0118, + "step": 757 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014360977990556808, + "loss": 0.0071, + "step": 758 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001434750423552844, + "loss": 0.0087, + "step": 759 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001433402074149032, + "loss": 0.0128, + "step": 760 + }, + { + "epoch": 1.08, + "eval_loss": 0.02587747946381569, + "eval_runtime": 23.4034, + "eval_samples_per_second": 42.729, + "eval_steps_per_second": 10.682, + "step": 760 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014320527538647326, + "loss": 0.0106, + "step": 761 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014307024657226103, + "loss": 0.0085, + "step": 762 + }, + { + "epoch": 1.09, + "learning_rate": 0.00014293512127474967, + "loss": 0.0076, + "step": 763 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001427998997966385, + "loss": 0.0085, + "step": 764 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001426645824408423, + "loss": 0.0034, + "step": 765 + }, + { + "epoch": 1.09, + "learning_rate": 0.00014252916951049068, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.09, + "learning_rate": 0.00014239366130892723, + "loss": 0.0129, + "step": 767 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001422580581397091, + "loss": 0.0064, + "step": 768 + }, + { + "epoch": 1.09, + "learning_rate": 0.000142122360306606, + "loss": 0.0145, + "step": 769 + }, + { + "epoch": 1.1, + "learning_rate": 0.00014198656811359995, + "loss": 0.0032, + "step": 770 + }, + { + "epoch": 1.1, + "learning_rate": 0.00014185068186488411, + "loss": 0.0076, + "step": 771 + }, + { + "epoch": 1.1, + "learning_rate": 0.00014171470186486254, + "loss": 0.0151, + "step": 772 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001415786284181491, + "loss": 0.0045, + "step": 773 + }, + { + "epoch": 1.1, + "learning_rate": 0.00014144246182956716, + "loss": 0.0072, + "step": 774 + }, + { + "epoch": 1.1, + "learning_rate": 0.00014130620240414865, + "loss": 0.0088, + "step": 775 + }, + { + "epoch": 1.1, + "learning_rate": 0.00014116985044713352, + "loss": 0.0168, + "step": 776 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014103340626396902, + "loss": 0.0034, + "step": 777 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014089687016030888, + "loss": 0.0021, + "step": 778 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014076024244201286, + "loss": 0.0048, + "step": 779 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014062352341514593, + "loss": 0.0108, + "step": 780 + }, + { + "epoch": 1.11, + "eval_loss": 0.029424183070659637, + "eval_runtime": 23.2882, + "eval_samples_per_second": 42.94, + "eval_steps_per_second": 10.735, + "step": 780 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014048671338597757, + "loss": 0.0051, + "step": 781 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014034981266098113, + "loss": 0.035, + "step": 782 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014021282154683313, + "loss": 0.0153, + "step": 783 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014007574035041262, + "loss": 0.0062, + "step": 784 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013993856937880033, + "loss": 0.0017, + "step": 785 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013980130893927827, + "loss": 0.0092, + "step": 786 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001396639593393287, + "loss": 0.0169, + "step": 787 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013952652088663377, + "loss": 0.0173, + "step": 788 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001393889938890745, + "loss": 0.005, + "step": 789 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013925137865473039, + "loss": 0.0079, + "step": 790 + }, + { + "epoch": 1.13, + "learning_rate": 0.00013911367549187859, + "loss": 0.0052, + "step": 791 + }, + { + "epoch": 1.13, + "learning_rate": 0.00013897588470899316, + "loss": 0.0223, + "step": 792 + }, + { + "epoch": 1.13, + "learning_rate": 0.00013883800661474452, + "loss": 0.0213, + "step": 793 + }, + { + "epoch": 1.13, + "learning_rate": 0.00013870004151799855, + "loss": 0.0063, + "step": 794 + }, + { + "epoch": 1.13, + "learning_rate": 0.00013856198972781627, + "loss": 0.004, + "step": 795 + }, + { + "epoch": 1.13, + "learning_rate": 0.00013842385155345258, + "loss": 0.0297, + "step": 796 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001382856273043562, + "loss": 0.019, + "step": 797 + }, + { + "epoch": 1.14, + "learning_rate": 0.00013814731729016843, + "loss": 0.0053, + "step": 798 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001380089218207229, + "loss": 0.0093, + "step": 799 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001378704412060445, + "loss": 0.026, + "step": 800 + }, + { + "epoch": 1.14, + "eval_loss": 0.02853754535317421, + "eval_runtime": 23.2068, + "eval_samples_per_second": 43.091, + "eval_steps_per_second": 10.773, + "step": 800 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001377318757563489, + "loss": 0.0045, + "step": 801 + }, + { + "epoch": 1.14, + "learning_rate": 0.00013759322578204192, + "loss": 0.007, + "step": 802 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001374544915937186, + "loss": 0.0063, + "step": 803 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001373156735021627, + "loss": 0.0147, + "step": 804 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001371767718183459, + "loss": 0.0087, + "step": 805 + }, + { + "epoch": 1.15, + "learning_rate": 0.00013703778685342712, + "loss": 0.0087, + "step": 806 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001368987189187519, + "loss": 0.006, + "step": 807 + }, + { + "epoch": 1.15, + "learning_rate": 0.00013675956832585163, + "loss": 0.0118, + "step": 808 + }, + { + "epoch": 1.15, + "learning_rate": 0.00013662033538644285, + "loss": 0.0072, + "step": 809 + }, + { + "epoch": 1.15, + "learning_rate": 0.00013648102041242653, + "loss": 0.0245, + "step": 810 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001363416237158875, + "loss": 0.0086, + "step": 811 + }, + { + "epoch": 1.16, + "learning_rate": 0.00013620214560909356, + "loss": 0.0025, + "step": 812 + }, + { + "epoch": 1.16, + "learning_rate": 0.00013606258640449495, + "loss": 0.0079, + "step": 813 + }, + { + "epoch": 1.16, + "learning_rate": 0.00013592294641472355, + "loss": 0.0096, + "step": 814 + }, + { + "epoch": 1.16, + "learning_rate": 0.00013578322595259222, + "loss": 0.0147, + "step": 815 + }, + { + "epoch": 1.16, + "learning_rate": 0.00013564342533109406, + "loss": 0.0127, + "step": 816 + }, + { + "epoch": 1.16, + "learning_rate": 0.00013550354486340185, + "loss": 0.0213, + "step": 817 + }, + { + "epoch": 1.16, + "learning_rate": 0.000135363584862867, + "loss": 0.0103, + "step": 818 + }, + { + "epoch": 1.17, + "learning_rate": 0.00013522354564301942, + "loss": 0.0205, + "step": 819 + }, + { + "epoch": 1.17, + "learning_rate": 0.00013508342751756617, + "loss": 0.0104, + "step": 820 + }, + { + "epoch": 1.17, + "eval_loss": 0.02971697226166725, + "eval_runtime": 23.6969, + "eval_samples_per_second": 42.2, + "eval_steps_per_second": 10.55, + "step": 820 + }, + { + "epoch": 1.17, + "learning_rate": 0.00013494323080039128, + "loss": 0.0023, + "step": 821 + }, + { + "epoch": 1.17, + "learning_rate": 0.00013480295580555463, + "loss": 0.0116, + "step": 822 + }, + { + "epoch": 1.17, + "learning_rate": 0.00013466260284729172, + "loss": 0.0106, + "step": 823 + }, + { + "epoch": 1.17, + "learning_rate": 0.00013452217224001247, + "loss": 0.0092, + "step": 824 + }, + { + "epoch": 1.17, + "learning_rate": 0.00013438166429830084, + "loss": 0.018, + "step": 825 + }, + { + "epoch": 1.17, + "learning_rate": 0.00013424107933691402, + "loss": 0.0042, + "step": 826 + }, + { + "epoch": 1.18, + "learning_rate": 0.00013410041767078176, + "loss": 0.0093, + "step": 827 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001339596796150056, + "loss": 0.0078, + "step": 828 + }, + { + "epoch": 1.18, + "learning_rate": 0.00013381886548485822, + "loss": 0.0202, + "step": 829 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001336779755957827, + "loss": 0.0072, + "step": 830 + }, + { + "epoch": 1.18, + "learning_rate": 0.00013353701026339187, + "loss": 0.0116, + "step": 831 + }, + { + "epoch": 1.18, + "learning_rate": 0.00013339596980346748, + "loss": 0.0129, + "step": 832 + }, + { + "epoch": 1.18, + "learning_rate": 0.00013325485453195973, + "loss": 0.0088, + "step": 833 + }, + { + "epoch": 1.19, + "learning_rate": 0.00013311366476498622, + "loss": 0.0166, + "step": 834 + }, + { + "epoch": 1.19, + "learning_rate": 0.00013297240081883155, + "loss": 0.0094, + "step": 835 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001328310630099465, + "loss": 0.0143, + "step": 836 + }, + { + "epoch": 1.19, + "learning_rate": 0.00013268965165494725, + "loss": 0.0062, + "step": 837 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001325481670706147, + "loss": 0.0141, + "step": 838 + }, + { + "epoch": 1.19, + "learning_rate": 0.00013240660957389403, + "loss": 0.0112, + "step": 839 + }, + { + "epoch": 1.19, + "learning_rate": 0.00013226497948189337, + "loss": 0.0102, + "step": 840 + }, + { + "epoch": 1.19, + "eval_loss": 0.027126112952828407, + "eval_runtime": 23.2919, + "eval_samples_per_second": 42.933, + "eval_steps_per_second": 10.733, + "step": 840 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001321232771118838, + "loss": 0.0002, + "step": 841 + }, + { + "epoch": 1.2, + "learning_rate": 0.00013198150278129819, + "loss": 0.0046, + "step": 842 + }, + { + "epoch": 1.2, + "learning_rate": 0.00013183965680773057, + "loss": 0.0131, + "step": 843 + }, + { + "epoch": 1.2, + "learning_rate": 0.00013169773950893552, + "loss": 0.0101, + "step": 844 + }, + { + "epoch": 1.2, + "learning_rate": 0.00013155575120282742, + "loss": 0.0094, + "step": 845 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001314136922074796, + "loss": 0.0047, + "step": 846 + }, + { + "epoch": 1.2, + "learning_rate": 0.00013127156284112394, + "loss": 0.0097, + "step": 847 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013112936342214975, + "loss": 0.0027, + "step": 848 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013098709426910343, + "loss": 0.0065, + "step": 849 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013084475570068743, + "loss": 0.016, + "step": 850 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013070234803575992, + "loss": 0.0159, + "step": 851 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013055987159333362, + "loss": 0.0188, + "step": 852 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013041732669257552, + "loss": 0.0053, + "step": 853 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013027471365280582, + "loss": 0.0022, + "step": 854 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013013203279349744, + "loss": 0.0089, + "step": 855 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001299892844342752, + "loss": 0.0054, + "step": 856 + }, + { + "epoch": 1.22, + "learning_rate": 0.00012984646889491517, + "loss": 0.0071, + "step": 857 + }, + { + "epoch": 1.22, + "learning_rate": 0.00012970358649534383, + "loss": 0.0042, + "step": 858 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001295606375556375, + "loss": 0.0096, + "step": 859 + }, + { + "epoch": 1.22, + "learning_rate": 0.00012941762239602152, + "loss": 0.0111, + "step": 860 + }, + { + "epoch": 1.22, + "eval_loss": 0.029291899874806404, + "eval_runtime": 23.1545, + "eval_samples_per_second": 43.188, + "eval_steps_per_second": 10.797, + "step": 860 + }, + { + "epoch": 1.22, + "learning_rate": 0.00012927454133686967, + "loss": 0.0134, + "step": 861 + }, + { + "epoch": 1.23, + "learning_rate": 0.00012913139469870319, + "loss": 0.0016, + "step": 862 + }, + { + "epoch": 1.23, + "learning_rate": 0.00012898818280219037, + "loss": 0.0043, + "step": 863 + }, + { + "epoch": 1.23, + "learning_rate": 0.00012884490596814558, + "loss": 0.006, + "step": 864 + }, + { + "epoch": 1.23, + "learning_rate": 0.00012870156451752878, + "loss": 0.007, + "step": 865 + }, + { + "epoch": 1.23, + "learning_rate": 0.00012855815877144456, + "loss": 0.0044, + "step": 866 + }, + { + "epoch": 1.23, + "learning_rate": 0.00012841468905114156, + "loss": 0.0019, + "step": 867 + }, + { + "epoch": 1.23, + "learning_rate": 0.00012827115567801186, + "loss": 0.0031, + "step": 868 + }, + { + "epoch": 1.24, + "learning_rate": 0.00012812755897358995, + "loss": 0.0033, + "step": 869 + }, + { + "epoch": 1.24, + "learning_rate": 0.00012798389925955235, + "loss": 0.0053, + "step": 870 + }, + { + "epoch": 1.24, + "learning_rate": 0.00012784017685771654, + "loss": 0.0152, + "step": 871 + }, + { + "epoch": 1.24, + "learning_rate": 0.00012769639209004067, + "loss": 0.0038, + "step": 872 + }, + { + "epoch": 1.24, + "learning_rate": 0.00012755254527862243, + "loss": 0.0108, + "step": 873 + }, + { + "epoch": 1.24, + "learning_rate": 0.00012740863674569846, + "loss": 0.0089, + "step": 874 + }, + { + "epoch": 1.24, + "learning_rate": 0.00012726466681364383, + "loss": 0.0083, + "step": 875 + }, + { + "epoch": 1.25, + "learning_rate": 0.00012712063580497108, + "loss": 0.0127, + "step": 876 + }, + { + "epoch": 1.25, + "learning_rate": 0.00012697654404232947, + "loss": 0.0065, + "step": 877 + }, + { + "epoch": 1.25, + "learning_rate": 0.00012683239184850455, + "loss": 0.0198, + "step": 878 + }, + { + "epoch": 1.25, + "learning_rate": 0.00012668817954641706, + "loss": 0.018, + "step": 879 + }, + { + "epoch": 1.25, + "learning_rate": 0.00012654390745912248, + "loss": 0.0088, + "step": 880 + }, + { + "epoch": 1.25, + "eval_loss": 0.03049076348543167, + "eval_runtime": 23.1763, + "eval_samples_per_second": 43.148, + "eval_steps_per_second": 10.787, + "step": 880 + }, + { + "epoch": 1.25, + "learning_rate": 0.00012639957590981023, + "loss": 0.0187, + "step": 881 + }, + { + "epoch": 1.25, + "learning_rate": 0.00012625518522180288, + "loss": 0.0142, + "step": 882 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001261107357185555, + "loss": 0.0143, + "step": 883 + }, + { + "epoch": 1.26, + "learning_rate": 0.000125966227723655, + "loss": 0.005, + "step": 884 + }, + { + "epoch": 1.26, + "learning_rate": 0.00012582166156081915, + "loss": 0.0017, + "step": 885 + }, + { + "epoch": 1.26, + "learning_rate": 0.00012567703755389615, + "loss": 0.0177, + "step": 886 + }, + { + "epoch": 1.26, + "learning_rate": 0.00012553235602686373, + "loss": 0.0092, + "step": 887 + }, + { + "epoch": 1.26, + "learning_rate": 0.00012538761730382852, + "loss": 0.0135, + "step": 888 + }, + { + "epoch": 1.26, + "learning_rate": 0.00012524282170902522, + "loss": 0.0142, + "step": 889 + }, + { + "epoch": 1.27, + "learning_rate": 0.00012509796956681598, + "loss": 0.0073, + "step": 890 + }, + { + "epoch": 1.27, + "learning_rate": 0.00012495306120168954, + "loss": 0.0093, + "step": 891 + }, + { + "epoch": 1.27, + "learning_rate": 0.00012480809693826074, + "loss": 0.0098, + "step": 892 + }, + { + "epoch": 1.27, + "learning_rate": 0.00012466307710126944, + "loss": 0.008, + "step": 893 + }, + { + "epoch": 1.27, + "learning_rate": 0.00012451800201558017, + "loss": 0.0156, + "step": 894 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001243728720061811, + "loss": 0.0033, + "step": 895 + }, + { + "epoch": 1.27, + "learning_rate": 0.00012422768739818357, + "loss": 0.0143, + "step": 896 + }, + { + "epoch": 1.28, + "learning_rate": 0.00012408244851682106, + "loss": 0.0018, + "step": 897 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001239371556874488, + "loss": 0.0095, + "step": 898 + }, + { + "epoch": 1.28, + "learning_rate": 0.00012379180923554267, + "loss": 0.0072, + "step": 899 + }, + { + "epoch": 1.28, + "learning_rate": 0.00012364640948669893, + "loss": 0.0116, + "step": 900 + }, + { + "epoch": 1.28, + "eval_loss": 0.02501627989113331, + "eval_runtime": 23.277, + "eval_samples_per_second": 42.961, + "eval_steps_per_second": 10.74, + "step": 900 + }, + { + "epoch": 1.28, + "learning_rate": 0.000123500956766633, + "loss": 0.0092, + "step": 901 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001233554514011791, + "loss": 0.0018, + "step": 902 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001232098937162893, + "loss": 0.0086, + "step": 903 + }, + { + "epoch": 1.29, + "learning_rate": 0.00012306428403803296, + "loss": 0.0109, + "step": 904 + }, + { + "epoch": 1.29, + "learning_rate": 0.00012291862269259583, + "loss": 0.0089, + "step": 905 + }, + { + "epoch": 1.29, + "learning_rate": 0.00012277291000627942, + "loss": 0.0078, + "step": 906 + }, + { + "epoch": 1.29, + "learning_rate": 0.00012262714630550028, + "loss": 0.0016, + "step": 907 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001224813319167893, + "loss": 0.003, + "step": 908 + }, + { + "epoch": 1.29, + "learning_rate": 0.00012233546716679074, + "loss": 0.0105, + "step": 909 + }, + { + "epoch": 1.29, + "learning_rate": 0.00012218955238226182, + "loss": 0.007, + "step": 910 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001220435878900718, + "loss": 0.0057, + "step": 911 + }, + { + "epoch": 1.3, + "learning_rate": 0.00012189757401720133, + "loss": 0.0057, + "step": 912 + }, + { + "epoch": 1.3, + "learning_rate": 0.00012175151109074157, + "loss": 0.0088, + "step": 913 + }, + { + "epoch": 1.3, + "learning_rate": 0.00012160539943789373, + "loss": 0.0003, + "step": 914 + }, + { + "epoch": 1.3, + "learning_rate": 0.00012145923938596802, + "loss": 0.0023, + "step": 915 + }, + { + "epoch": 1.3, + "learning_rate": 0.00012131303126238315, + "loss": 0.0093, + "step": 916 + }, + { + "epoch": 1.3, + "learning_rate": 0.00012116677539466555, + "loss": 0.0093, + "step": 917 + }, + { + "epoch": 1.31, + "learning_rate": 0.00012102047211044849, + "loss": 0.0048, + "step": 918 + }, + { + "epoch": 1.31, + "learning_rate": 0.00012087412173747159, + "loss": 0.0024, + "step": 919 + }, + { + "epoch": 1.31, + "learning_rate": 0.00012072772460357981, + "loss": 0.0066, + "step": 920 + }, + { + "epoch": 1.31, + "eval_loss": 0.04417268931865692, + "eval_runtime": 23.162, + "eval_samples_per_second": 43.174, + "eval_steps_per_second": 10.794, + "step": 920 + }, + { + "epoch": 1.31, + "learning_rate": 0.00012058128103672306, + "loss": 0.0082, + "step": 921 + }, + { + "epoch": 1.31, + "learning_rate": 0.00012043479136495506, + "loss": 0.0044, + "step": 922 + }, + { + "epoch": 1.31, + "learning_rate": 0.00012028825591643291, + "loss": 0.0083, + "step": 923 + }, + { + "epoch": 1.31, + "learning_rate": 0.00012014167501941629, + "loss": 0.0113, + "step": 924 + }, + { + "epoch": 1.32, + "learning_rate": 0.00011999504900226659, + "loss": 0.0204, + "step": 925 + }, + { + "epoch": 1.32, + "learning_rate": 0.00011984837819344639, + "loss": 0.0125, + "step": 926 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001197016629215185, + "loss": 0.006, + "step": 927 + }, + { + "epoch": 1.32, + "learning_rate": 0.00011955490351514545, + "loss": 0.0048, + "step": 928 + }, + { + "epoch": 1.32, + "learning_rate": 0.00011940810030308849, + "loss": 0.0038, + "step": 929 + }, + { + "epoch": 1.32, + "learning_rate": 0.00011926125361420717, + "loss": 0.0062, + "step": 930 + }, + { + "epoch": 1.32, + "learning_rate": 0.00011911436377745828, + "loss": 0.007, + "step": 931 + }, + { + "epoch": 1.33, + "learning_rate": 0.00011896743112189541, + "loss": 0.0119, + "step": 932 + }, + { + "epoch": 1.33, + "learning_rate": 0.00011882045597666791, + "loss": 0.0021, + "step": 933 + }, + { + "epoch": 1.33, + "learning_rate": 0.00011867343867102048, + "loss": 0.012, + "step": 934 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001185263795342921, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.33, + "learning_rate": 0.00011837927889591563, + "loss": 0.0052, + "step": 936 + }, + { + "epoch": 1.33, + "learning_rate": 0.00011823213708541671, + "loss": 0.0119, + "step": 937 + }, + { + "epoch": 1.33, + "learning_rate": 0.00011808495443241341, + "loss": 0.0066, + "step": 938 + }, + { + "epoch": 1.34, + "learning_rate": 0.00011793773126661514, + "loss": 0.0057, + "step": 939 + }, + { + "epoch": 1.34, + "learning_rate": 0.00011779046791782216, + "loss": 0.0061, + "step": 940 + }, + { + "epoch": 1.34, + "eval_loss": 0.030940894037485123, + "eval_runtime": 23.2686, + "eval_samples_per_second": 42.976, + "eval_steps_per_second": 10.744, + "step": 940 + }, + { + "epoch": 1.34, + "learning_rate": 0.00011764316471592463, + "loss": 0.0144, + "step": 941 + }, + { + "epoch": 1.34, + "learning_rate": 0.00011749582199090213, + "loss": 0.0058, + "step": 942 + }, + { + "epoch": 1.34, + "learning_rate": 0.00011734844007282268, + "loss": 0.0221, + "step": 943 + }, + { + "epoch": 1.34, + "learning_rate": 0.00011720101929184211, + "loss": 0.0138, + "step": 944 + }, + { + "epoch": 1.34, + "learning_rate": 0.00011705355997820338, + "loss": 0.0097, + "step": 945 + }, + { + "epoch": 1.35, + "learning_rate": 0.00011690606246223566, + "loss": 0.0087, + "step": 946 + }, + { + "epoch": 1.35, + "learning_rate": 0.00011675852707435375, + "loss": 0.0047, + "step": 947 + }, + { + "epoch": 1.35, + "learning_rate": 0.00011661095414505731, + "loss": 0.0115, + "step": 948 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001164633440049301, + "loss": 0.0071, + "step": 949 + }, + { + "epoch": 1.35, + "learning_rate": 0.00011631569698463915, + "loss": 0.0073, + "step": 950 + }, + { + "epoch": 1.35, + "learning_rate": 0.00011616801341493426, + "loss": 0.0083, + "step": 951 + }, + { + "epoch": 1.35, + "learning_rate": 0.00011602029362664693, + "loss": 0.0104, + "step": 952 + }, + { + "epoch": 1.36, + "learning_rate": 0.00011587253795068993, + "loss": 0.0132, + "step": 953 + }, + { + "epoch": 1.36, + "learning_rate": 0.00011572474671805636, + "loss": 0.0079, + "step": 954 + }, + { + "epoch": 1.36, + "learning_rate": 0.00011557692025981901, + "loss": 0.0079, + "step": 955 + }, + { + "epoch": 1.36, + "learning_rate": 0.00011542905890712952, + "loss": 0.0148, + "step": 956 + }, + { + "epoch": 1.36, + "learning_rate": 0.00011528116299121777, + "loss": 0.0031, + "step": 957 + }, + { + "epoch": 1.36, + "learning_rate": 0.000115133232843391, + "loss": 0.0093, + "step": 958 + }, + { + "epoch": 1.36, + "learning_rate": 0.00011498526879503317, + "loss": 0.0151, + "step": 959 + }, + { + "epoch": 1.37, + "learning_rate": 0.00011483727117760419, + "loss": 0.0173, + "step": 960 + }, + { + "epoch": 1.37, + "eval_loss": 0.023118099197745323, + "eval_runtime": 23.7102, + "eval_samples_per_second": 42.176, + "eval_steps_per_second": 10.544, + "step": 960 + }, + { + "epoch": 1.37, + "learning_rate": 0.00011468924032263919, + "loss": 0.0074, + "step": 961 + }, + { + "epoch": 1.37, + "learning_rate": 0.00011454117656174767, + "loss": 0.0117, + "step": 962 + }, + { + "epoch": 1.37, + "learning_rate": 0.00011439308022661292, + "loss": 0.012, + "step": 963 + }, + { + "epoch": 1.37, + "learning_rate": 0.00011424495164899118, + "loss": 0.0131, + "step": 964 + }, + { + "epoch": 1.37, + "learning_rate": 0.00011409679116071092, + "loss": 0.0202, + "step": 965 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001139485990936721, + "loss": 0.0063, + "step": 966 + }, + { + "epoch": 1.38, + "learning_rate": 0.00011380037577984537, + "loss": 0.0063, + "step": 967 + }, + { + "epoch": 1.38, + "learning_rate": 0.00011365212155127145, + "loss": 0.002, + "step": 968 + }, + { + "epoch": 1.38, + "learning_rate": 0.00011350383674006027, + "loss": 0.0138, + "step": 969 + }, + { + "epoch": 1.38, + "learning_rate": 0.00011335552167839032, + "loss": 0.0098, + "step": 970 + }, + { + "epoch": 1.38, + "learning_rate": 0.00011320717669850776, + "loss": 0.0052, + "step": 971 + }, + { + "epoch": 1.38, + "learning_rate": 0.00011305880213272584, + "loss": 0.0133, + "step": 972 + }, + { + "epoch": 1.38, + "learning_rate": 0.00011291039831342412, + "loss": 0.0111, + "step": 973 + }, + { + "epoch": 1.39, + "learning_rate": 0.00011276196557304757, + "loss": 0.0051, + "step": 974 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001126135042441061, + "loss": 0.0098, + "step": 975 + }, + { + "epoch": 1.39, + "learning_rate": 0.00011246501465917352, + "loss": 0.0075, + "step": 976 + }, + { + "epoch": 1.39, + "learning_rate": 0.00011231649715088705, + "loss": 0.0183, + "step": 977 + }, + { + "epoch": 1.39, + "learning_rate": 0.00011216795205194637, + "loss": 0.0118, + "step": 978 + }, + { + "epoch": 1.39, + "learning_rate": 0.00011201937969511306, + "loss": 0.0077, + "step": 979 + }, + { + "epoch": 1.39, + "learning_rate": 0.00011187078041320968, + "loss": 0.0032, + "step": 980 + }, + { + "epoch": 1.39, + "eval_loss": 0.023013178259134293, + "eval_runtime": 23.1394, + "eval_samples_per_second": 43.216, + "eval_steps_per_second": 10.804, + "step": 980 + }, + { + "epoch": 1.4, + "learning_rate": 0.00011172215453911918, + "loss": 0.0089, + "step": 981 + }, + { + "epoch": 1.4, + "learning_rate": 0.00011157350240578396, + "loss": 0.0089, + "step": 982 + }, + { + "epoch": 1.4, + "learning_rate": 0.00011142482434620543, + "loss": 0.0084, + "step": 983 + }, + { + "epoch": 1.4, + "learning_rate": 0.00011127612069344287, + "loss": 0.0054, + "step": 984 + }, + { + "epoch": 1.4, + "learning_rate": 0.00011112739178061305, + "loss": 0.0067, + "step": 985 + }, + { + "epoch": 1.4, + "learning_rate": 0.00011097863794088924, + "loss": 0.0084, + "step": 986 + }, + { + "epoch": 1.4, + "learning_rate": 0.00011082985950750062, + "loss": 0.0047, + "step": 987 + }, + { + "epoch": 1.41, + "learning_rate": 0.00011068105681373139, + "loss": 0.0082, + "step": 988 + }, + { + "epoch": 1.41, + "learning_rate": 0.00011053223019292016, + "loss": 0.0092, + "step": 989 + }, + { + "epoch": 1.41, + "learning_rate": 0.00011038337997845906, + "loss": 0.0069, + "step": 990 + }, + { + "epoch": 1.41, + "learning_rate": 0.00011023450650379316, + "loss": 0.0069, + "step": 991 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001100856101024196, + "loss": 0.0135, + "step": 992 + }, + { + "epoch": 1.41, + "learning_rate": 0.00010993669110788687, + "loss": 0.0077, + "step": 993 + }, + { + "epoch": 1.41, + "learning_rate": 0.00010978774985379407, + "loss": 0.003, + "step": 994 + }, + { + "epoch": 1.42, + "learning_rate": 0.00010963878667379019, + "loss": 0.007, + "step": 995 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001094898019015733, + "loss": 0.0068, + "step": 996 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001093407958708899, + "loss": 0.0053, + "step": 997 + }, + { + "epoch": 1.42, + "learning_rate": 0.00010919176891553399, + "loss": 0.0057, + "step": 998 + }, + { + "epoch": 1.42, + "learning_rate": 0.00010904272136934664, + "loss": 0.0317, + "step": 999 + }, + { + "epoch": 1.42, + "learning_rate": 0.00010889365356621484, + "loss": 0.0119, + "step": 1000 + }, + { + "epoch": 1.42, + "eval_loss": 0.04013300687074661, + "eval_runtime": 23.3396, + "eval_samples_per_second": 42.846, + "eval_steps_per_second": 10.711, + "step": 1000 + }, + { + "epoch": 1.42, + "learning_rate": 0.00010874456584007112, + "loss": 0.0041, + "step": 1001 + }, + { + "epoch": 1.43, + "learning_rate": 0.00010859545852489248, + "loss": 0.005, + "step": 1002 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001084463319547, + "loss": 0.0054, + "step": 1003 + }, + { + "epoch": 1.43, + "learning_rate": 0.00010829718646355771, + "loss": 0.0061, + "step": 1004 + }, + { + "epoch": 1.43, + "learning_rate": 0.00010814802238557215, + "loss": 0.0116, + "step": 1005 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001079988400548914, + "loss": 0.0043, + "step": 1006 + }, + { + "epoch": 1.43, + "learning_rate": 0.00010784963980570455, + "loss": 0.0075, + "step": 1007 + }, + { + "epoch": 1.43, + "learning_rate": 0.00010770042197224068, + "loss": 0.011, + "step": 1008 + }, + { + "epoch": 1.44, + "learning_rate": 0.00010755118688876842, + "loss": 0.0054, + "step": 1009 + }, + { + "epoch": 1.44, + "learning_rate": 0.00010740193488959491, + "loss": 0.0026, + "step": 1010 + }, + { + "epoch": 1.44, + "learning_rate": 0.00010725266630906524, + "loss": 0.0192, + "step": 1011 + }, + { + "epoch": 1.44, + "learning_rate": 0.00010710338148156163, + "loss": 0.0094, + "step": 1012 + }, + { + "epoch": 1.44, + "learning_rate": 0.00010695408074150272, + "loss": 0.0049, + "step": 1013 + }, + { + "epoch": 1.44, + "learning_rate": 0.00010680476442334282, + "loss": 0.0056, + "step": 1014 + }, + { + "epoch": 1.44, + "learning_rate": 0.00010665543286157105, + "loss": 0.006, + "step": 1015 + }, + { + "epoch": 1.45, + "learning_rate": 0.00010650608639071072, + "loss": 0.0138, + "step": 1016 + }, + { + "epoch": 1.45, + "learning_rate": 0.00010635672534531858, + "loss": 0.0167, + "step": 1017 + }, + { + "epoch": 1.45, + "learning_rate": 0.00010620735005998395, + "loss": 0.0029, + "step": 1018 + }, + { + "epoch": 1.45, + "learning_rate": 0.00010605796086932816, + "loss": 0.0019, + "step": 1019 + }, + { + "epoch": 1.45, + "learning_rate": 0.00010590855810800355, + "loss": 0.0083, + "step": 1020 + }, + { + "epoch": 1.45, + "eval_loss": 0.02736389823257923, + "eval_runtime": 23.1462, + "eval_samples_per_second": 43.204, + "eval_steps_per_second": 10.801, + "step": 1020 + }, + { + "epoch": 1.45, + "learning_rate": 0.00010575914211069302, + "loss": 0.0059, + "step": 1021 + }, + { + "epoch": 1.45, + "learning_rate": 0.00010560971321210896, + "loss": 0.0036, + "step": 1022 + }, + { + "epoch": 1.46, + "learning_rate": 0.00010546027174699276, + "loss": 0.0079, + "step": 1023 + }, + { + "epoch": 1.46, + "learning_rate": 0.00010531081805011393, + "loss": 0.0075, + "step": 1024 + }, + { + "epoch": 1.46, + "learning_rate": 0.00010516135245626939, + "loss": 0.0092, + "step": 1025 + }, + { + "epoch": 1.46, + "learning_rate": 0.00010501187530028271, + "loss": 0.0077, + "step": 1026 + }, + { + "epoch": 1.46, + "learning_rate": 0.00010486238691700335, + "loss": 0.0091, + "step": 1027 + }, + { + "epoch": 1.46, + "learning_rate": 0.00010471288764130595, + "loss": 0.0163, + "step": 1028 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001045633778080895, + "loss": 0.0135, + "step": 1029 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001044138577522767, + "loss": 0.0081, + "step": 1030 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001042643278088131, + "loss": 0.0049, + "step": 1031 + }, + { + "epoch": 1.47, + "learning_rate": 0.00010411478831266643, + "loss": 0.0076, + "step": 1032 + }, + { + "epoch": 1.47, + "learning_rate": 0.00010396523959882578, + "loss": 0.0087, + "step": 1033 + }, + { + "epoch": 1.47, + "learning_rate": 0.00010381568200230097, + "loss": 0.01, + "step": 1034 + }, + { + "epoch": 1.47, + "learning_rate": 0.00010366611585812158, + "loss": 0.0076, + "step": 1035 + }, + { + "epoch": 1.47, + "learning_rate": 0.00010351654150133654, + "loss": 0.0042, + "step": 1036 + }, + { + "epoch": 1.48, + "learning_rate": 0.00010336695926701296, + "loss": 0.0033, + "step": 1037 + }, + { + "epoch": 1.48, + "learning_rate": 0.00010321736949023577, + "loss": 0.0134, + "step": 1038 + }, + { + "epoch": 1.48, + "learning_rate": 0.00010306777250610666, + "loss": 0.0042, + "step": 1039 + }, + { + "epoch": 1.48, + "learning_rate": 0.00010291816864974359, + "loss": 0.0047, + "step": 1040 + }, + { + "epoch": 1.48, + "eval_loss": 0.03592601418495178, + "eval_runtime": 23.3519, + "eval_samples_per_second": 42.823, + "eval_steps_per_second": 10.706, + "step": 1040 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001027685582562798, + "loss": 0.015, + "step": 1041 + }, + { + "epoch": 1.48, + "learning_rate": 0.00010261894166086327, + "loss": 0.0139, + "step": 1042 + }, + { + "epoch": 1.48, + "learning_rate": 0.00010246931919865581, + "loss": 0.0065, + "step": 1043 + }, + { + "epoch": 1.49, + "learning_rate": 0.00010231969120483242, + "loss": 0.0168, + "step": 1044 + }, + { + "epoch": 1.49, + "learning_rate": 0.00010217005801458044, + "loss": 0.0009, + "step": 1045 + }, + { + "epoch": 1.49, + "learning_rate": 0.00010202041996309889, + "loss": 0.016, + "step": 1046 + }, + { + "epoch": 1.49, + "learning_rate": 0.00010187077738559763, + "loss": 0.0073, + "step": 1047 + }, + { + "epoch": 1.49, + "learning_rate": 0.00010172113061729676, + "loss": 0.0138, + "step": 1048 + }, + { + "epoch": 1.49, + "learning_rate": 0.00010157147999342563, + "loss": 0.0123, + "step": 1049 + }, + { + "epoch": 1.49, + "learning_rate": 0.00010142182584922236, + "loss": 0.0025, + "step": 1050 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010127216851993285, + "loss": 0.0001, + "step": 1051 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010112250834081023, + "loss": 0.011, + "step": 1052 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001009728456471139, + "loss": 0.0156, + "step": 1053 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010082318077410905, + "loss": 0.0076, + "step": 1054 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010067351405706558, + "loss": 0.0055, + "step": 1055 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010052384583125764, + "loss": 0.0037, + "step": 1056 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010037417643196269, + "loss": 0.0069, + "step": 1057 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010022450619446091, + "loss": 0.0064, + "step": 1058 + }, + { + "epoch": 1.51, + "learning_rate": 0.00010007483545403422, + "loss": 0.0013, + "step": 1059 + }, + { + "epoch": 1.51, + "learning_rate": 9.992516454596579e-05, + "loss": 0.0221, + "step": 1060 + }, + { + "epoch": 1.51, + "eval_loss": 0.030091550201177597, + "eval_runtime": 23.3939, + "eval_samples_per_second": 42.746, + "eval_steps_per_second": 10.687, + "step": 1060 + }, + { + "epoch": 1.51, + "learning_rate": 9.977549380553913e-05, + "loss": 0.0066, + "step": 1061 + }, + { + "epoch": 1.51, + "learning_rate": 9.96258235680373e-05, + "loss": 0.0024, + "step": 1062 + }, + { + "epoch": 1.51, + "learning_rate": 9.947615416874237e-05, + "loss": 0.0134, + "step": 1063 + }, + { + "epoch": 1.51, + "learning_rate": 9.932648594293444e-05, + "loss": 0.0095, + "step": 1064 + }, + { + "epoch": 1.51, + "learning_rate": 9.917681922589096e-05, + "loss": 0.0056, + "step": 1065 + }, + { + "epoch": 1.52, + "learning_rate": 9.902715435288608e-05, + "loss": 0.0099, + "step": 1066 + }, + { + "epoch": 1.52, + "learning_rate": 9.88774916591898e-05, + "loss": 0.0111, + "step": 1067 + }, + { + "epoch": 1.52, + "learning_rate": 9.872783148006716e-05, + "loss": 0.0061, + "step": 1068 + }, + { + "epoch": 1.52, + "learning_rate": 9.857817415077769e-05, + "loss": 0.0049, + "step": 1069 + }, + { + "epoch": 1.52, + "learning_rate": 9.842852000657438e-05, + "loss": 0.003, + "step": 1070 + }, + { + "epoch": 1.52, + "learning_rate": 9.827886938270328e-05, + "loss": 0.0136, + "step": 1071 + }, + { + "epoch": 1.52, + "learning_rate": 9.81292226144024e-05, + "loss": 0.0121, + "step": 1072 + }, + { + "epoch": 1.53, + "learning_rate": 9.797958003690116e-05, + "loss": 0.0218, + "step": 1073 + }, + { + "epoch": 1.53, + "learning_rate": 9.782994198541957e-05, + "loss": 0.004, + "step": 1074 + }, + { + "epoch": 1.53, + "learning_rate": 9.768030879516759e-05, + "loss": 0.0052, + "step": 1075 + }, + { + "epoch": 1.53, + "learning_rate": 9.75306808013442e-05, + "loss": 0.0042, + "step": 1076 + }, + { + "epoch": 1.53, + "learning_rate": 9.738105833913675e-05, + "loss": 0.0019, + "step": 1077 + }, + { + "epoch": 1.53, + "learning_rate": 9.72314417437202e-05, + "loss": 0.0081, + "step": 1078 + }, + { + "epoch": 1.53, + "learning_rate": 9.708183135025643e-05, + "loss": 0.0049, + "step": 1079 + }, + { + "epoch": 1.54, + "learning_rate": 9.693222749389335e-05, + "loss": 0.0038, + "step": 1080 + }, + { + "epoch": 1.54, + "eval_loss": 0.02803967334330082, + "eval_runtime": 23.248, + "eval_samples_per_second": 43.014, + "eval_steps_per_second": 10.754, + "step": 1080 + }, + { + "epoch": 1.54, + "learning_rate": 9.678263050976428e-05, + "loss": 0.0098, + "step": 1081 + }, + { + "epoch": 1.54, + "learning_rate": 9.663304073298703e-05, + "loss": 0.0091, + "step": 1082 + }, + { + "epoch": 1.54, + "learning_rate": 9.648345849866348e-05, + "loss": 0.0095, + "step": 1083 + }, + { + "epoch": 1.54, + "learning_rate": 9.633388414187843e-05, + "loss": 0.0067, + "step": 1084 + }, + { + "epoch": 1.54, + "learning_rate": 9.61843179976991e-05, + "loss": 0.0097, + "step": 1085 + }, + { + "epoch": 1.54, + "learning_rate": 9.603476040117423e-05, + "loss": 0.0081, + "step": 1086 + }, + { + "epoch": 1.55, + "learning_rate": 9.58852116873336e-05, + "loss": 0.008, + "step": 1087 + }, + { + "epoch": 1.55, + "learning_rate": 9.573567219118693e-05, + "loss": 0.0063, + "step": 1088 + }, + { + "epoch": 1.55, + "learning_rate": 9.558614224772334e-05, + "loss": 0.0058, + "step": 1089 + }, + { + "epoch": 1.55, + "learning_rate": 9.54366221919105e-05, + "loss": 0.0019, + "step": 1090 + }, + { + "epoch": 1.55, + "learning_rate": 9.528711235869407e-05, + "loss": 0.0028, + "step": 1091 + }, + { + "epoch": 1.55, + "learning_rate": 9.513761308299666e-05, + "loss": 0.0059, + "step": 1092 + }, + { + "epoch": 1.55, + "learning_rate": 9.498812469971733e-05, + "loss": 0.0058, + "step": 1093 + }, + { + "epoch": 1.56, + "learning_rate": 9.483864754373062e-05, + "loss": 0.0071, + "step": 1094 + }, + { + "epoch": 1.56, + "learning_rate": 9.46891819498861e-05, + "loss": 0.0058, + "step": 1095 + }, + { + "epoch": 1.56, + "learning_rate": 9.453972825300728e-05, + "loss": 0.0088, + "step": 1096 + }, + { + "epoch": 1.56, + "learning_rate": 9.439028678789108e-05, + "loss": 0.0089, + "step": 1097 + }, + { + "epoch": 1.56, + "learning_rate": 9.424085788930702e-05, + "loss": 0.0087, + "step": 1098 + }, + { + "epoch": 1.56, + "learning_rate": 9.409144189199644e-05, + "loss": 0.0142, + "step": 1099 + }, + { + "epoch": 1.56, + "learning_rate": 9.394203913067186e-05, + "loss": 0.0052, + "step": 1100 + }, + { + "epoch": 1.56, + "eval_loss": 0.023527521640062332, + "eval_runtime": 23.6506, + "eval_samples_per_second": 42.282, + "eval_steps_per_second": 10.571, + "step": 1100 + }, + { + "epoch": 1.57, + "learning_rate": 9.379264994001607e-05, + "loss": 0.0198, + "step": 1101 + }, + { + "epoch": 1.57, + "learning_rate": 9.364327465468148e-05, + "loss": 0.0145, + "step": 1102 + }, + { + "epoch": 1.57, + "learning_rate": 9.349391360928932e-05, + "loss": 0.0008, + "step": 1103 + }, + { + "epoch": 1.57, + "learning_rate": 9.334456713842899e-05, + "loss": 0.0101, + "step": 1104 + }, + { + "epoch": 1.57, + "learning_rate": 9.319523557665721e-05, + "loss": 0.0046, + "step": 1105 + }, + { + "epoch": 1.57, + "learning_rate": 9.30459192584973e-05, + "loss": 0.0108, + "step": 1106 + }, + { + "epoch": 1.57, + "learning_rate": 9.289661851843838e-05, + "loss": 0.0031, + "step": 1107 + }, + { + "epoch": 1.58, + "learning_rate": 9.27473336909348e-05, + "loss": 0.0028, + "step": 1108 + }, + { + "epoch": 1.58, + "learning_rate": 9.25980651104051e-05, + "loss": 0.0018, + "step": 1109 + }, + { + "epoch": 1.58, + "learning_rate": 9.24488131112316e-05, + "loss": 0.005, + "step": 1110 + }, + { + "epoch": 1.58, + "learning_rate": 9.22995780277593e-05, + "loss": 0.0035, + "step": 1111 + }, + { + "epoch": 1.58, + "learning_rate": 9.215036019429547e-05, + "loss": 0.0005, + "step": 1112 + }, + { + "epoch": 1.58, + "learning_rate": 9.20011599451086e-05, + "loss": 0.0125, + "step": 1113 + }, + { + "epoch": 1.58, + "learning_rate": 9.185197761442787e-05, + "loss": 0.0065, + "step": 1114 + }, + { + "epoch": 1.59, + "learning_rate": 9.170281353644228e-05, + "loss": 0.0039, + "step": 1115 + }, + { + "epoch": 1.59, + "learning_rate": 9.155366804530002e-05, + "loss": 0.0178, + "step": 1116 + }, + { + "epoch": 1.59, + "learning_rate": 9.140454147510753e-05, + "loss": 0.0075, + "step": 1117 + }, + { + "epoch": 1.59, + "learning_rate": 9.125543415992895e-05, + "loss": 0.0025, + "step": 1118 + }, + { + "epoch": 1.59, + "learning_rate": 9.110634643378516e-05, + "loss": 0.0036, + "step": 1119 + }, + { + "epoch": 1.59, + "learning_rate": 9.095727863065337e-05, + "loss": 0.0084, + "step": 1120 + }, + { + "epoch": 1.59, + "eval_loss": 0.03232593089342117, + "eval_runtime": 23.3247, + "eval_samples_per_second": 42.873, + "eval_steps_per_second": 10.718, + "step": 1120 + }, + { + "epoch": 1.59, + "learning_rate": 9.080823108446602e-05, + "loss": 0.0012, + "step": 1121 + }, + { + "epoch": 1.6, + "learning_rate": 9.065920412911015e-05, + "loss": 0.0039, + "step": 1122 + }, + { + "epoch": 1.6, + "learning_rate": 9.05101980984267e-05, + "loss": 0.0053, + "step": 1123 + }, + { + "epoch": 1.6, + "learning_rate": 9.036121332620983e-05, + "loss": 0.0046, + "step": 1124 + }, + { + "epoch": 1.6, + "learning_rate": 9.021225014620595e-05, + "loss": 0.0079, + "step": 1125 + }, + { + "epoch": 1.6, + "learning_rate": 9.006330889211316e-05, + "loss": 0.0025, + "step": 1126 + }, + { + "epoch": 1.6, + "learning_rate": 8.99143898975804e-05, + "loss": 0.0037, + "step": 1127 + }, + { + "epoch": 1.6, + "learning_rate": 8.976549349620685e-05, + "loss": 0.0081, + "step": 1128 + }, + { + "epoch": 1.61, + "learning_rate": 8.961662002154096e-05, + "loss": 0.0028, + "step": 1129 + }, + { + "epoch": 1.61, + "learning_rate": 8.946776980707989e-05, + "loss": 0.0127, + "step": 1130 + }, + { + "epoch": 1.61, + "learning_rate": 8.931894318626862e-05, + "loss": 0.0066, + "step": 1131 + }, + { + "epoch": 1.61, + "learning_rate": 8.91701404924994e-05, + "loss": 0.0013, + "step": 1132 + }, + { + "epoch": 1.61, + "learning_rate": 8.902136205911077e-05, + "loss": 0.008, + "step": 1133 + }, + { + "epoch": 1.61, + "learning_rate": 8.8872608219387e-05, + "loss": 0.0004, + "step": 1134 + }, + { + "epoch": 1.61, + "learning_rate": 8.872387930655714e-05, + "loss": 0.0188, + "step": 1135 + }, + { + "epoch": 1.62, + "learning_rate": 8.857517565379461e-05, + "loss": 0.0109, + "step": 1136 + }, + { + "epoch": 1.62, + "learning_rate": 8.842649759421604e-05, + "loss": 0.0057, + "step": 1137 + }, + { + "epoch": 1.62, + "learning_rate": 8.827784546088087e-05, + "loss": 0.0077, + "step": 1138 + }, + { + "epoch": 1.62, + "learning_rate": 8.812921958679032e-05, + "loss": 0.0126, + "step": 1139 + }, + { + "epoch": 1.62, + "learning_rate": 8.798062030488695e-05, + "loss": 0.012, + "step": 1140 + }, + { + "epoch": 1.62, + "eval_loss": 0.031952742487192154, + "eval_runtime": 23.4079, + "eval_samples_per_second": 42.721, + "eval_steps_per_second": 10.68, + "step": 1140 + }, + { + "epoch": 1.62, + "learning_rate": 8.783204794805364e-05, + "loss": 0.0096, + "step": 1141 + }, + { + "epoch": 1.62, + "learning_rate": 8.7683502849113e-05, + "loss": 0.0021, + "step": 1142 + }, + { + "epoch": 1.63, + "learning_rate": 8.753498534082647e-05, + "loss": 0.0042, + "step": 1143 + }, + { + "epoch": 1.63, + "learning_rate": 8.738649575589393e-05, + "loss": 0.0036, + "step": 1144 + }, + { + "epoch": 1.63, + "learning_rate": 8.723803442695245e-05, + "loss": 0.0312, + "step": 1145 + }, + { + "epoch": 1.63, + "learning_rate": 8.708960168657592e-05, + "loss": 0.0092, + "step": 1146 + }, + { + "epoch": 1.63, + "learning_rate": 8.694119786727417e-05, + "loss": 0.0037, + "step": 1147 + }, + { + "epoch": 1.63, + "learning_rate": 8.679282330149225e-05, + "loss": 0.0052, + "step": 1148 + }, + { + "epoch": 1.63, + "learning_rate": 8.664447832160972e-05, + "loss": 0.0214, + "step": 1149 + }, + { + "epoch": 1.64, + "learning_rate": 8.649616325993974e-05, + "loss": 0.0113, + "step": 1150 + }, + { + "epoch": 1.64, + "learning_rate": 8.634787844872856e-05, + "loss": 0.0039, + "step": 1151 + }, + { + "epoch": 1.64, + "learning_rate": 8.619962422015466e-05, + "loss": 0.0069, + "step": 1152 + }, + { + "epoch": 1.64, + "learning_rate": 8.605140090632793e-05, + "loss": 0.006, + "step": 1153 + }, + { + "epoch": 1.64, + "learning_rate": 8.590320883928909e-05, + "loss": 0.0118, + "step": 1154 + }, + { + "epoch": 1.64, + "learning_rate": 8.575504835100882e-05, + "loss": 0.007, + "step": 1155 + }, + { + "epoch": 1.64, + "learning_rate": 8.560691977338709e-05, + "loss": 0.0078, + "step": 1156 + }, + { + "epoch": 1.65, + "learning_rate": 8.545882343825235e-05, + "loss": 0.0089, + "step": 1157 + }, + { + "epoch": 1.65, + "learning_rate": 8.531075967736083e-05, + "loss": 0.0063, + "step": 1158 + }, + { + "epoch": 1.65, + "learning_rate": 8.51627288223958e-05, + "loss": 0.0074, + "step": 1159 + }, + { + "epoch": 1.65, + "learning_rate": 8.501473120496685e-05, + "loss": 0.0019, + "step": 1160 + }, + { + "epoch": 1.65, + "eval_loss": 0.02555186301469803, + "eval_runtime": 23.1703, + "eval_samples_per_second": 43.159, + "eval_steps_per_second": 10.79, + "step": 1160 + }, + { + "epoch": 1.65, + "learning_rate": 8.486676715660904e-05, + "loss": 0.0122, + "step": 1161 + }, + { + "epoch": 1.65, + "learning_rate": 8.47188370087823e-05, + "loss": 0.0011, + "step": 1162 + }, + { + "epoch": 1.65, + "learning_rate": 8.457094109287049e-05, + "loss": 0.0084, + "step": 1163 + }, + { + "epoch": 1.66, + "learning_rate": 8.442307974018101e-05, + "loss": 0.0037, + "step": 1164 + }, + { + "epoch": 1.66, + "learning_rate": 8.427525328194365e-05, + "loss": 0.0081, + "step": 1165 + }, + { + "epoch": 1.66, + "learning_rate": 8.41274620493101e-05, + "loss": 0.0131, + "step": 1166 + }, + { + "epoch": 1.66, + "learning_rate": 8.397970637335307e-05, + "loss": 0.0014, + "step": 1167 + }, + { + "epoch": 1.66, + "learning_rate": 8.383198658506576e-05, + "loss": 0.0082, + "step": 1168 + }, + { + "epoch": 1.66, + "learning_rate": 8.368430301536086e-05, + "loss": 0.0059, + "step": 1169 + }, + { + "epoch": 1.66, + "learning_rate": 8.353665599506995e-05, + "loss": 0.0048, + "step": 1170 + }, + { + "epoch": 1.67, + "learning_rate": 8.338904585494268e-05, + "loss": 0.0127, + "step": 1171 + }, + { + "epoch": 1.67, + "learning_rate": 8.324147292564626e-05, + "loss": 0.0108, + "step": 1172 + }, + { + "epoch": 1.67, + "learning_rate": 8.309393753776438e-05, + "loss": 0.0018, + "step": 1173 + }, + { + "epoch": 1.67, + "learning_rate": 8.294644002179667e-05, + "loss": 0.0075, + "step": 1174 + }, + { + "epoch": 1.67, + "learning_rate": 8.279898070815789e-05, + "loss": 0.0111, + "step": 1175 + }, + { + "epoch": 1.67, + "learning_rate": 8.265155992717734e-05, + "loss": 0.0108, + "step": 1176 + }, + { + "epoch": 1.67, + "learning_rate": 8.25041780090979e-05, + "loss": 0.0045, + "step": 1177 + }, + { + "epoch": 1.68, + "learning_rate": 8.235683528407541e-05, + "loss": 0.0005, + "step": 1178 + }, + { + "epoch": 1.68, + "learning_rate": 8.22095320821779e-05, + "loss": 0.0202, + "step": 1179 + }, + { + "epoch": 1.68, + "learning_rate": 8.206226873338486e-05, + "loss": 0.0175, + "step": 1180 + }, + { + "epoch": 1.68, + "eval_loss": 0.029980003833770752, + "eval_runtime": 23.4162, + "eval_samples_per_second": 42.706, + "eval_steps_per_second": 10.676, + "step": 1180 + }, + { + "epoch": 1.68, + "learning_rate": 8.19150455675866e-05, + "loss": 0.0077, + "step": 1181 + }, + { + "epoch": 1.68, + "learning_rate": 8.17678629145833e-05, + "loss": 0.0155, + "step": 1182 + }, + { + "epoch": 1.68, + "learning_rate": 8.162072110408443e-05, + "loss": 0.0081, + "step": 1183 + }, + { + "epoch": 1.68, + "learning_rate": 8.14736204657079e-05, + "loss": 0.0109, + "step": 1184 + }, + { + "epoch": 1.69, + "learning_rate": 8.132656132897955e-05, + "loss": 0.01, + "step": 1185 + }, + { + "epoch": 1.69, + "learning_rate": 8.117954402333211e-05, + "loss": 0.0043, + "step": 1186 + }, + { + "epoch": 1.69, + "learning_rate": 8.103256887810464e-05, + "loss": 0.0048, + "step": 1187 + }, + { + "epoch": 1.69, + "learning_rate": 8.08856362225417e-05, + "loss": 0.0033, + "step": 1188 + }, + { + "epoch": 1.69, + "learning_rate": 8.073874638579285e-05, + "loss": 0.0046, + "step": 1189 + }, + { + "epoch": 1.69, + "learning_rate": 8.059189969691154e-05, + "loss": 0.0087, + "step": 1190 + }, + { + "epoch": 1.69, + "learning_rate": 8.04450964848546e-05, + "loss": 0.0022, + "step": 1191 + }, + { + "epoch": 1.7, + "learning_rate": 8.029833707848151e-05, + "loss": 0.003, + "step": 1192 + }, + { + "epoch": 1.7, + "learning_rate": 8.015162180655365e-05, + "loss": 0.0038, + "step": 1193 + }, + { + "epoch": 1.7, + "learning_rate": 8.000495099773344e-05, + "loss": 0.0049, + "step": 1194 + }, + { + "epoch": 1.7, + "learning_rate": 7.985832498058376e-05, + "loss": 0.0086, + "step": 1195 + }, + { + "epoch": 1.7, + "learning_rate": 7.971174408356712e-05, + "loss": 0.007, + "step": 1196 + }, + { + "epoch": 1.7, + "learning_rate": 7.956520863504496e-05, + "loss": 0.0171, + "step": 1197 + }, + { + "epoch": 1.7, + "learning_rate": 7.941871896327698e-05, + "loss": 0.0187, + "step": 1198 + }, + { + "epoch": 1.71, + "learning_rate": 7.92722753964202e-05, + "loss": 0.0055, + "step": 1199 + }, + { + "epoch": 1.71, + "learning_rate": 7.912587826252844e-05, + "loss": 0.0078, + "step": 1200 + }, + { + "epoch": 1.71, + "eval_loss": 0.036186087876558304, + "eval_runtime": 23.2617, + "eval_samples_per_second": 42.989, + "eval_steps_per_second": 10.747, + "step": 1200 + }, + { + "epoch": 1.71, + "learning_rate": 7.897952788955153e-05, + "loss": 0.0039, + "step": 1201 + }, + { + "epoch": 1.71, + "learning_rate": 7.883322460533448e-05, + "loss": 0.0105, + "step": 1202 + }, + { + "epoch": 1.71, + "learning_rate": 7.868696873761686e-05, + "loss": 0.0036, + "step": 1203 + }, + { + "epoch": 1.71, + "learning_rate": 7.8540760614032e-05, + "loss": 0.0041, + "step": 1204 + }, + { + "epoch": 1.71, + "learning_rate": 7.839460056210631e-05, + "loss": 0.0098, + "step": 1205 + }, + { + "epoch": 1.72, + "learning_rate": 7.824848890925844e-05, + "loss": 0.0039, + "step": 1206 + }, + { + "epoch": 1.72, + "learning_rate": 7.81024259827987e-05, + "loss": 0.007, + "step": 1207 + }, + { + "epoch": 1.72, + "learning_rate": 7.795641210992818e-05, + "loss": 0.0067, + "step": 1208 + }, + { + "epoch": 1.72, + "learning_rate": 7.781044761773819e-05, + "loss": 0.0103, + "step": 1209 + }, + { + "epoch": 1.72, + "learning_rate": 7.76645328332093e-05, + "loss": 0.0135, + "step": 1210 + }, + { + "epoch": 1.72, + "learning_rate": 7.751866808321074e-05, + "loss": 0.0057, + "step": 1211 + }, + { + "epoch": 1.72, + "learning_rate": 7.73728536944997e-05, + "loss": 0.0038, + "step": 1212 + }, + { + "epoch": 1.73, + "learning_rate": 7.722708999372059e-05, + "loss": 0.0009, + "step": 1213 + }, + { + "epoch": 1.73, + "learning_rate": 7.708137730740421e-05, + "loss": 0.007, + "step": 1214 + }, + { + "epoch": 1.73, + "learning_rate": 7.693571596196709e-05, + "loss": 0.0147, + "step": 1215 + }, + { + "epoch": 1.73, + "learning_rate": 7.67901062837107e-05, + "loss": 0.0048, + "step": 1216 + }, + { + "epoch": 1.73, + "learning_rate": 7.664454859882092e-05, + "loss": 0.0032, + "step": 1217 + }, + { + "epoch": 1.73, + "learning_rate": 7.649904323336703e-05, + "loss": 0.0109, + "step": 1218 + }, + { + "epoch": 1.73, + "learning_rate": 7.63535905133011e-05, + "loss": 0.0035, + "step": 1219 + }, + { + "epoch": 1.74, + "learning_rate": 7.62081907644573e-05, + "loss": 0.0088, + "step": 1220 + }, + { + "epoch": 1.74, + "eval_loss": 0.030959945172071457, + "eval_runtime": 23.8318, + "eval_samples_per_second": 41.961, + "eval_steps_per_second": 10.49, + "step": 1220 + }, + { + "epoch": 1.74, + "learning_rate": 7.606284431255123e-05, + "loss": 0.0152, + "step": 1221 + }, + { + "epoch": 1.74, + "learning_rate": 7.591755148317896e-05, + "loss": 0.0136, + "step": 1222 + }, + { + "epoch": 1.74, + "learning_rate": 7.577231260181648e-05, + "loss": 0.0014, + "step": 1223 + }, + { + "epoch": 1.74, + "learning_rate": 7.562712799381889e-05, + "loss": 0.0072, + "step": 1224 + }, + { + "epoch": 1.74, + "learning_rate": 7.548199798441985e-05, + "loss": 0.0089, + "step": 1225 + }, + { + "epoch": 1.74, + "learning_rate": 7.53369228987306e-05, + "loss": 0.0135, + "step": 1226 + }, + { + "epoch": 1.75, + "learning_rate": 7.519190306173931e-05, + "loss": 0.0128, + "step": 1227 + }, + { + "epoch": 1.75, + "learning_rate": 7.504693879831044e-05, + "loss": 0.0106, + "step": 1228 + }, + { + "epoch": 1.75, + "learning_rate": 7.490203043318404e-05, + "loss": 0.015, + "step": 1229 + }, + { + "epoch": 1.75, + "learning_rate": 7.47571782909748e-05, + "loss": 0.0055, + "step": 1230 + }, + { + "epoch": 1.75, + "learning_rate": 7.461238269617152e-05, + "loss": 0.0026, + "step": 1231 + }, + { + "epoch": 1.75, + "learning_rate": 7.446764397313627e-05, + "loss": 0.0114, + "step": 1232 + }, + { + "epoch": 1.75, + "learning_rate": 7.432296244610388e-05, + "loss": 0.0072, + "step": 1233 + }, + { + "epoch": 1.76, + "learning_rate": 7.417833843918089e-05, + "loss": 0.0035, + "step": 1234 + }, + { + "epoch": 1.76, + "learning_rate": 7.403377227634505e-05, + "loss": 0.0068, + "step": 1235 + }, + { + "epoch": 1.76, + "learning_rate": 7.388926428144449e-05, + "loss": 0.0075, + "step": 1236 + }, + { + "epoch": 1.76, + "learning_rate": 7.374481477819715e-05, + "loss": 0.0096, + "step": 1237 + }, + { + "epoch": 1.76, + "learning_rate": 7.360042409018982e-05, + "loss": 0.0114, + "step": 1238 + }, + { + "epoch": 1.76, + "learning_rate": 7.345609254087756e-05, + "loss": 0.0055, + "step": 1239 + }, + { + "epoch": 1.76, + "learning_rate": 7.331182045358298e-05, + "loss": 0.0065, + "step": 1240 + }, + { + "epoch": 1.76, + "eval_loss": 0.03008873388171196, + "eval_runtime": 23.2189, + "eval_samples_per_second": 43.068, + "eval_steps_per_second": 10.767, + "step": 1240 + }, + { + "epoch": 1.77, + "learning_rate": 7.316760815149547e-05, + "loss": 0.0098, + "step": 1241 + }, + { + "epoch": 1.77, + "learning_rate": 7.302345595767053e-05, + "loss": 0.0045, + "step": 1242 + }, + { + "epoch": 1.77, + "learning_rate": 7.287936419502897e-05, + "loss": 0.0077, + "step": 1243 + }, + { + "epoch": 1.77, + "learning_rate": 7.273533318635616e-05, + "loss": 0.0062, + "step": 1244 + }, + { + "epoch": 1.77, + "learning_rate": 7.259136325430155e-05, + "loss": 0.006, + "step": 1245 + }, + { + "epoch": 1.77, + "learning_rate": 7.244745472137762e-05, + "loss": 0.0049, + "step": 1246 + }, + { + "epoch": 1.77, + "learning_rate": 7.230360790995935e-05, + "loss": 0.0128, + "step": 1247 + }, + { + "epoch": 1.78, + "learning_rate": 7.215982314228346e-05, + "loss": 0.007, + "step": 1248 + }, + { + "epoch": 1.78, + "learning_rate": 7.201610074044769e-05, + "loss": 0.0079, + "step": 1249 + }, + { + "epoch": 1.78, + "learning_rate": 7.187244102641006e-05, + "loss": 0.0048, + "step": 1250 + }, + { + "epoch": 1.78, + "learning_rate": 7.172884432198817e-05, + "loss": 0.0087, + "step": 1251 + }, + { + "epoch": 1.78, + "learning_rate": 7.158531094885843e-05, + "loss": 0.0117, + "step": 1252 + }, + { + "epoch": 1.78, + "learning_rate": 7.144184122855548e-05, + "loss": 0.0028, + "step": 1253 + }, + { + "epoch": 1.78, + "learning_rate": 7.129843548247126e-05, + "loss": 0.0092, + "step": 1254 + }, + { + "epoch": 1.79, + "learning_rate": 7.115509403185445e-05, + "loss": 0.0011, + "step": 1255 + }, + { + "epoch": 1.79, + "learning_rate": 7.101181719780968e-05, + "loss": 0.0078, + "step": 1256 + }, + { + "epoch": 1.79, + "learning_rate": 7.086860530129682e-05, + "loss": 0.0088, + "step": 1257 + }, + { + "epoch": 1.79, + "learning_rate": 7.072545866313036e-05, + "loss": 0.0119, + "step": 1258 + }, + { + "epoch": 1.79, + "learning_rate": 7.058237760397848e-05, + "loss": 0.0038, + "step": 1259 + }, + { + "epoch": 1.79, + "learning_rate": 7.043936244436254e-05, + "loss": 0.0059, + "step": 1260 + }, + { + "epoch": 1.79, + "eval_loss": 0.03484958037734032, + "eval_runtime": 23.4499, + "eval_samples_per_second": 42.644, + "eval_steps_per_second": 10.661, + "step": 1260 + }, + { + "epoch": 1.79, + "learning_rate": 7.029641350465618e-05, + "loss": 0.0059, + "step": 1261 + }, + { + "epoch": 1.8, + "learning_rate": 7.015353110508484e-05, + "loss": 0.0029, + "step": 1262 + }, + { + "epoch": 1.8, + "learning_rate": 7.00107155657248e-05, + "loss": 0.0035, + "step": 1263 + }, + { + "epoch": 1.8, + "learning_rate": 6.986796720650259e-05, + "loss": 0.0047, + "step": 1264 + }, + { + "epoch": 1.8, + "learning_rate": 6.972528634719418e-05, + "loss": 0.006, + "step": 1265 + }, + { + "epoch": 1.8, + "learning_rate": 6.958267330742449e-05, + "loss": 0.0111, + "step": 1266 + }, + { + "epoch": 1.8, + "learning_rate": 6.944012840666639e-05, + "loss": 0.0108, + "step": 1267 + }, + { + "epoch": 1.8, + "learning_rate": 6.929765196424012e-05, + "loss": 0.0064, + "step": 1268 + }, + { + "epoch": 1.81, + "learning_rate": 6.915524429931255e-05, + "loss": 0.011, + "step": 1269 + }, + { + "epoch": 1.81, + "learning_rate": 6.901290573089661e-05, + "loss": 0.0066, + "step": 1270 + }, + { + "epoch": 1.81, + "learning_rate": 6.887063657785027e-05, + "loss": 0.0032, + "step": 1271 + }, + { + "epoch": 1.81, + "learning_rate": 6.872843715887608e-05, + "loss": 0.0043, + "step": 1272 + }, + { + "epoch": 1.81, + "learning_rate": 6.858630779252038e-05, + "loss": 0.0193, + "step": 1273 + }, + { + "epoch": 1.81, + "learning_rate": 6.844424879717261e-05, + "loss": 0.0032, + "step": 1274 + }, + { + "epoch": 1.81, + "learning_rate": 6.830226049106451e-05, + "loss": 0.0064, + "step": 1275 + }, + { + "epoch": 1.82, + "learning_rate": 6.816034319226947e-05, + "loss": 0.0067, + "step": 1276 + }, + { + "epoch": 1.82, + "learning_rate": 6.801849721870184e-05, + "loss": 0.0075, + "step": 1277 + }, + { + "epoch": 1.82, + "learning_rate": 6.787672288811621e-05, + "loss": 0.0149, + "step": 1278 + }, + { + "epoch": 1.82, + "learning_rate": 6.773502051810665e-05, + "loss": 0.0034, + "step": 1279 + }, + { + "epoch": 1.82, + "learning_rate": 6.759339042610604e-05, + "loss": 0.0066, + "step": 1280 + }, + { + "epoch": 1.82, + "eval_loss": 0.034088339656591415, + "eval_runtime": 23.4125, + "eval_samples_per_second": 42.712, + "eval_steps_per_second": 10.678, + "step": 1280 + }, + { + "epoch": 1.82, + "learning_rate": 6.745183292938526e-05, + "loss": 0.002, + "step": 1281 + }, + { + "epoch": 1.82, + "learning_rate": 6.731034834505277e-05, + "loss": 0.0162, + "step": 1282 + }, + { + "epoch": 1.83, + "learning_rate": 6.716893699005353e-05, + "loss": 0.0182, + "step": 1283 + }, + { + "epoch": 1.83, + "learning_rate": 6.702759918116847e-05, + "loss": 0.0122, + "step": 1284 + }, + { + "epoch": 1.83, + "learning_rate": 6.68863352350138e-05, + "loss": 0.0083, + "step": 1285 + }, + { + "epoch": 1.83, + "learning_rate": 6.67451454680403e-05, + "loss": 0.008, + "step": 1286 + }, + { + "epoch": 1.83, + "learning_rate": 6.660403019653253e-05, + "loss": 0.0049, + "step": 1287 + }, + { + "epoch": 1.83, + "learning_rate": 6.646298973660817e-05, + "loss": 0.0051, + "step": 1288 + }, + { + "epoch": 1.83, + "learning_rate": 6.632202440421731e-05, + "loss": 0.0024, + "step": 1289 + }, + { + "epoch": 1.83, + "learning_rate": 6.618113451514179e-05, + "loss": 0.0041, + "step": 1290 + }, + { + "epoch": 1.84, + "learning_rate": 6.604032038499441e-05, + "loss": 0.0044, + "step": 1291 + }, + { + "epoch": 1.84, + "learning_rate": 6.589958232921824e-05, + "loss": 0.0091, + "step": 1292 + }, + { + "epoch": 1.84, + "learning_rate": 6.575892066308597e-05, + "loss": 0.0056, + "step": 1293 + }, + { + "epoch": 1.84, + "learning_rate": 6.561833570169918e-05, + "loss": 0.02, + "step": 1294 + }, + { + "epoch": 1.84, + "learning_rate": 6.547782775998756e-05, + "loss": 0.0065, + "step": 1295 + }, + { + "epoch": 1.84, + "learning_rate": 6.53373971527083e-05, + "loss": 0.009, + "step": 1296 + }, + { + "epoch": 1.84, + "learning_rate": 6.519704419444536e-05, + "loss": 0.0062, + "step": 1297 + }, + { + "epoch": 1.85, + "learning_rate": 6.505676919960877e-05, + "loss": 0.0032, + "step": 1298 + }, + { + "epoch": 1.85, + "learning_rate": 6.491657248243384e-05, + "loss": 0.0034, + "step": 1299 + }, + { + "epoch": 1.85, + "learning_rate": 6.47764543569806e-05, + "loss": 0.0015, + "step": 1300 + }, + { + "epoch": 1.85, + "eval_loss": 0.028048571199178696, + "eval_runtime": 23.3162, + "eval_samples_per_second": 42.889, + "eval_steps_per_second": 10.722, + "step": 1300 + }, + { + "epoch": 1.85, + "learning_rate": 6.463641513713297e-05, + "loss": 0.012, + "step": 1301 + }, + { + "epoch": 1.85, + "learning_rate": 6.449645513659818e-05, + "loss": 0.0023, + "step": 1302 + }, + { + "epoch": 1.85, + "learning_rate": 6.435657466890597e-05, + "loss": 0.0113, + "step": 1303 + }, + { + "epoch": 1.85, + "learning_rate": 6.421677404740783e-05, + "loss": 0.0031, + "step": 1304 + }, + { + "epoch": 1.86, + "learning_rate": 6.407705358527646e-05, + "loss": 0.0113, + "step": 1305 + }, + { + "epoch": 1.86, + "learning_rate": 6.393741359550507e-05, + "loss": 0.004, + "step": 1306 + }, + { + "epoch": 1.86, + "learning_rate": 6.379785439090648e-05, + "loss": 0.0037, + "step": 1307 + }, + { + "epoch": 1.86, + "learning_rate": 6.365837628411254e-05, + "loss": 0.0049, + "step": 1308 + }, + { + "epoch": 1.86, + "learning_rate": 6.351897958757346e-05, + "loss": 0.0058, + "step": 1309 + }, + { + "epoch": 1.86, + "learning_rate": 6.337966461355716e-05, + "loss": 0.0053, + "step": 1310 + }, + { + "epoch": 1.86, + "learning_rate": 6.324043167414837e-05, + "loss": 0.0024, + "step": 1311 + }, + { + "epoch": 1.87, + "learning_rate": 6.310128108124811e-05, + "loss": 0.0147, + "step": 1312 + }, + { + "epoch": 1.87, + "learning_rate": 6.296221314657289e-05, + "loss": 0.011, + "step": 1313 + }, + { + "epoch": 1.87, + "learning_rate": 6.282322818165413e-05, + "loss": 0.0088, + "step": 1314 + }, + { + "epoch": 1.87, + "learning_rate": 6.268432649783734e-05, + "loss": 0.0035, + "step": 1315 + }, + { + "epoch": 1.87, + "learning_rate": 6.254550840628142e-05, + "loss": 0.0058, + "step": 1316 + }, + { + "epoch": 1.87, + "learning_rate": 6.240677421795808e-05, + "loss": 0.0042, + "step": 1317 + }, + { + "epoch": 1.87, + "learning_rate": 6.226812424365109e-05, + "loss": 0.006, + "step": 1318 + }, + { + "epoch": 1.88, + "learning_rate": 6.212955879395554e-05, + "loss": 0.0043, + "step": 1319 + }, + { + "epoch": 1.88, + "learning_rate": 6.199107817927714e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 1.88, + "eval_loss": 0.026580575853586197, + "eval_runtime": 23.5127, + "eval_samples_per_second": 42.53, + "eval_steps_per_second": 10.633, + "step": 1320 + }, + { + "epoch": 1.88, + "learning_rate": 6.185268270983156e-05, + "loss": 0.0195, + "step": 1321 + }, + { + "epoch": 1.88, + "learning_rate": 6.171437269564382e-05, + "loss": 0.0053, + "step": 1322 + }, + { + "epoch": 1.88, + "learning_rate": 6.157614844654743e-05, + "loss": 0.0052, + "step": 1323 + }, + { + "epoch": 1.88, + "learning_rate": 6.143801027218378e-05, + "loss": 0.0057, + "step": 1324 + }, + { + "epoch": 1.88, + "learning_rate": 6.129995848200143e-05, + "loss": 0.0031, + "step": 1325 + }, + { + "epoch": 1.89, + "learning_rate": 6.116199338525552e-05, + "loss": 0.0177, + "step": 1326 + }, + { + "epoch": 1.89, + "learning_rate": 6.102411529100687e-05, + "loss": 0.0105, + "step": 1327 + }, + { + "epoch": 1.89, + "learning_rate": 6.088632450812145e-05, + "loss": 0.0034, + "step": 1328 + }, + { + "epoch": 1.89, + "learning_rate": 6.074862134526962e-05, + "loss": 0.017, + "step": 1329 + }, + { + "epoch": 1.89, + "learning_rate": 6.061100611092552e-05, + "loss": 0.0053, + "step": 1330 + }, + { + "epoch": 1.89, + "learning_rate": 6.047347911336627e-05, + "loss": 0.0012, + "step": 1331 + }, + { + "epoch": 1.89, + "learning_rate": 6.033604066067131e-05, + "loss": 0.0149, + "step": 1332 + }, + { + "epoch": 1.9, + "learning_rate": 6.0198691060721765e-05, + "loss": 0.0134, + "step": 1333 + }, + { + "epoch": 1.9, + "learning_rate": 6.006143062119967e-05, + "loss": 0.0074, + "step": 1334 + }, + { + "epoch": 1.9, + "learning_rate": 5.992425964958741e-05, + "loss": 0.009, + "step": 1335 + }, + { + "epoch": 1.9, + "learning_rate": 5.978717845316689e-05, + "loss": 0.0043, + "step": 1336 + }, + { + "epoch": 1.9, + "learning_rate": 5.9650187339018904e-05, + "loss": 0.0107, + "step": 1337 + }, + { + "epoch": 1.9, + "learning_rate": 5.951328661402246e-05, + "loss": 0.0029, + "step": 1338 + }, + { + "epoch": 1.9, + "learning_rate": 5.93764765848541e-05, + "loss": 0.0064, + "step": 1339 + }, + { + "epoch": 1.91, + "learning_rate": 5.923975755798714e-05, + "loss": 0.0053, + "step": 1340 + }, + { + "epoch": 1.91, + "eval_loss": 0.03497832268476486, + "eval_runtime": 23.1913, + "eval_samples_per_second": 43.12, + "eval_steps_per_second": 10.78, + "step": 1340 + }, + { + "epoch": 1.91, + "learning_rate": 5.910312983969114e-05, + "loss": 0.0095, + "step": 1341 + }, + { + "epoch": 1.91, + "learning_rate": 5.8966593736030994e-05, + "loss": 0.0064, + "step": 1342 + }, + { + "epoch": 1.91, + "learning_rate": 5.8830149552866474e-05, + "loss": 0.0107, + "step": 1343 + }, + { + "epoch": 1.91, + "learning_rate": 5.869379759585137e-05, + "loss": 0.0092, + "step": 1344 + }, + { + "epoch": 1.91, + "learning_rate": 5.855753817043287e-05, + "loss": 0.0033, + "step": 1345 + }, + { + "epoch": 1.91, + "learning_rate": 5.8421371581850916e-05, + "loss": 0.0033, + "step": 1346 + }, + { + "epoch": 1.92, + "learning_rate": 5.8285298135137504e-05, + "loss": 0.0103, + "step": 1347 + }, + { + "epoch": 1.92, + "learning_rate": 5.814931813511587e-05, + "loss": 0.0071, + "step": 1348 + }, + { + "epoch": 1.92, + "learning_rate": 5.801343188640007e-05, + "loss": 0.0021, + "step": 1349 + }, + { + "epoch": 1.92, + "learning_rate": 5.787763969339396e-05, + "loss": 0.0107, + "step": 1350 + }, + { + "epoch": 1.92, + "learning_rate": 5.774194186029094e-05, + "loss": 0.0041, + "step": 1351 + }, + { + "epoch": 1.92, + "learning_rate": 5.760633869107277e-05, + "loss": 0.0034, + "step": 1352 + }, + { + "epoch": 1.92, + "learning_rate": 5.7470830489509353e-05, + "loss": 0.0136, + "step": 1353 + }, + { + "epoch": 1.93, + "learning_rate": 5.733541755915765e-05, + "loss": 0.0209, + "step": 1354 + }, + { + "epoch": 1.93, + "learning_rate": 5.7200100203361506e-05, + "loss": 0.0012, + "step": 1355 + }, + { + "epoch": 1.93, + "learning_rate": 5.7064878725250334e-05, + "loss": 0.0041, + "step": 1356 + }, + { + "epoch": 1.93, + "learning_rate": 5.692975342773901e-05, + "loss": 0.005, + "step": 1357 + }, + { + "epoch": 1.93, + "learning_rate": 5.679472461352677e-05, + "loss": 0.008, + "step": 1358 + }, + { + "epoch": 1.93, + "learning_rate": 5.665979258509684e-05, + "loss": 0.0039, + "step": 1359 + }, + { + "epoch": 1.93, + "learning_rate": 5.652495764471559e-05, + "loss": 0.0077, + "step": 1360 + }, + { + "epoch": 1.93, + "eval_loss": 0.03327401354908943, + "eval_runtime": 23.2292, + "eval_samples_per_second": 43.049, + "eval_steps_per_second": 10.762, + "step": 1360 + }, + { + "epoch": 1.94, + "learning_rate": 5.6390220094431965e-05, + "loss": 0.0048, + "step": 1361 + }, + { + "epoch": 1.94, + "learning_rate": 5.6255580236076576e-05, + "loss": 0.0079, + "step": 1362 + }, + { + "epoch": 1.94, + "learning_rate": 5.612103837126136e-05, + "loss": 0.006, + "step": 1363 + }, + { + "epoch": 1.94, + "learning_rate": 5.5986594801378644e-05, + "loss": 0.0143, + "step": 1364 + }, + { + "epoch": 1.94, + "learning_rate": 5.5852249827600644e-05, + "loss": 0.0071, + "step": 1365 + }, + { + "epoch": 1.94, + "learning_rate": 5.5718003750878566e-05, + "loss": 0.0048, + "step": 1366 + }, + { + "epoch": 1.94, + "learning_rate": 5.558385687194221e-05, + "loss": 0.0057, + "step": 1367 + }, + { + "epoch": 1.95, + "learning_rate": 5.544980949129912e-05, + "loss": 0.001, + "step": 1368 + }, + { + "epoch": 1.95, + "learning_rate": 5.531586190923388e-05, + "loss": 0.0053, + "step": 1369 + }, + { + "epoch": 1.95, + "learning_rate": 5.51820144258076e-05, + "loss": 0.0045, + "step": 1370 + }, + { + "epoch": 1.95, + "learning_rate": 5.5048267340857126e-05, + "loss": 0.0062, + "step": 1371 + }, + { + "epoch": 1.95, + "learning_rate": 5.491462095399443e-05, + "loss": 0.0057, + "step": 1372 + }, + { + "epoch": 1.95, + "learning_rate": 5.47810755646058e-05, + "loss": 0.0036, + "step": 1373 + }, + { + "epoch": 1.95, + "learning_rate": 5.464763147185138e-05, + "loss": 0.0165, + "step": 1374 + }, + { + "epoch": 1.96, + "learning_rate": 5.451428897466436e-05, + "loss": 0.0039, + "step": 1375 + }, + { + "epoch": 1.96, + "learning_rate": 5.43810483717504e-05, + "loss": 0.0016, + "step": 1376 + }, + { + "epoch": 1.96, + "learning_rate": 5.424790996158674e-05, + "loss": 0.0053, + "step": 1377 + }, + { + "epoch": 1.96, + "learning_rate": 5.411487404242187e-05, + "loss": 0.0101, + "step": 1378 + }, + { + "epoch": 1.96, + "learning_rate": 5.39819409122746e-05, + "loss": 0.0072, + "step": 1379 + }, + { + "epoch": 1.96, + "learning_rate": 5.384911086893354e-05, + "loss": 0.0081, + "step": 1380 + }, + { + "epoch": 1.96, + "eval_loss": 0.031983714550733566, + "eval_runtime": 23.2134, + "eval_samples_per_second": 43.079, + "eval_steps_per_second": 10.77, + "step": 1380 + }, + { + "epoch": 1.96, + "learning_rate": 5.371638420995626e-05, + "loss": 0.0004, + "step": 1381 + }, + { + "epoch": 1.97, + "learning_rate": 5.358376123266882e-05, + "loss": 0.0085, + "step": 1382 + }, + { + "epoch": 1.97, + "learning_rate": 5.3451242234165e-05, + "loss": 0.0188, + "step": 1383 + }, + { + "epoch": 1.97, + "learning_rate": 5.331882751130572e-05, + "loss": 0.0009, + "step": 1384 + }, + { + "epoch": 1.97, + "learning_rate": 5.318651736071816e-05, + "loss": 0.0067, + "step": 1385 + }, + { + "epoch": 1.97, + "learning_rate": 5.305431207879532e-05, + "loss": 0.0128, + "step": 1386 + }, + { + "epoch": 1.97, + "learning_rate": 5.2922211961695346e-05, + "loss": 0.0052, + "step": 1387 + }, + { + "epoch": 1.97, + "learning_rate": 5.279021730534074e-05, + "loss": 0.0065, + "step": 1388 + }, + { + "epoch": 1.98, + "learning_rate": 5.265832840541769e-05, + "loss": 0.0177, + "step": 1389 + }, + { + "epoch": 1.98, + "learning_rate": 5.252654555737557e-05, + "loss": 0.0031, + "step": 1390 + }, + { + "epoch": 1.98, + "learning_rate": 5.239486905642619e-05, + "loss": 0.0044, + "step": 1391 + }, + { + "epoch": 1.98, + "learning_rate": 5.2263299197543095e-05, + "loss": 0.0041, + "step": 1392 + }, + { + "epoch": 1.98, + "learning_rate": 5.213183627546087e-05, + "loss": 0.0034, + "step": 1393 + }, + { + "epoch": 1.98, + "learning_rate": 5.200048058467464e-05, + "loss": 0.0016, + "step": 1394 + }, + { + "epoch": 1.98, + "learning_rate": 5.186923241943932e-05, + "loss": 0.005, + "step": 1395 + }, + { + "epoch": 1.99, + "learning_rate": 5.173809207376893e-05, + "loss": 0.0168, + "step": 1396 + }, + { + "epoch": 1.99, + "learning_rate": 5.160705984143588e-05, + "loss": 0.0139, + "step": 1397 + }, + { + "epoch": 1.99, + "learning_rate": 5.147613601597051e-05, + "loss": 0.0083, + "step": 1398 + }, + { + "epoch": 1.99, + "learning_rate": 5.1345320890660266e-05, + "loss": 0.0094, + "step": 1399 + }, + { + "epoch": 1.99, + "learning_rate": 5.1214614758549115e-05, + "loss": 0.0129, + "step": 1400 + }, + { + "epoch": 1.99, + "eval_loss": 0.03905472531914711, + "eval_runtime": 23.3317, + "eval_samples_per_second": 42.86, + "eval_steps_per_second": 10.715, + "step": 1400 + }, + { + "epoch": 1.99, + "learning_rate": 5.108401791243678e-05, + "loss": 0.0057, + "step": 1401 + }, + { + "epoch": 1.99, + "learning_rate": 5.095353064487824e-05, + "loss": 0.0074, + "step": 1402 + }, + { + "epoch": 2.0, + "learning_rate": 5.082315324818303e-05, + "loss": 0.0019, + "step": 1403 + }, + { + "epoch": 2.0, + "learning_rate": 5.0692886014414534e-05, + "loss": 0.0071, + "step": 1404 + }, + { + "epoch": 2.0, + "learning_rate": 5.056272923538927e-05, + "loss": 0.0022, + "step": 1405 + }, + { + "epoch": 2.0, + "learning_rate": 5.043268320267642e-05, + "loss": 0.0023, + "step": 1406 + }, + { + "epoch": 2.0, + "learning_rate": 5.0302748207597116e-05, + "loss": 0.0061, + "step": 1407 + }, + { + "epoch": 2.0, + "learning_rate": 5.0172924541223595e-05, + "loss": 0.0127, + "step": 1408 + }, + { + "epoch": 2.0, + "learning_rate": 5.004321249437883e-05, + "loss": 0.0066, + "step": 1409 + }, + { + "epoch": 2.01, + "learning_rate": 4.9913612357635764e-05, + "loss": 0.0029, + "step": 1410 + }, + { + "epoch": 2.01, + "learning_rate": 4.978412442131653e-05, + "loss": 0.0136, + "step": 1411 + }, + { + "epoch": 2.01, + "learning_rate": 4.965474897549198e-05, + "loss": 0.0018, + "step": 1412 + }, + { + "epoch": 2.01, + "learning_rate": 4.952548630998103e-05, + "loss": 0.0019, + "step": 1413 + }, + { + "epoch": 2.01, + "learning_rate": 4.9396336714349903e-05, + "loss": 0.0063, + "step": 1414 + }, + { + "epoch": 2.01, + "learning_rate": 4.926730047791145e-05, + "loss": 0.0017, + "step": 1415 + }, + { + "epoch": 2.01, + "learning_rate": 4.913837788972471e-05, + "loss": 0.0067, + "step": 1416 + }, + { + "epoch": 2.02, + "learning_rate": 4.90095692385941e-05, + "loss": 0.0048, + "step": 1417 + }, + { + "epoch": 2.02, + "learning_rate": 4.888087481306873e-05, + "loss": 0.0011, + "step": 1418 + }, + { + "epoch": 2.02, + "learning_rate": 4.8752294901441906e-05, + "loss": 0.0024, + "step": 1419 + }, + { + "epoch": 2.02, + "learning_rate": 4.8623829791750386e-05, + "loss": 0.0082, + "step": 1420 + }, + { + "epoch": 2.02, + "eval_loss": 0.03879746049642563, + "eval_runtime": 23.1294, + "eval_samples_per_second": 43.235, + "eval_steps_per_second": 10.809, + "step": 1420 + }, + { + "epoch": 2.02, + "learning_rate": 4.849547977177379e-05, + "loss": 0.0032, + "step": 1421 + }, + { + "epoch": 2.02, + "learning_rate": 4.836724512903381e-05, + "loss": 0.0074, + "step": 1422 + }, + { + "epoch": 2.02, + "learning_rate": 4.823912615079379e-05, + "loss": 0.0013, + "step": 1423 + }, + { + "epoch": 2.03, + "learning_rate": 4.811112312405793e-05, + "loss": 0.0162, + "step": 1424 + }, + { + "epoch": 2.03, + "learning_rate": 4.798323633557073e-05, + "loss": 0.0151, + "step": 1425 + }, + { + "epoch": 2.03, + "learning_rate": 4.785546607181616e-05, + "loss": 0.0195, + "step": 1426 + }, + { + "epoch": 2.03, + "learning_rate": 4.7727812619017296e-05, + "loss": 0.0093, + "step": 1427 + }, + { + "epoch": 2.03, + "learning_rate": 4.76002762631355e-05, + "loss": 0.0052, + "step": 1428 + }, + { + "epoch": 2.03, + "learning_rate": 4.747285728986987e-05, + "loss": 0.0034, + "step": 1429 + }, + { + "epoch": 2.03, + "learning_rate": 4.734555598465641e-05, + "loss": 0.0044, + "step": 1430 + }, + { + "epoch": 2.04, + "learning_rate": 4.721837263266765e-05, + "loss": 0.0073, + "step": 1431 + }, + { + "epoch": 2.04, + "learning_rate": 4.7091307518811875e-05, + "loss": 0.0019, + "step": 1432 + }, + { + "epoch": 2.04, + "learning_rate": 4.696436092773251e-05, + "loss": 0.0024, + "step": 1433 + }, + { + "epoch": 2.04, + "learning_rate": 4.683753314380739e-05, + "loss": 0.0071, + "step": 1434 + }, + { + "epoch": 2.04, + "learning_rate": 4.671082445114827e-05, + "loss": 0.0077, + "step": 1435 + }, + { + "epoch": 2.04, + "learning_rate": 4.658423513360014e-05, + "loss": 0.0117, + "step": 1436 + }, + { + "epoch": 2.04, + "learning_rate": 4.645776547474058e-05, + "loss": 0.0075, + "step": 1437 + }, + { + "epoch": 2.05, + "learning_rate": 4.633141575787901e-05, + "loss": 0.0145, + "step": 1438 + }, + { + "epoch": 2.05, + "learning_rate": 4.6205186266056285e-05, + "loss": 0.0007, + "step": 1439 + }, + { + "epoch": 2.05, + "learning_rate": 4.60790772820439e-05, + "loss": 0.008, + "step": 1440 + }, + { + "epoch": 2.05, + "eval_loss": 0.021184591576457024, + "eval_runtime": 23.4283, + "eval_samples_per_second": 42.683, + "eval_steps_per_second": 10.671, + "step": 1440 + }, + { + "epoch": 2.05, + "learning_rate": 4.595308908834342e-05, + "loss": 0.0107, + "step": 1441 + }, + { + "epoch": 2.05, + "learning_rate": 4.582722196718572e-05, + "loss": 0.0076, + "step": 1442 + }, + { + "epoch": 2.05, + "learning_rate": 4.5701476200530555e-05, + "loss": 0.0064, + "step": 1443 + }, + { + "epoch": 2.05, + "learning_rate": 4.557585207006583e-05, + "loss": 0.0013, + "step": 1444 + }, + { + "epoch": 2.06, + "learning_rate": 4.5450349857206967e-05, + "loss": 0.008, + "step": 1445 + }, + { + "epoch": 2.06, + "learning_rate": 4.5324969843096174e-05, + "loss": 0.0073, + "step": 1446 + }, + { + "epoch": 2.06, + "learning_rate": 4.519971230860203e-05, + "loss": 0.0126, + "step": 1447 + }, + { + "epoch": 2.06, + "learning_rate": 4.507457753431872e-05, + "loss": 0.0132, + "step": 1448 + }, + { + "epoch": 2.06, + "learning_rate": 4.494956580056544e-05, + "loss": 0.006, + "step": 1449 + }, + { + "epoch": 2.06, + "learning_rate": 4.482467738738566e-05, + "loss": 0.0092, + "step": 1450 + }, + { + "epoch": 2.06, + "learning_rate": 4.469991257454672e-05, + "loss": 0.0053, + "step": 1451 + }, + { + "epoch": 2.07, + "learning_rate": 4.457527164153901e-05, + "loss": 0.0004, + "step": 1452 + }, + { + "epoch": 2.07, + "learning_rate": 4.4450754867575474e-05, + "loss": 0.0041, + "step": 1453 + }, + { + "epoch": 2.07, + "learning_rate": 4.4326362531590816e-05, + "loss": 0.0074, + "step": 1454 + }, + { + "epoch": 2.07, + "learning_rate": 4.420209491224105e-05, + "loss": 0.0036, + "step": 1455 + }, + { + "epoch": 2.07, + "learning_rate": 4.407795228790287e-05, + "loss": 0.0008, + "step": 1456 + }, + { + "epoch": 2.07, + "learning_rate": 4.395393493667282e-05, + "loss": 0.0148, + "step": 1457 + }, + { + "epoch": 2.07, + "learning_rate": 4.3830043136366893e-05, + "loss": 0.0101, + "step": 1458 + }, + { + "epoch": 2.08, + "learning_rate": 4.370627716451986e-05, + "loss": 0.0049, + "step": 1459 + }, + { + "epoch": 2.08, + "learning_rate": 4.358263729838461e-05, + "loss": 0.0025, + "step": 1460 + }, + { + "epoch": 2.08, + "eval_loss": 0.03624861687421799, + "eval_runtime": 23.3719, + "eval_samples_per_second": 42.786, + "eval_steps_per_second": 10.697, + "step": 1460 + }, + { + "epoch": 2.08, + "learning_rate": 4.3459123814931444e-05, + "loss": 0.0141, + "step": 1461 + }, + { + "epoch": 2.08, + "learning_rate": 4.333573699084767e-05, + "loss": 0.0159, + "step": 1462 + }, + { + "epoch": 2.08, + "learning_rate": 4.321247710253678e-05, + "loss": 0.0107, + "step": 1463 + }, + { + "epoch": 2.08, + "learning_rate": 4.3089344426118e-05, + "loss": 0.006, + "step": 1464 + }, + { + "epoch": 2.08, + "learning_rate": 4.2966339237425445e-05, + "loss": 0.0075, + "step": 1465 + }, + { + "epoch": 2.09, + "learning_rate": 4.28434618120078e-05, + "loss": 0.005, + "step": 1466 + }, + { + "epoch": 2.09, + "learning_rate": 4.272071242512735e-05, + "loss": 0.0084, + "step": 1467 + }, + { + "epoch": 2.09, + "learning_rate": 4.259809135175983e-05, + "loss": 0.008, + "step": 1468 + }, + { + "epoch": 2.09, + "learning_rate": 4.247559886659323e-05, + "loss": 0.0033, + "step": 1469 + }, + { + "epoch": 2.09, + "learning_rate": 4.2353235244027755e-05, + "loss": 0.0017, + "step": 1470 + }, + { + "epoch": 2.09, + "learning_rate": 4.223100075817467e-05, + "loss": 0.0048, + "step": 1471 + }, + { + "epoch": 2.09, + "learning_rate": 4.210889568285623e-05, + "loss": 0.0022, + "step": 1472 + }, + { + "epoch": 2.1, + "learning_rate": 4.1986920291604595e-05, + "loss": 0.004, + "step": 1473 + }, + { + "epoch": 2.1, + "learning_rate": 4.186507485766153e-05, + "loss": 0.0041, + "step": 1474 + }, + { + "epoch": 2.1, + "learning_rate": 4.174335965397754e-05, + "loss": 0.0072, + "step": 1475 + }, + { + "epoch": 2.1, + "learning_rate": 4.1621774953211615e-05, + "loss": 0.0049, + "step": 1476 + }, + { + "epoch": 2.1, + "learning_rate": 4.150032102773016e-05, + "loss": 0.0022, + "step": 1477 + }, + { + "epoch": 2.1, + "learning_rate": 4.137899814960683e-05, + "loss": 0.0071, + "step": 1478 + }, + { + "epoch": 2.1, + "learning_rate": 4.12578065906215e-05, + "loss": 0.0046, + "step": 1479 + }, + { + "epoch": 2.11, + "learning_rate": 4.113674662226014e-05, + "loss": 0.0006, + "step": 1480 + }, + { + "epoch": 2.11, + "eval_loss": 0.028911272063851357, + "eval_runtime": 23.2197, + "eval_samples_per_second": 43.067, + "eval_steps_per_second": 10.767, + "step": 1480 + }, + { + "epoch": 2.11, + "learning_rate": 4.101581851571369e-05, + "loss": 0.015, + "step": 1481 + }, + { + "epoch": 2.11, + "learning_rate": 4.089502254187787e-05, + "loss": 0.0032, + "step": 1482 + }, + { + "epoch": 2.11, + "learning_rate": 4.077435897135224e-05, + "loss": 0.0022, + "step": 1483 + }, + { + "epoch": 2.11, + "learning_rate": 4.065382807444002e-05, + "loss": 0.0031, + "step": 1484 + }, + { + "epoch": 2.11, + "learning_rate": 4.053343012114692e-05, + "loss": 0.0061, + "step": 1485 + }, + { + "epoch": 2.11, + "learning_rate": 4.041316538118106e-05, + "loss": 0.0132, + "step": 1486 + }, + { + "epoch": 2.12, + "learning_rate": 4.0293034123951964e-05, + "loss": 0.0057, + "step": 1487 + }, + { + "epoch": 2.12, + "learning_rate": 4.017303661857038e-05, + "loss": 0.0016, + "step": 1488 + }, + { + "epoch": 2.12, + "learning_rate": 4.005317313384717e-05, + "loss": 0.0067, + "step": 1489 + }, + { + "epoch": 2.12, + "learning_rate": 3.993344393829317e-05, + "loss": 0.0033, + "step": 1490 + }, + { + "epoch": 2.12, + "learning_rate": 3.981384930011822e-05, + "loss": 0.0109, + "step": 1491 + }, + { + "epoch": 2.12, + "learning_rate": 3.969438948723089e-05, + "loss": 0.0014, + "step": 1492 + }, + { + "epoch": 2.12, + "learning_rate": 3.9575064767237634e-05, + "loss": 0.0077, + "step": 1493 + }, + { + "epoch": 2.13, + "learning_rate": 3.945587540744233e-05, + "loss": 0.013, + "step": 1494 + }, + { + "epoch": 2.13, + "learning_rate": 3.9336821674845545e-05, + "loss": 0.0095, + "step": 1495 + }, + { + "epoch": 2.13, + "learning_rate": 3.921790383614411e-05, + "loss": 0.0012, + "step": 1496 + }, + { + "epoch": 2.13, + "learning_rate": 3.9099122157730396e-05, + "loss": 0.0212, + "step": 1497 + }, + { + "epoch": 2.13, + "learning_rate": 3.89804769056918e-05, + "loss": 0.0032, + "step": 1498 + }, + { + "epoch": 2.13, + "learning_rate": 3.8861968345810004e-05, + "loss": 0.0107, + "step": 1499 + }, + { + "epoch": 2.13, + "learning_rate": 3.874359674356057e-05, + "loss": 0.0034, + "step": 1500 + }, + { + "epoch": 2.13, + "eval_loss": 0.03471008315682411, + "eval_runtime": 23.3735, + "eval_samples_per_second": 42.783, + "eval_steps_per_second": 10.696, + "step": 1500 + }, + { + "epoch": 2.14, + "learning_rate": 3.862536236411224e-05, + "loss": 0.0143, + "step": 1501 + }, + { + "epoch": 2.14, + "learning_rate": 3.850726547232638e-05, + "loss": 0.0124, + "step": 1502 + }, + { + "epoch": 2.14, + "learning_rate": 3.838930633275627e-05, + "loss": 0.0005, + "step": 1503 + }, + { + "epoch": 2.14, + "learning_rate": 3.82714852096467e-05, + "loss": 0.0008, + "step": 1504 + }, + { + "epoch": 2.14, + "learning_rate": 3.8153802366933276e-05, + "loss": 0.0114, + "step": 1505 + }, + { + "epoch": 2.14, + "learning_rate": 3.803625806824177e-05, + "loss": 0.0024, + "step": 1506 + }, + { + "epoch": 2.14, + "learning_rate": 3.7918852576887654e-05, + "loss": 0.0157, + "step": 1507 + }, + { + "epoch": 2.15, + "learning_rate": 3.7801586155875444e-05, + "loss": 0.0005, + "step": 1508 + }, + { + "epoch": 2.15, + "learning_rate": 3.768445906789816e-05, + "loss": 0.003, + "step": 1509 + }, + { + "epoch": 2.15, + "learning_rate": 3.756747157533657e-05, + "loss": 0.0106, + "step": 1510 + }, + { + "epoch": 2.15, + "learning_rate": 3.7450623940258837e-05, + "loss": 0.0052, + "step": 1511 + }, + { + "epoch": 2.15, + "learning_rate": 3.733391642441981e-05, + "loss": 0.001, + "step": 1512 + }, + { + "epoch": 2.15, + "learning_rate": 3.7217349289260485e-05, + "loss": 0.0064, + "step": 1513 + }, + { + "epoch": 2.15, + "learning_rate": 3.710092279590725e-05, + "loss": 0.0057, + "step": 1514 + }, + { + "epoch": 2.16, + "learning_rate": 3.69846372051716e-05, + "loss": 0.0052, + "step": 1515 + }, + { + "epoch": 2.16, + "learning_rate": 3.6868492777549214e-05, + "loss": 0.0031, + "step": 1516 + }, + { + "epoch": 2.16, + "learning_rate": 3.675248977321978e-05, + "loss": 0.0047, + "step": 1517 + }, + { + "epoch": 2.16, + "learning_rate": 3.6636628452045943e-05, + "loss": 0.0072, + "step": 1518 + }, + { + "epoch": 2.16, + "learning_rate": 3.6520909073573115e-05, + "loss": 0.0052, + "step": 1519 + }, + { + "epoch": 2.16, + "learning_rate": 3.6405331897028606e-05, + "loss": 0.0115, + "step": 1520 + }, + { + "epoch": 2.16, + "eval_loss": 0.03125910460948944, + "eval_runtime": 23.1921, + "eval_samples_per_second": 43.118, + "eval_steps_per_second": 10.78, + "step": 1520 + }, + { + "epoch": 2.16, + "learning_rate": 3.6289897181321366e-05, + "loss": 0.0143, + "step": 1521 + }, + { + "epoch": 2.17, + "learning_rate": 3.617460518504101e-05, + "loss": 0.0112, + "step": 1522 + }, + { + "epoch": 2.17, + "learning_rate": 3.6059456166457575e-05, + "loss": 0.0044, + "step": 1523 + }, + { + "epoch": 2.17, + "learning_rate": 3.594445038352068e-05, + "loss": 0.0117, + "step": 1524 + }, + { + "epoch": 2.17, + "learning_rate": 3.582958809385928e-05, + "loss": 0.0099, + "step": 1525 + }, + { + "epoch": 2.17, + "learning_rate": 3.571486955478066e-05, + "loss": 0.0057, + "step": 1526 + }, + { + "epoch": 2.17, + "learning_rate": 3.560029502327027e-05, + "loss": 0.0035, + "step": 1527 + }, + { + "epoch": 2.17, + "learning_rate": 3.548586475599076e-05, + "loss": 0.0145, + "step": 1528 + }, + { + "epoch": 2.17, + "learning_rate": 3.537157900928188e-05, + "loss": 0.0042, + "step": 1529 + }, + { + "epoch": 2.18, + "learning_rate": 3.525743803915937e-05, + "loss": 0.0018, + "step": 1530 + }, + { + "epoch": 2.18, + "learning_rate": 3.514344210131484e-05, + "loss": 0.0091, + "step": 1531 + }, + { + "epoch": 2.18, + "learning_rate": 3.5029591451114816e-05, + "loss": 0.0129, + "step": 1532 + }, + { + "epoch": 2.18, + "learning_rate": 3.491588634360063e-05, + "loss": 0.0076, + "step": 1533 + }, + { + "epoch": 2.18, + "learning_rate": 3.480232703348729e-05, + "loss": 0.0056, + "step": 1534 + }, + { + "epoch": 2.18, + "learning_rate": 3.468891377516342e-05, + "loss": 0.0073, + "step": 1535 + }, + { + "epoch": 2.18, + "learning_rate": 3.457564682269028e-05, + "loss": 0.0062, + "step": 1536 + }, + { + "epoch": 2.19, + "learning_rate": 3.44625264298016e-05, + "loss": 0.0028, + "step": 1537 + }, + { + "epoch": 2.19, + "learning_rate": 3.43495528499026e-05, + "loss": 0.0125, + "step": 1538 + }, + { + "epoch": 2.19, + "learning_rate": 3.4236726336069736e-05, + "loss": 0.0035, + "step": 1539 + }, + { + "epoch": 2.19, + "learning_rate": 3.41240471410499e-05, + "loss": 0.0061, + "step": 1540 + }, + { + "epoch": 2.19, + "eval_loss": 0.029660778120160103, + "eval_runtime": 23.2988, + "eval_samples_per_second": 42.921, + "eval_steps_per_second": 10.73, + "step": 1540 + }, + { + "epoch": 2.19, + "learning_rate": 3.401151551726017e-05, + "loss": 0.0049, + "step": 1541 + }, + { + "epoch": 2.19, + "learning_rate": 3.3899131716786826e-05, + "loss": 0.0097, + "step": 1542 + }, + { + "epoch": 2.19, + "learning_rate": 3.378689599138517e-05, + "loss": 0.0039, + "step": 1543 + }, + { + "epoch": 2.2, + "learning_rate": 3.3674808592478635e-05, + "loss": 0.0042, + "step": 1544 + }, + { + "epoch": 2.2, + "learning_rate": 3.356286977115852e-05, + "loss": 0.0006, + "step": 1545 + }, + { + "epoch": 2.2, + "learning_rate": 3.345107977818325e-05, + "loss": 0.0086, + "step": 1546 + }, + { + "epoch": 2.2, + "learning_rate": 3.3339438863977854e-05, + "loss": 0.006, + "step": 1547 + }, + { + "epoch": 2.2, + "learning_rate": 3.322794727863334e-05, + "loss": 0.0119, + "step": 1548 + }, + { + "epoch": 2.2, + "learning_rate": 3.31166052719063e-05, + "loss": 0.0011, + "step": 1549 + }, + { + "epoch": 2.2, + "learning_rate": 3.3005413093218174e-05, + "loss": 0.0029, + "step": 1550 + }, + { + "epoch": 2.21, + "learning_rate": 3.2894370991654844e-05, + "loss": 0.0041, + "step": 1551 + }, + { + "epoch": 2.21, + "learning_rate": 3.278347921596588e-05, + "loss": 0.0056, + "step": 1552 + }, + { + "epoch": 2.21, + "learning_rate": 3.267273801456422e-05, + "loss": 0.0014, + "step": 1553 + }, + { + "epoch": 2.21, + "learning_rate": 3.256214763552545e-05, + "loss": 0.0133, + "step": 1554 + }, + { + "epoch": 2.21, + "learning_rate": 3.245170832658725e-05, + "loss": 0.0127, + "step": 1555 + }, + { + "epoch": 2.21, + "learning_rate": 3.234142033514892e-05, + "loss": 0.0015, + "step": 1556 + }, + { + "epoch": 2.21, + "learning_rate": 3.2231283908270825e-05, + "loss": 0.0029, + "step": 1557 + }, + { + "epoch": 2.22, + "learning_rate": 3.212129929267378e-05, + "loss": 0.0146, + "step": 1558 + }, + { + "epoch": 2.22, + "learning_rate": 3.201146673473844e-05, + "loss": 0.0083, + "step": 1559 + }, + { + "epoch": 2.22, + "learning_rate": 3.190178648050495e-05, + "loss": 0.0065, + "step": 1560 + }, + { + "epoch": 2.22, + "eval_loss": 0.033540185540914536, + "eval_runtime": 23.2976, + "eval_samples_per_second": 42.923, + "eval_steps_per_second": 10.731, + "step": 1560 + }, + { + "epoch": 2.22, + "learning_rate": 3.179225877567221e-05, + "loss": 0.0121, + "step": 1561 + }, + { + "epoch": 2.22, + "learning_rate": 3.1682883865597444e-05, + "loss": 0.0142, + "step": 1562 + }, + { + "epoch": 2.22, + "learning_rate": 3.157366199529548e-05, + "loss": 0.0009, + "step": 1563 + }, + { + "epoch": 2.22, + "learning_rate": 3.146459340943841e-05, + "loss": 0.0072, + "step": 1564 + }, + { + "epoch": 2.23, + "learning_rate": 3.135567835235495e-05, + "loss": 0.0014, + "step": 1565 + }, + { + "epoch": 2.23, + "learning_rate": 3.124691706802988e-05, + "loss": 0.0038, + "step": 1566 + }, + { + "epoch": 2.23, + "learning_rate": 3.113830980010344e-05, + "loss": 0.0116, + "step": 1567 + }, + { + "epoch": 2.23, + "learning_rate": 3.102985679187096e-05, + "loss": 0.0031, + "step": 1568 + }, + { + "epoch": 2.23, + "learning_rate": 3.092155828628206e-05, + "loss": 0.0022, + "step": 1569 + }, + { + "epoch": 2.23, + "learning_rate": 3.0813414525940474e-05, + "loss": 0.0047, + "step": 1570 + }, + { + "epoch": 2.23, + "learning_rate": 3.070542575310308e-05, + "loss": 0.0059, + "step": 1571 + }, + { + "epoch": 2.24, + "learning_rate": 3.059759220967967e-05, + "loss": 0.0152, + "step": 1572 + }, + { + "epoch": 2.24, + "learning_rate": 3.0489914137232188e-05, + "loss": 0.0054, + "step": 1573 + }, + { + "epoch": 2.24, + "learning_rate": 3.0382391776974518e-05, + "loss": 0.0046, + "step": 1574 + }, + { + "epoch": 2.24, + "learning_rate": 3.0275025369771494e-05, + "loss": 0.0055, + "step": 1575 + }, + { + "epoch": 2.24, + "learning_rate": 3.0167815156138747e-05, + "loss": 0.004, + "step": 1576 + }, + { + "epoch": 2.24, + "learning_rate": 3.0060761376241876e-05, + "loss": 0.0009, + "step": 1577 + }, + { + "epoch": 2.24, + "learning_rate": 2.9953864269896247e-05, + "loss": 0.0022, + "step": 1578 + }, + { + "epoch": 2.25, + "learning_rate": 2.984712407656606e-05, + "loss": 0.021, + "step": 1579 + }, + { + "epoch": 2.25, + "learning_rate": 2.974054103536411e-05, + "loss": 0.0144, + "step": 1580 + }, + { + "epoch": 2.25, + "eval_loss": 0.03788626194000244, + "eval_runtime": 23.1818, + "eval_samples_per_second": 43.137, + "eval_steps_per_second": 10.784, + "step": 1580 + }, + { + "epoch": 2.25, + "learning_rate": 2.9634115385051077e-05, + "loss": 0.0174, + "step": 1581 + }, + { + "epoch": 2.25, + "learning_rate": 2.9527847364035212e-05, + "loss": 0.0029, + "step": 1582 + }, + { + "epoch": 2.25, + "learning_rate": 2.942173721037146e-05, + "loss": 0.0078, + "step": 1583 + }, + { + "epoch": 2.25, + "learning_rate": 2.9315785161761287e-05, + "loss": 0.0126, + "step": 1584 + }, + { + "epoch": 2.25, + "learning_rate": 2.920999145555182e-05, + "loss": 0.0249, + "step": 1585 + }, + { + "epoch": 2.26, + "learning_rate": 2.9104356328735684e-05, + "loss": 0.0032, + "step": 1586 + }, + { + "epoch": 2.26, + "learning_rate": 2.8998880017950048e-05, + "loss": 0.0153, + "step": 1587 + }, + { + "epoch": 2.26, + "learning_rate": 2.88935627594765e-05, + "loss": 0.0129, + "step": 1588 + }, + { + "epoch": 2.26, + "learning_rate": 2.878840478924012e-05, + "loss": 0.0009, + "step": 1589 + }, + { + "epoch": 2.26, + "learning_rate": 2.8683406342809416e-05, + "loss": 0.011, + "step": 1590 + }, + { + "epoch": 2.26, + "learning_rate": 2.8578567655395315e-05, + "loss": 0.0019, + "step": 1591 + }, + { + "epoch": 2.26, + "learning_rate": 2.8473888961851013e-05, + "loss": 0.0046, + "step": 1592 + }, + { + "epoch": 2.27, + "learning_rate": 2.8369370496671177e-05, + "loss": 0.0128, + "step": 1593 + }, + { + "epoch": 2.27, + "learning_rate": 2.8265012493991628e-05, + "loss": 0.004, + "step": 1594 + }, + { + "epoch": 2.27, + "learning_rate": 2.8160815187588695e-05, + "loss": 0.0143, + "step": 1595 + }, + { + "epoch": 2.27, + "learning_rate": 2.8056778810878757e-05, + "loss": 0.0067, + "step": 1596 + }, + { + "epoch": 2.27, + "learning_rate": 2.7952903596917623e-05, + "loss": 0.0011, + "step": 1597 + }, + { + "epoch": 2.27, + "learning_rate": 2.784918977840013e-05, + "loss": 0.0074, + "step": 1598 + }, + { + "epoch": 2.27, + "learning_rate": 2.774563758765959e-05, + "loss": 0.0057, + "step": 1599 + }, + { + "epoch": 2.28, + "learning_rate": 2.764224725666714e-05, + "loss": 0.0075, + "step": 1600 + }, + { + "epoch": 2.28, + "eval_loss": 0.030029315501451492, + "eval_runtime": 23.3727, + "eval_samples_per_second": 42.785, + "eval_steps_per_second": 10.696, + "step": 1600 + }, + { + "epoch": 2.28, + "learning_rate": 2.753901901703143e-05, + "loss": 0.0004, + "step": 1601 + }, + { + "epoch": 2.28, + "learning_rate": 2.743595309999797e-05, + "loss": 0.0056, + "step": 1602 + }, + { + "epoch": 2.28, + "learning_rate": 2.7333049736448667e-05, + "loss": 0.0049, + "step": 1603 + }, + { + "epoch": 2.28, + "learning_rate": 2.7230309156901212e-05, + "loss": 0.0058, + "step": 1604 + }, + { + "epoch": 2.28, + "learning_rate": 2.712773159150872e-05, + "loss": 0.0014, + "step": 1605 + }, + { + "epoch": 2.28, + "learning_rate": 2.70253172700591e-05, + "loss": 0.0068, + "step": 1606 + }, + { + "epoch": 2.29, + "learning_rate": 2.6923066421974596e-05, + "loss": 0.0069, + "step": 1607 + }, + { + "epoch": 2.29, + "learning_rate": 2.6820979276311175e-05, + "loss": 0.0067, + "step": 1608 + }, + { + "epoch": 2.29, + "learning_rate": 2.671905606175815e-05, + "loss": 0.0079, + "step": 1609 + }, + { + "epoch": 2.29, + "learning_rate": 2.661729700663762e-05, + "loss": 0.0045, + "step": 1610 + }, + { + "epoch": 2.29, + "learning_rate": 2.6515702338903937e-05, + "loss": 0.0024, + "step": 1611 + }, + { + "epoch": 2.29, + "learning_rate": 2.6414272286143128e-05, + "loss": 0.0056, + "step": 1612 + }, + { + "epoch": 2.29, + "learning_rate": 2.6313007075572526e-05, + "loss": 0.007, + "step": 1613 + }, + { + "epoch": 2.3, + "learning_rate": 2.621190693404022e-05, + "loss": 0.0043, + "step": 1614 + }, + { + "epoch": 2.3, + "learning_rate": 2.611097208802448e-05, + "loss": 0.0064, + "step": 1615 + }, + { + "epoch": 2.3, + "learning_rate": 2.6010202763633263e-05, + "loss": 0.0033, + "step": 1616 + }, + { + "epoch": 2.3, + "learning_rate": 2.5909599186603785e-05, + "loss": 0.011, + "step": 1617 + }, + { + "epoch": 2.3, + "learning_rate": 2.5809161582301954e-05, + "loss": 0.0023, + "step": 1618 + }, + { + "epoch": 2.3, + "learning_rate": 2.57088901757219e-05, + "loss": 0.0073, + "step": 1619 + }, + { + "epoch": 2.3, + "learning_rate": 2.5608785191485364e-05, + "loss": 0.0093, + "step": 1620 + }, + { + "epoch": 2.3, + "eval_loss": 0.03218885138630867, + "eval_runtime": 23.2886, + "eval_samples_per_second": 42.939, + "eval_steps_per_second": 10.735, + "step": 1620 + }, + { + "epoch": 2.31, + "learning_rate": 2.5508846853841352e-05, + "loss": 0.0064, + "step": 1621 + }, + { + "epoch": 2.31, + "learning_rate": 2.540907538666555e-05, + "loss": 0.0048, + "step": 1622 + }, + { + "epoch": 2.31, + "learning_rate": 2.530947101345984e-05, + "loss": 0.0098, + "step": 1623 + }, + { + "epoch": 2.31, + "learning_rate": 2.521003395735172e-05, + "loss": 0.0035, + "step": 1624 + }, + { + "epoch": 2.31, + "learning_rate": 2.5110764441093947e-05, + "loss": 0.0098, + "step": 1625 + }, + { + "epoch": 2.31, + "learning_rate": 2.5011662687063954e-05, + "loss": 0.0027, + "step": 1626 + }, + { + "epoch": 2.31, + "learning_rate": 2.4912728917263372e-05, + "loss": 0.009, + "step": 1627 + }, + { + "epoch": 2.32, + "learning_rate": 2.481396335331746e-05, + "loss": 0.0071, + "step": 1628 + }, + { + "epoch": 2.32, + "learning_rate": 2.4715366216474724e-05, + "loss": 0.0072, + "step": 1629 + }, + { + "epoch": 2.32, + "learning_rate": 2.4616937727606383e-05, + "loss": 0.0042, + "step": 1630 + }, + { + "epoch": 2.32, + "learning_rate": 2.4518678107205873e-05, + "loss": 0.004, + "step": 1631 + }, + { + "epoch": 2.32, + "learning_rate": 2.4420587575388243e-05, + "loss": 0.0084, + "step": 1632 + }, + { + "epoch": 2.32, + "learning_rate": 2.4322666351889857e-05, + "loss": 0.0056, + "step": 1633 + }, + { + "epoch": 2.32, + "learning_rate": 2.4224914656067778e-05, + "loss": 0.0029, + "step": 1634 + }, + { + "epoch": 2.33, + "learning_rate": 2.412733270689933e-05, + "loss": 0.0083, + "step": 1635 + }, + { + "epoch": 2.33, + "learning_rate": 2.4029920722981502e-05, + "loss": 0.0052, + "step": 1636 + }, + { + "epoch": 2.33, + "learning_rate": 2.3932678922530615e-05, + "loss": 0.0031, + "step": 1637 + }, + { + "epoch": 2.33, + "learning_rate": 2.383560752338174e-05, + "loss": 0.0102, + "step": 1638 + }, + { + "epoch": 2.33, + "learning_rate": 2.373870674298818e-05, + "loss": 0.0119, + "step": 1639 + }, + { + "epoch": 2.33, + "learning_rate": 2.3641976798421072e-05, + "loss": 0.0091, + "step": 1640 + }, + { + "epoch": 2.33, + "eval_loss": 0.03132651746273041, + "eval_runtime": 23.1155, + "eval_samples_per_second": 43.261, + "eval_steps_per_second": 10.815, + "step": 1640 + }, + { + "epoch": 2.33, + "learning_rate": 2.3545417906368862e-05, + "loss": 0.0057, + "step": 1641 + }, + { + "epoch": 2.34, + "learning_rate": 2.344903028313682e-05, + "loss": 0.0114, + "step": 1642 + }, + { + "epoch": 2.34, + "learning_rate": 2.3352814144646484e-05, + "loss": 0.0114, + "step": 1643 + }, + { + "epoch": 2.34, + "learning_rate": 2.32567697064353e-05, + "loss": 0.0114, + "step": 1644 + }, + { + "epoch": 2.34, + "learning_rate": 2.3160897183656115e-05, + "loss": 0.0049, + "step": 1645 + }, + { + "epoch": 2.34, + "learning_rate": 2.3065196791076548e-05, + "loss": 0.0056, + "step": 1646 + }, + { + "epoch": 2.34, + "learning_rate": 2.2969668743078732e-05, + "loss": 0.0026, + "step": 1647 + }, + { + "epoch": 2.34, + "learning_rate": 2.2874313253658708e-05, + "loss": 0.0058, + "step": 1648 + }, + { + "epoch": 2.35, + "learning_rate": 2.2779130536425873e-05, + "loss": 0.0036, + "step": 1649 + }, + { + "epoch": 2.35, + "learning_rate": 2.2684120804602705e-05, + "loss": 0.0084, + "step": 1650 + }, + { + "epoch": 2.35, + "learning_rate": 2.2589284271024092e-05, + "loss": 0.009, + "step": 1651 + }, + { + "epoch": 2.35, + "learning_rate": 2.2494621148137017e-05, + "loss": 0.0036, + "step": 1652 + }, + { + "epoch": 2.35, + "learning_rate": 2.240013164799989e-05, + "loss": 0.0123, + "step": 1653 + }, + { + "epoch": 2.35, + "learning_rate": 2.2305815982282242e-05, + "loss": 0.0052, + "step": 1654 + }, + { + "epoch": 2.35, + "learning_rate": 2.2211674362264203e-05, + "loss": 0.0082, + "step": 1655 + }, + { + "epoch": 2.36, + "learning_rate": 2.211770699883603e-05, + "loss": 0.0045, + "step": 1656 + }, + { + "epoch": 2.36, + "learning_rate": 2.2023914102497535e-05, + "loss": 0.0054, + "step": 1657 + }, + { + "epoch": 2.36, + "learning_rate": 2.1930295883357778e-05, + "loss": 0.0077, + "step": 1658 + }, + { + "epoch": 2.36, + "learning_rate": 2.183685255113449e-05, + "loss": 0.0037, + "step": 1659 + }, + { + "epoch": 2.36, + "learning_rate": 2.1743584315153674e-05, + "loss": 0.0051, + "step": 1660 + }, + { + "epoch": 2.36, + "eval_loss": 0.027848325669765472, + "eval_runtime": 23.0571, + "eval_samples_per_second": 43.371, + "eval_steps_per_second": 10.843, + "step": 1660 + }, + { + "epoch": 2.36, + "learning_rate": 2.1650491384349014e-05, + "loss": 0.002, + "step": 1661 + }, + { + "epoch": 2.36, + "learning_rate": 2.1557573967261523e-05, + "loss": 0.0049, + "step": 1662 + }, + { + "epoch": 2.37, + "learning_rate": 2.146483227203908e-05, + "loss": 0.0126, + "step": 1663 + }, + { + "epoch": 2.37, + "learning_rate": 2.137226650643589e-05, + "loss": 0.0015, + "step": 1664 + }, + { + "epoch": 2.37, + "learning_rate": 2.127987687781201e-05, + "loss": 0.0041, + "step": 1665 + }, + { + "epoch": 2.37, + "learning_rate": 2.1187663593132977e-05, + "loss": 0.0023, + "step": 1666 + }, + { + "epoch": 2.37, + "learning_rate": 2.1095626858969298e-05, + "loss": 0.0281, + "step": 1667 + }, + { + "epoch": 2.37, + "learning_rate": 2.1003766881495967e-05, + "loss": 0.0102, + "step": 1668 + }, + { + "epoch": 2.37, + "learning_rate": 2.0912083866491994e-05, + "loss": 0.0032, + "step": 1669 + }, + { + "epoch": 2.38, + "learning_rate": 2.0820578019339997e-05, + "loss": 0.0096, + "step": 1670 + }, + { + "epoch": 2.38, + "learning_rate": 2.072924954502571e-05, + "loss": 0.008, + "step": 1671 + }, + { + "epoch": 2.38, + "learning_rate": 2.063809864813756e-05, + "loss": 0.0044, + "step": 1672 + }, + { + "epoch": 2.38, + "learning_rate": 2.0547125532866086e-05, + "loss": 0.0045, + "step": 1673 + }, + { + "epoch": 2.38, + "learning_rate": 2.0456330403003644e-05, + "loss": 0.0044, + "step": 1674 + }, + { + "epoch": 2.38, + "learning_rate": 2.036571346194387e-05, + "loss": 0.0099, + "step": 1675 + }, + { + "epoch": 2.38, + "learning_rate": 2.027527491268125e-05, + "loss": 0.0088, + "step": 1676 + }, + { + "epoch": 2.39, + "learning_rate": 2.0185014957810578e-05, + "loss": 0.0058, + "step": 1677 + }, + { + "epoch": 2.39, + "learning_rate": 2.0094933799526627e-05, + "loss": 0.0046, + "step": 1678 + }, + { + "epoch": 2.39, + "learning_rate": 2.000503163962363e-05, + "loss": 0.0067, + "step": 1679 + }, + { + "epoch": 2.39, + "learning_rate": 1.991530867949487e-05, + "loss": 0.0046, + "step": 1680 + }, + { + "epoch": 2.39, + "eval_loss": 0.0293706264346838, + "eval_runtime": 23.163, + "eval_samples_per_second": 43.172, + "eval_steps_per_second": 10.793, + "step": 1680 + }, + { + "epoch": 2.39, + "learning_rate": 1.98257651201321e-05, + "loss": 0.0064, + "step": 1681 + }, + { + "epoch": 2.39, + "learning_rate": 1.9736401162125307e-05, + "loss": 0.009, + "step": 1682 + }, + { + "epoch": 2.39, + "learning_rate": 1.9647217005662077e-05, + "loss": 0.0095, + "step": 1683 + }, + { + "epoch": 2.4, + "learning_rate": 1.955821285052728e-05, + "loss": 0.0093, + "step": 1684 + }, + { + "epoch": 2.4, + "learning_rate": 1.9469388896102424e-05, + "loss": 0.0031, + "step": 1685 + }, + { + "epoch": 2.4, + "learning_rate": 1.938074534136549e-05, + "loss": 0.0025, + "step": 1686 + }, + { + "epoch": 2.4, + "learning_rate": 1.9292282384890303e-05, + "loss": 0.0, + "step": 1687 + }, + { + "epoch": 2.4, + "learning_rate": 1.9204000224846032e-05, + "loss": 0.0064, + "step": 1688 + }, + { + "epoch": 2.4, + "learning_rate": 1.9115899058996944e-05, + "loss": 0.0078, + "step": 1689 + }, + { + "epoch": 2.4, + "learning_rate": 1.9027979084701808e-05, + "loss": 0.014, + "step": 1690 + }, + { + "epoch": 2.41, + "learning_rate": 1.894024049891353e-05, + "loss": 0.0144, + "step": 1691 + }, + { + "epoch": 2.41, + "learning_rate": 1.8852683498178613e-05, + "loss": 0.0054, + "step": 1692 + }, + { + "epoch": 2.41, + "learning_rate": 1.8765308278636838e-05, + "loss": 0.0042, + "step": 1693 + }, + { + "epoch": 2.41, + "learning_rate": 1.8678115036020782e-05, + "loss": 0.0142, + "step": 1694 + }, + { + "epoch": 2.41, + "learning_rate": 1.8591103965655365e-05, + "loss": 0.0129, + "step": 1695 + }, + { + "epoch": 2.41, + "learning_rate": 1.8504275262457336e-05, + "loss": 0.0042, + "step": 1696 + }, + { + "epoch": 2.41, + "learning_rate": 1.841762912093504e-05, + "loss": 0.0069, + "step": 1697 + }, + { + "epoch": 2.42, + "learning_rate": 1.8331165735187728e-05, + "loss": 0.0141, + "step": 1698 + }, + { + "epoch": 2.42, + "learning_rate": 1.8244885298905412e-05, + "loss": 0.013, + "step": 1699 + }, + { + "epoch": 2.42, + "learning_rate": 1.815878800536811e-05, + "loss": 0.0004, + "step": 1700 + }, + { + "epoch": 2.42, + "eval_loss": 0.02830970473587513, + "eval_runtime": 23.062, + "eval_samples_per_second": 43.361, + "eval_steps_per_second": 10.84, + "step": 1700 + }, + { + "epoch": 2.42, + "learning_rate": 1.8072874047445697e-05, + "loss": 0.0022, + "step": 1701 + }, + { + "epoch": 2.42, + "learning_rate": 1.7987143617597225e-05, + "loss": 0.004, + "step": 1702 + }, + { + "epoch": 2.42, + "learning_rate": 1.7901596907870798e-05, + "loss": 0.0022, + "step": 1703 + }, + { + "epoch": 2.42, + "learning_rate": 1.7816234109902773e-05, + "loss": 0.006, + "step": 1704 + }, + { + "epoch": 2.43, + "learning_rate": 1.7731055414917663e-05, + "loss": 0.0072, + "step": 1705 + }, + { + "epoch": 2.43, + "learning_rate": 1.7646061013727433e-05, + "loss": 0.006, + "step": 1706 + }, + { + "epoch": 2.43, + "learning_rate": 1.756125109673137e-05, + "loss": 0.0075, + "step": 1707 + }, + { + "epoch": 2.43, + "learning_rate": 1.747662585391533e-05, + "loss": 0.0035, + "step": 1708 + }, + { + "epoch": 2.43, + "learning_rate": 1.7392185474851574e-05, + "loss": 0.0061, + "step": 1709 + }, + { + "epoch": 2.43, + "learning_rate": 1.7307930148698158e-05, + "loss": 0.0067, + "step": 1710 + }, + { + "epoch": 2.43, + "learning_rate": 1.7223860064198728e-05, + "loss": 0.0037, + "step": 1711 + }, + { + "epoch": 2.44, + "learning_rate": 1.713997540968182e-05, + "loss": 0.0044, + "step": 1712 + }, + { + "epoch": 2.44, + "learning_rate": 1.7056276373060675e-05, + "loss": 0.0051, + "step": 1713 + }, + { + "epoch": 2.44, + "learning_rate": 1.6972763141832638e-05, + "loss": 0.0058, + "step": 1714 + }, + { + "epoch": 2.44, + "learning_rate": 1.6889435903078966e-05, + "loss": 0.0082, + "step": 1715 + }, + { + "epoch": 2.44, + "learning_rate": 1.6806294843464098e-05, + "loss": 0.0105, + "step": 1716 + }, + { + "epoch": 2.44, + "learning_rate": 1.6723340149235546e-05, + "loss": 0.0023, + "step": 1717 + }, + { + "epoch": 2.44, + "learning_rate": 1.66405720062232e-05, + "loss": 0.0072, + "step": 1718 + }, + { + "epoch": 2.45, + "learning_rate": 1.6557990599839235e-05, + "loss": 0.0042, + "step": 1719 + }, + { + "epoch": 2.45, + "learning_rate": 1.6475596115077318e-05, + "loss": 0.0054, + "step": 1720 + }, + { + "epoch": 2.45, + "eval_loss": 0.0295818243175745, + "eval_runtime": 23.3493, + "eval_samples_per_second": 42.828, + "eval_steps_per_second": 10.707, + "step": 1720 + }, + { + "epoch": 2.45, + "learning_rate": 1.6393388736512527e-05, + "loss": 0.0023, + "step": 1721 + }, + { + "epoch": 2.45, + "learning_rate": 1.6311368648300695e-05, + "loss": 0.0069, + "step": 1722 + }, + { + "epoch": 2.45, + "learning_rate": 1.6229536034178162e-05, + "loss": 0.0076, + "step": 1723 + }, + { + "epoch": 2.45, + "learning_rate": 1.6147891077461285e-05, + "loss": 0.0025, + "step": 1724 + }, + { + "epoch": 2.45, + "learning_rate": 1.6066433961046068e-05, + "loss": 0.0037, + "step": 1725 + }, + { + "epoch": 2.46, + "learning_rate": 1.5985164867407643e-05, + "loss": 0.0021, + "step": 1726 + }, + { + "epoch": 2.46, + "learning_rate": 1.5904083978600026e-05, + "loss": 0.0074, + "step": 1727 + }, + { + "epoch": 2.46, + "learning_rate": 1.5823191476255604e-05, + "loss": 0.024, + "step": 1728 + }, + { + "epoch": 2.46, + "learning_rate": 1.5742487541584784e-05, + "loss": 0.0114, + "step": 1729 + }, + { + "epoch": 2.46, + "learning_rate": 1.5661972355375465e-05, + "loss": 0.0067, + "step": 1730 + }, + { + "epoch": 2.46, + "learning_rate": 1.5581646097992797e-05, + "loss": 0.0102, + "step": 1731 + }, + { + "epoch": 2.46, + "learning_rate": 1.5501508949378707e-05, + "loss": 0.0008, + "step": 1732 + }, + { + "epoch": 2.47, + "learning_rate": 1.5421561089051496e-05, + "loss": 0.0049, + "step": 1733 + }, + { + "epoch": 2.47, + "learning_rate": 1.5341802696105335e-05, + "loss": 0.0077, + "step": 1734 + }, + { + "epoch": 2.47, + "learning_rate": 1.5262233949210093e-05, + "loss": 0.0105, + "step": 1735 + }, + { + "epoch": 2.47, + "learning_rate": 1.5182855026610753e-05, + "loss": 0.01, + "step": 1736 + }, + { + "epoch": 2.47, + "learning_rate": 1.5103666106127012e-05, + "loss": 0.0065, + "step": 1737 + }, + { + "epoch": 2.47, + "learning_rate": 1.5024667365153022e-05, + "loss": 0.0058, + "step": 1738 + }, + { + "epoch": 2.47, + "learning_rate": 1.4945858980656867e-05, + "loss": 0.0066, + "step": 1739 + }, + { + "epoch": 2.48, + "learning_rate": 1.4867241129180242e-05, + "loss": 0.0034, + "step": 1740 + }, + { + "epoch": 2.48, + "eval_loss": 0.03370482847094536, + "eval_runtime": 23.1386, + "eval_samples_per_second": 43.218, + "eval_steps_per_second": 10.804, + "step": 1740 + }, + { + "epoch": 2.48, + "learning_rate": 1.4788813986837912e-05, + "loss": 0.0082, + "step": 1741 + }, + { + "epoch": 2.48, + "learning_rate": 1.4710577729317565e-05, + "loss": 0.0022, + "step": 1742 + }, + { + "epoch": 2.48, + "learning_rate": 1.4632532531879194e-05, + "loss": 0.0069, + "step": 1743 + }, + { + "epoch": 2.48, + "learning_rate": 1.4554678569354852e-05, + "loss": 0.0032, + "step": 1744 + }, + { + "epoch": 2.48, + "learning_rate": 1.4477016016148115e-05, + "loss": 0.004, + "step": 1745 + }, + { + "epoch": 2.48, + "learning_rate": 1.4399545046233865e-05, + "loss": 0.001, + "step": 1746 + }, + { + "epoch": 2.49, + "learning_rate": 1.432226583315771e-05, + "loss": 0.0078, + "step": 1747 + }, + { + "epoch": 2.49, + "learning_rate": 1.4245178550035854e-05, + "loss": 0.0014, + "step": 1748 + }, + { + "epoch": 2.49, + "learning_rate": 1.4168283369554381e-05, + "loss": 0.0153, + "step": 1749 + }, + { + "epoch": 2.49, + "learning_rate": 1.4091580463969155e-05, + "loss": 0.0077, + "step": 1750 + }, + { + "epoch": 2.49, + "learning_rate": 1.4015070005105214e-05, + "loss": 0.0067, + "step": 1751 + }, + { + "epoch": 2.49, + "learning_rate": 1.3938752164356628e-05, + "loss": 0.0031, + "step": 1752 + }, + { + "epoch": 2.49, + "learning_rate": 1.3862627112685833e-05, + "loss": 0.0013, + "step": 1753 + }, + { + "epoch": 2.5, + "learning_rate": 1.3786695020623508e-05, + "loss": 0.0099, + "step": 1754 + }, + { + "epoch": 2.5, + "learning_rate": 1.3710956058267965e-05, + "loss": 0.0008, + "step": 1755 + }, + { + "epoch": 2.5, + "learning_rate": 1.3635410395284998e-05, + "loss": 0.0055, + "step": 1756 + }, + { + "epoch": 2.5, + "learning_rate": 1.3560058200907288e-05, + "loss": 0.0076, + "step": 1757 + }, + { + "epoch": 2.5, + "learning_rate": 1.3484899643934168e-05, + "loss": 0.009, + "step": 1758 + }, + { + "epoch": 2.5, + "learning_rate": 1.3409934892731147e-05, + "loss": 0.004, + "step": 1759 + }, + { + "epoch": 2.5, + "learning_rate": 1.3335164115229682e-05, + "loss": 0.0065, + "step": 1760 + }, + { + "epoch": 2.5, + "eval_loss": 0.03414594754576683, + "eval_runtime": 23.2095, + "eval_samples_per_second": 43.086, + "eval_steps_per_second": 10.771, + "step": 1760 + }, + { + "epoch": 2.5, + "learning_rate": 1.3260587478926578e-05, + "loss": 0.0051, + "step": 1761 + }, + { + "epoch": 2.51, + "learning_rate": 1.3186205150883857e-05, + "loss": 0.0123, + "step": 1762 + }, + { + "epoch": 2.51, + "learning_rate": 1.311201729772812e-05, + "loss": 0.0041, + "step": 1763 + }, + { + "epoch": 2.51, + "learning_rate": 1.3038024085650491e-05, + "loss": 0.0043, + "step": 1764 + }, + { + "epoch": 2.51, + "learning_rate": 1.2964225680405917e-05, + "loss": 0.0053, + "step": 1765 + }, + { + "epoch": 2.51, + "learning_rate": 1.2890622247313078e-05, + "loss": 0.0063, + "step": 1766 + }, + { + "epoch": 2.51, + "learning_rate": 1.2817213951253726e-05, + "loss": 0.0082, + "step": 1767 + }, + { + "epoch": 2.51, + "learning_rate": 1.2744000956672697e-05, + "loss": 0.006, + "step": 1768 + }, + { + "epoch": 2.52, + "learning_rate": 1.2670983427577132e-05, + "loss": 0.007, + "step": 1769 + }, + { + "epoch": 2.52, + "learning_rate": 1.2598161527536423e-05, + "loss": 0.005, + "step": 1770 + }, + { + "epoch": 2.52, + "learning_rate": 1.25255354196816e-05, + "loss": 0.0015, + "step": 1771 + }, + { + "epoch": 2.52, + "learning_rate": 1.245310526670529e-05, + "loss": 0.0024, + "step": 1772 + }, + { + "epoch": 2.52, + "learning_rate": 1.2380871230860935e-05, + "loss": 0.0031, + "step": 1773 + }, + { + "epoch": 2.52, + "learning_rate": 1.2308833473962822e-05, + "loss": 0.0028, + "step": 1774 + }, + { + "epoch": 2.52, + "learning_rate": 1.2236992157385418e-05, + "loss": 0.0086, + "step": 1775 + }, + { + "epoch": 2.53, + "learning_rate": 1.2165347442063213e-05, + "loss": 0.0057, + "step": 1776 + }, + { + "epoch": 2.53, + "learning_rate": 1.2093899488490257e-05, + "loss": 0.0032, + "step": 1777 + }, + { + "epoch": 2.53, + "learning_rate": 1.2022648456719866e-05, + "loss": 0.0128, + "step": 1778 + }, + { + "epoch": 2.53, + "learning_rate": 1.1951594506364127e-05, + "loss": 0.0116, + "step": 1779 + }, + { + "epoch": 2.53, + "learning_rate": 1.1880737796593734e-05, + "loss": 0.0034, + "step": 1780 + }, + { + "epoch": 2.53, + "eval_loss": 0.03446383401751518, + "eval_runtime": 23.1587, + "eval_samples_per_second": 43.18, + "eval_steps_per_second": 10.795, + "step": 1780 + }, + { + "epoch": 2.53, + "learning_rate": 1.18100784861375e-05, + "loss": 0.0081, + "step": 1781 + }, + { + "epoch": 2.53, + "learning_rate": 1.1739616733282056e-05, + "loss": 0.0102, + "step": 1782 + }, + { + "epoch": 2.54, + "learning_rate": 1.1669352695871416e-05, + "loss": 0.0102, + "step": 1783 + }, + { + "epoch": 2.54, + "learning_rate": 1.1599286531306742e-05, + "loss": 0.0021, + "step": 1784 + }, + { + "epoch": 2.54, + "learning_rate": 1.1529418396545933e-05, + "loss": 0.0113, + "step": 1785 + }, + { + "epoch": 2.54, + "learning_rate": 1.1459748448103226e-05, + "loss": 0.0146, + "step": 1786 + }, + { + "epoch": 2.54, + "learning_rate": 1.1390276842048942e-05, + "loss": 0.0021, + "step": 1787 + }, + { + "epoch": 2.54, + "learning_rate": 1.1321003734009084e-05, + "loss": 0.0049, + "step": 1788 + }, + { + "epoch": 2.54, + "learning_rate": 1.1251929279164974e-05, + "loss": 0.0042, + "step": 1789 + }, + { + "epoch": 2.55, + "learning_rate": 1.1183053632252905e-05, + "loss": 0.0027, + "step": 1790 + }, + { + "epoch": 2.55, + "learning_rate": 1.1114376947563843e-05, + "loss": 0.0114, + "step": 1791 + }, + { + "epoch": 2.55, + "learning_rate": 1.1045899378943058e-05, + "loss": 0.0115, + "step": 1792 + }, + { + "epoch": 2.55, + "learning_rate": 1.0977621079789779e-05, + "loss": 0.0037, + "step": 1793 + }, + { + "epoch": 2.55, + "learning_rate": 1.0909542203056767e-05, + "loss": 0.0062, + "step": 1794 + }, + { + "epoch": 2.55, + "learning_rate": 1.0841662901250138e-05, + "loss": 0.0073, + "step": 1795 + }, + { + "epoch": 2.55, + "learning_rate": 1.0773983326428904e-05, + "loss": 0.0167, + "step": 1796 + }, + { + "epoch": 2.56, + "learning_rate": 1.0706503630204667e-05, + "loss": 0.0029, + "step": 1797 + }, + { + "epoch": 2.56, + "learning_rate": 1.0639223963741229e-05, + "loss": 0.0063, + "step": 1798 + }, + { + "epoch": 2.56, + "learning_rate": 1.0572144477754375e-05, + "loss": 0.0114, + "step": 1799 + }, + { + "epoch": 2.56, + "learning_rate": 1.0505265322511349e-05, + "loss": 0.0114, + "step": 1800 + }, + { + "epoch": 2.56, + "eval_loss": 0.037059981375932693, + "eval_runtime": 23.1796, + "eval_samples_per_second": 43.141, + "eval_steps_per_second": 10.785, + "step": 1800 + }, + { + "epoch": 2.56, + "learning_rate": 1.0438586647830773e-05, + "loss": 0.0019, + "step": 1801 + }, + { + "epoch": 2.56, + "learning_rate": 1.0372108603082031e-05, + "loss": 0.0117, + "step": 1802 + }, + { + "epoch": 2.56, + "learning_rate": 1.0305831337185156e-05, + "loss": 0.0115, + "step": 1803 + }, + { + "epoch": 2.57, + "learning_rate": 1.0239754998610319e-05, + "loss": 0.0062, + "step": 1804 + }, + { + "epoch": 2.57, + "learning_rate": 1.0173879735377723e-05, + "loss": 0.0156, + "step": 1805 + }, + { + "epoch": 2.57, + "learning_rate": 1.0108205695056983e-05, + "loss": 0.0075, + "step": 1806 + }, + { + "epoch": 2.57, + "learning_rate": 1.004273302476707e-05, + "loss": 0.0024, + "step": 1807 + }, + { + "epoch": 2.57, + "learning_rate": 9.977461871175731e-06, + "loss": 0.0041, + "step": 1808 + }, + { + "epoch": 2.57, + "learning_rate": 9.912392380499458e-06, + "loss": 0.0047, + "step": 1809 + }, + { + "epoch": 2.57, + "learning_rate": 9.84752469850283e-06, + "loss": 0.0122, + "step": 1810 + }, + { + "epoch": 2.58, + "learning_rate": 9.782858970498442e-06, + "loss": 0.008, + "step": 1811 + }, + { + "epoch": 2.58, + "learning_rate": 9.718395341346398e-06, + "loss": 0.0054, + "step": 1812 + }, + { + "epoch": 2.58, + "learning_rate": 9.654133955454215e-06, + "loss": 0.0037, + "step": 1813 + }, + { + "epoch": 2.58, + "learning_rate": 9.590074956776207e-06, + "loss": 0.0186, + "step": 1814 + }, + { + "epoch": 2.58, + "learning_rate": 9.52621848881341e-06, + "loss": 0.0035, + "step": 1815 + }, + { + "epoch": 2.58, + "learning_rate": 9.462564694613063e-06, + "loss": 0.0042, + "step": 1816 + }, + { + "epoch": 2.58, + "learning_rate": 9.39911371676856e-06, + "loss": 0.0091, + "step": 1817 + }, + { + "epoch": 2.59, + "learning_rate": 9.335865697418766e-06, + "loss": 0.0003, + "step": 1818 + }, + { + "epoch": 2.59, + "learning_rate": 9.272820778248026e-06, + "loss": 0.0048, + "step": 1819 + }, + { + "epoch": 2.59, + "learning_rate": 9.2099791004856e-06, + "loss": 0.0044, + "step": 1820 + }, + { + "epoch": 2.59, + "eval_loss": 0.03769044578075409, + "eval_runtime": 23.1105, + "eval_samples_per_second": 43.27, + "eval_steps_per_second": 10.818, + "step": 1820 + }, + { + "epoch": 2.59, + "learning_rate": 9.147340804905614e-06, + "loss": 0.0031, + "step": 1821 + }, + { + "epoch": 2.59, + "learning_rate": 9.084906031826434e-06, + "loss": 0.0117, + "step": 1822 + }, + { + "epoch": 2.59, + "learning_rate": 9.022674921110608e-06, + "loss": 0.0064, + "step": 1823 + }, + { + "epoch": 2.59, + "learning_rate": 8.960647612164374e-06, + "loss": 0.0111, + "step": 1824 + }, + { + "epoch": 2.6, + "learning_rate": 8.898824243937499e-06, + "loss": 0.0029, + "step": 1825 + }, + { + "epoch": 2.6, + "learning_rate": 8.83720495492285e-06, + "loss": 0.0106, + "step": 1826 + }, + { + "epoch": 2.6, + "learning_rate": 8.775789883156171e-06, + "loss": 0.0028, + "step": 1827 + }, + { + "epoch": 2.6, + "learning_rate": 8.714579166215664e-06, + "loss": 0.0004, + "step": 1828 + }, + { + "epoch": 2.6, + "learning_rate": 8.65357294122181e-06, + "loss": 0.009, + "step": 1829 + }, + { + "epoch": 2.6, + "learning_rate": 8.592771344836958e-06, + "loss": 0.0026, + "step": 1830 + }, + { + "epoch": 2.6, + "learning_rate": 8.532174513265123e-06, + "loss": 0.0017, + "step": 1831 + }, + { + "epoch": 2.61, + "learning_rate": 8.471782582251508e-06, + "loss": 0.0051, + "step": 1832 + }, + { + "epoch": 2.61, + "learning_rate": 8.41159568708243e-06, + "loss": 0.0066, + "step": 1833 + }, + { + "epoch": 2.61, + "learning_rate": 8.351613962584825e-06, + "loss": 0.0074, + "step": 1834 + }, + { + "epoch": 2.61, + "learning_rate": 8.291837543126035e-06, + "loss": 0.0088, + "step": 1835 + }, + { + "epoch": 2.61, + "learning_rate": 8.23226656261349e-06, + "loss": 0.0069, + "step": 1836 + }, + { + "epoch": 2.61, + "learning_rate": 8.172901154494417e-06, + "loss": 0.0067, + "step": 1837 + }, + { + "epoch": 2.61, + "learning_rate": 8.113741451755551e-06, + "loss": 0.0039, + "step": 1838 + }, + { + "epoch": 2.62, + "learning_rate": 8.054787586922752e-06, + "loss": 0.0065, + "step": 1839 + }, + { + "epoch": 2.62, + "learning_rate": 7.99603969206083e-06, + "loss": 0.0086, + "step": 1840 + }, + { + "epoch": 2.62, + "eval_loss": 0.034408267587423325, + "eval_runtime": 23.2026, + "eval_samples_per_second": 43.099, + "eval_steps_per_second": 10.775, + "step": 1840 + }, + { + "epoch": 2.62, + "learning_rate": 7.937497898773194e-06, + "loss": 0.0087, + "step": 1841 + }, + { + "epoch": 2.62, + "learning_rate": 7.879162338201552e-06, + "loss": 0.0083, + "step": 1842 + }, + { + "epoch": 2.62, + "learning_rate": 7.82103314102558e-06, + "loss": 0.0031, + "step": 1843 + }, + { + "epoch": 2.62, + "learning_rate": 7.763110437462728e-06, + "loss": 0.0042, + "step": 1844 + }, + { + "epoch": 2.62, + "learning_rate": 7.705394357267848e-06, + "loss": 0.0129, + "step": 1845 + }, + { + "epoch": 2.63, + "learning_rate": 7.647885029732937e-06, + "loss": 0.0023, + "step": 1846 + }, + { + "epoch": 2.63, + "learning_rate": 7.590582583686811e-06, + "loss": 0.0071, + "step": 1847 + }, + { + "epoch": 2.63, + "learning_rate": 7.533487147494866e-06, + "loss": 0.0071, + "step": 1848 + }, + { + "epoch": 2.63, + "learning_rate": 7.476598849058769e-06, + "loss": 0.0036, + "step": 1849 + }, + { + "epoch": 2.63, + "learning_rate": 7.419917815816169e-06, + "loss": 0.0088, + "step": 1850 + }, + { + "epoch": 2.63, + "learning_rate": 7.363444174740386e-06, + "loss": 0.0066, + "step": 1851 + }, + { + "epoch": 2.63, + "learning_rate": 7.30717805234018e-06, + "loss": 0.0059, + "step": 1852 + }, + { + "epoch": 2.64, + "learning_rate": 7.251119574659448e-06, + "loss": 0.0076, + "step": 1853 + }, + { + "epoch": 2.64, + "learning_rate": 7.1952688672769275e-06, + "loss": 0.0005, + "step": 1854 + }, + { + "epoch": 2.64, + "learning_rate": 7.13962605530587e-06, + "loss": 0.0068, + "step": 1855 + }, + { + "epoch": 2.64, + "learning_rate": 7.0841912633938915e-06, + "loss": 0.0063, + "step": 1856 + }, + { + "epoch": 2.64, + "learning_rate": 7.028964615722578e-06, + "loss": 0.0008, + "step": 1857 + }, + { + "epoch": 2.64, + "learning_rate": 6.973946236007245e-06, + "loss": 0.0074, + "step": 1858 + }, + { + "epoch": 2.64, + "learning_rate": 6.9191362474966496e-06, + "loss": 0.0111, + "step": 1859 + }, + { + "epoch": 2.65, + "learning_rate": 6.8645347729727415e-06, + "loss": 0.0065, + "step": 1860 + }, + { + "epoch": 2.65, + "eval_loss": 0.03319459781050682, + "eval_runtime": 23.057, + "eval_samples_per_second": 43.371, + "eval_steps_per_second": 10.843, + "step": 1860 + }, + { + "epoch": 2.65, + "learning_rate": 6.810141934750358e-06, + "loss": 0.0057, + "step": 1861 + }, + { + "epoch": 2.65, + "learning_rate": 6.755957854676975e-06, + "loss": 0.0072, + "step": 1862 + }, + { + "epoch": 2.65, + "learning_rate": 6.70198265413241e-06, + "loss": 0.002, + "step": 1863 + }, + { + "epoch": 2.65, + "learning_rate": 6.648216454028544e-06, + "loss": 0.0176, + "step": 1864 + }, + { + "epoch": 2.65, + "learning_rate": 6.59465937480912e-06, + "loss": 0.006, + "step": 1865 + }, + { + "epoch": 2.65, + "learning_rate": 6.541311536449391e-06, + "loss": 0.0037, + "step": 1866 + }, + { + "epoch": 2.66, + "learning_rate": 6.488173058455849e-06, + "loss": 0.0132, + "step": 1867 + }, + { + "epoch": 2.66, + "learning_rate": 6.435244059866041e-06, + "loss": 0.0053, + "step": 1868 + }, + { + "epoch": 2.66, + "learning_rate": 6.382524659248235e-06, + "loss": 0.0068, + "step": 1869 + }, + { + "epoch": 2.66, + "learning_rate": 6.33001497470117e-06, + "loss": 0.004, + "step": 1870 + }, + { + "epoch": 2.66, + "learning_rate": 6.277715123853778e-06, + "loss": 0.0089, + "step": 1871 + }, + { + "epoch": 2.66, + "learning_rate": 6.22562522386495e-06, + "loss": 0.0097, + "step": 1872 + }, + { + "epoch": 2.66, + "learning_rate": 6.173745391423258e-06, + "loss": 0.0089, + "step": 1873 + }, + { + "epoch": 2.67, + "learning_rate": 6.12207574274668e-06, + "loss": 0.0087, + "step": 1874 + }, + { + "epoch": 2.67, + "learning_rate": 6.070616393582374e-06, + "loss": 0.0052, + "step": 1875 + }, + { + "epoch": 2.67, + "learning_rate": 6.019367459206393e-06, + "loss": 0.0084, + "step": 1876 + }, + { + "epoch": 2.67, + "learning_rate": 5.968329054423405e-06, + "loss": 0.0042, + "step": 1877 + }, + { + "epoch": 2.67, + "learning_rate": 5.9175012935664855e-06, + "loss": 0.0025, + "step": 1878 + }, + { + "epoch": 2.67, + "learning_rate": 5.866884290496854e-06, + "loss": 0.0005, + "step": 1879 + }, + { + "epoch": 2.67, + "learning_rate": 5.8164781586036e-06, + "loss": 0.0051, + "step": 1880 + }, + { + "epoch": 2.67, + "eval_loss": 0.03443886712193489, + "eval_runtime": 23.1237, + "eval_samples_per_second": 43.246, + "eval_steps_per_second": 10.811, + "step": 1880 + }, + { + "epoch": 2.68, + "learning_rate": 5.766283010803375e-06, + "loss": 0.0124, + "step": 1881 + }, + { + "epoch": 2.68, + "learning_rate": 5.716298959540256e-06, + "loss": 0.0255, + "step": 1882 + }, + { + "epoch": 2.68, + "learning_rate": 5.666526116785453e-06, + "loss": 0.0074, + "step": 1883 + }, + { + "epoch": 2.68, + "learning_rate": 5.616964594036955e-06, + "loss": 0.0082, + "step": 1884 + }, + { + "epoch": 2.68, + "learning_rate": 5.56761450231944e-06, + "loss": 0.0038, + "step": 1885 + }, + { + "epoch": 2.68, + "learning_rate": 5.518475952183932e-06, + "loss": 0.0051, + "step": 1886 + }, + { + "epoch": 2.68, + "learning_rate": 5.469549053707579e-06, + "loss": 0.0088, + "step": 1887 + }, + { + "epoch": 2.69, + "learning_rate": 5.4208339164933506e-06, + "loss": 0.0011, + "step": 1888 + }, + { + "epoch": 2.69, + "learning_rate": 5.37233064966991e-06, + "loss": 0.0029, + "step": 1889 + }, + { + "epoch": 2.69, + "learning_rate": 5.324039361891264e-06, + "loss": 0.0017, + "step": 1890 + }, + { + "epoch": 2.69, + "learning_rate": 5.275960161336579e-06, + "loss": 0.0065, + "step": 1891 + }, + { + "epoch": 2.69, + "learning_rate": 5.228093155709879e-06, + "loss": 0.0101, + "step": 1892 + }, + { + "epoch": 2.69, + "learning_rate": 5.180438452239878e-06, + "loss": 0.0043, + "step": 1893 + }, + { + "epoch": 2.69, + "learning_rate": 5.1329961576796814e-06, + "loss": 0.0075, + "step": 1894 + }, + { + "epoch": 2.7, + "learning_rate": 5.0857663783066e-06, + "loss": 0.0071, + "step": 1895 + }, + { + "epoch": 2.7, + "learning_rate": 5.038749219921846e-06, + "loss": 0.0032, + "step": 1896 + }, + { + "epoch": 2.7, + "learning_rate": 4.991944787850344e-06, + "loss": 0.0127, + "step": 1897 + }, + { + "epoch": 2.7, + "learning_rate": 4.945353186940482e-06, + "loss": 0.0091, + "step": 1898 + }, + { + "epoch": 2.7, + "learning_rate": 4.898974521563904e-06, + "loss": 0.0062, + "step": 1899 + }, + { + "epoch": 2.7, + "learning_rate": 4.852808895615191e-06, + "loss": 0.008, + "step": 1900 + }, + { + "epoch": 2.7, + "eval_loss": 0.03550153970718384, + "eval_runtime": 22.9915, + "eval_samples_per_second": 43.494, + "eval_steps_per_second": 10.874, + "step": 1900 + }, + { + "epoch": 2.7, + "learning_rate": 4.8068564125117396e-06, + "loss": 0.0129, + "step": 1901 + }, + { + "epoch": 2.71, + "learning_rate": 4.761117175193441e-06, + "loss": 0.0019, + "step": 1902 + }, + { + "epoch": 2.71, + "learning_rate": 4.715591286122534e-06, + "loss": 0.0137, + "step": 1903 + }, + { + "epoch": 2.71, + "learning_rate": 4.6702788472832605e-06, + "loss": 0.0007, + "step": 1904 + }, + { + "epoch": 2.71, + "learning_rate": 4.625179960181758e-06, + "loss": 0.0112, + "step": 1905 + }, + { + "epoch": 2.71, + "learning_rate": 4.5802947258457466e-06, + "loss": 0.0021, + "step": 1906 + }, + { + "epoch": 2.71, + "learning_rate": 4.535623244824372e-06, + "loss": 0.0117, + "step": 1907 + }, + { + "epoch": 2.71, + "learning_rate": 4.491165617187898e-06, + "loss": 0.0115, + "step": 1908 + }, + { + "epoch": 2.72, + "learning_rate": 4.446921942527549e-06, + "loss": 0.0126, + "step": 1909 + }, + { + "epoch": 2.72, + "learning_rate": 4.4028923199552876e-06, + "loss": 0.0052, + "step": 1910 + }, + { + "epoch": 2.72, + "learning_rate": 4.359076848103527e-06, + "loss": 0.004, + "step": 1911 + }, + { + "epoch": 2.72, + "learning_rate": 4.315475625124976e-06, + "loss": 0.0046, + "step": 1912 + }, + { + "epoch": 2.72, + "learning_rate": 4.272088748692404e-06, + "loss": 0.0009, + "step": 1913 + }, + { + "epoch": 2.72, + "learning_rate": 4.228916315998388e-06, + "loss": 0.0033, + "step": 1914 + }, + { + "epoch": 2.72, + "learning_rate": 4.185958423755165e-06, + "loss": 0.0053, + "step": 1915 + }, + { + "epoch": 2.73, + "learning_rate": 4.143215168194314e-06, + "loss": 0.0026, + "step": 1916 + }, + { + "epoch": 2.73, + "learning_rate": 4.100686645066654e-06, + "loss": 0.0064, + "step": 1917 + }, + { + "epoch": 2.73, + "learning_rate": 4.058372949641931e-06, + "loss": 0.0024, + "step": 1918 + }, + { + "epoch": 2.73, + "learning_rate": 4.016274176708679e-06, + "loss": 0.0125, + "step": 1919 + }, + { + "epoch": 2.73, + "learning_rate": 3.974390420573948e-06, + "loss": 0.0035, + "step": 1920 + }, + { + "epoch": 2.73, + "eval_loss": 0.03510810434818268, + "eval_runtime": 23.1381, + "eval_samples_per_second": 43.219, + "eval_steps_per_second": 10.805, + "step": 1920 + }, + { + "epoch": 2.73, + "learning_rate": 3.932721775063153e-06, + "loss": 0.0018, + "step": 1921 + }, + { + "epoch": 2.73, + "learning_rate": 3.891268333519804e-06, + "loss": 0.0072, + "step": 1922 + }, + { + "epoch": 2.74, + "learning_rate": 3.85003018880532e-06, + "loss": 0.006, + "step": 1923 + }, + { + "epoch": 2.74, + "learning_rate": 3.80900743329885e-06, + "loss": 0.0089, + "step": 1924 + }, + { + "epoch": 2.74, + "learning_rate": 3.7682001588970283e-06, + "loss": 0.0057, + "step": 1925 + }, + { + "epoch": 2.74, + "learning_rate": 3.7276084570138094e-06, + "loss": 0.0045, + "step": 1926 + }, + { + "epoch": 2.74, + "learning_rate": 3.6872324185801774e-06, + "loss": 0.0008, + "step": 1927 + }, + { + "epoch": 2.74, + "learning_rate": 3.647072134044027e-06, + "loss": 0.0077, + "step": 1928 + }, + { + "epoch": 2.74, + "learning_rate": 3.6071276933699714e-06, + "loss": 0.0086, + "step": 1929 + }, + { + "epoch": 2.75, + "learning_rate": 3.5673991860390666e-06, + "loss": 0.0099, + "step": 1930 + }, + { + "epoch": 2.75, + "learning_rate": 3.527886701048644e-06, + "loss": 0.003, + "step": 1931 + }, + { + "epoch": 2.75, + "learning_rate": 3.488590326912122e-06, + "loss": 0.0103, + "step": 1932 + }, + { + "epoch": 2.75, + "learning_rate": 3.449510151658808e-06, + "loss": 0.021, + "step": 1933 + }, + { + "epoch": 2.75, + "learning_rate": 3.4106462628337056e-06, + "loss": 0.0027, + "step": 1934 + }, + { + "epoch": 2.75, + "learning_rate": 3.3719987474972624e-06, + "loss": 0.0048, + "step": 1935 + }, + { + "epoch": 2.75, + "learning_rate": 3.3335676922252814e-06, + "loss": 0.011, + "step": 1936 + }, + { + "epoch": 2.76, + "learning_rate": 3.295353183108596e-06, + "loss": 0.0149, + "step": 1937 + }, + { + "epoch": 2.76, + "learning_rate": 3.257355305753029e-06, + "loss": 0.0046, + "step": 1938 + }, + { + "epoch": 2.76, + "learning_rate": 3.219574145279025e-06, + "loss": 0.0042, + "step": 1939 + }, + { + "epoch": 2.76, + "learning_rate": 3.1820097863216492e-06, + "loss": 0.0065, + "step": 1940 + }, + { + "epoch": 2.76, + "eval_loss": 0.03521741181612015, + "eval_runtime": 23.2795, + "eval_samples_per_second": 42.956, + "eval_steps_per_second": 10.739, + "step": 1940 + }, + { + "epoch": 2.76, + "learning_rate": 3.144662313030189e-06, + "loss": 0.0034, + "step": 1941 + }, + { + "epoch": 2.76, + "learning_rate": 3.1075318090682094e-06, + "loss": 0.0014, + "step": 1942 + }, + { + "epoch": 2.76, + "learning_rate": 3.0706183576131196e-06, + "loss": 0.002, + "step": 1943 + }, + { + "epoch": 2.77, + "learning_rate": 3.033922041356174e-06, + "loss": 0.011, + "step": 1944 + }, + { + "epoch": 2.77, + "learning_rate": 2.9974429425021487e-06, + "loss": 0.0115, + "step": 1945 + }, + { + "epoch": 2.77, + "learning_rate": 2.9611811427693203e-06, + "loss": 0.0033, + "step": 1946 + }, + { + "epoch": 2.77, + "learning_rate": 2.925136723389077e-06, + "loss": 0.0099, + "step": 1947 + }, + { + "epoch": 2.77, + "learning_rate": 2.8893097651059297e-06, + "loss": 0.0085, + "step": 1948 + }, + { + "epoch": 2.77, + "learning_rate": 2.8537003481771687e-06, + "loss": 0.0032, + "step": 1949 + }, + { + "epoch": 2.77, + "learning_rate": 2.8183085523728502e-06, + "loss": 0.0044, + "step": 1950 + }, + { + "epoch": 2.78, + "learning_rate": 2.783134456975456e-06, + "loss": 0.0092, + "step": 1951 + }, + { + "epoch": 2.78, + "learning_rate": 2.7481781407798447e-06, + "loss": 0.0048, + "step": 1952 + }, + { + "epoch": 2.78, + "learning_rate": 2.7134396820929444e-06, + "loss": 0.0122, + "step": 1953 + }, + { + "epoch": 2.78, + "learning_rate": 2.678919158733739e-06, + "loss": 0.0007, + "step": 1954 + }, + { + "epoch": 2.78, + "learning_rate": 2.6446166480329605e-06, + "loss": 0.0027, + "step": 1955 + }, + { + "epoch": 2.78, + "learning_rate": 2.6105322268329625e-06, + "loss": 0.0039, + "step": 1956 + }, + { + "epoch": 2.78, + "learning_rate": 2.5766659714875354e-06, + "loss": 0.0063, + "step": 1957 + }, + { + "epoch": 2.79, + "learning_rate": 2.5430179578618042e-06, + "loss": 0.0083, + "step": 1958 + }, + { + "epoch": 2.79, + "learning_rate": 2.509588261331941e-06, + "loss": 0.005, + "step": 1959 + }, + { + "epoch": 2.79, + "learning_rate": 2.476376956785098e-06, + "loss": 0.0097, + "step": 1960 + }, + { + "epoch": 2.79, + "eval_loss": 0.03469571843743324, + "eval_runtime": 23.2731, + "eval_samples_per_second": 42.968, + "eval_steps_per_second": 10.742, + "step": 1960 + }, + { + "epoch": 2.79, + "learning_rate": 2.443384118619152e-06, + "loss": 0.0075, + "step": 1961 + }, + { + "epoch": 2.79, + "learning_rate": 2.4106098207426487e-06, + "loss": 0.008, + "step": 1962 + }, + { + "epoch": 2.79, + "learning_rate": 2.3780541365745367e-06, + "loss": 0.0088, + "step": 1963 + }, + { + "epoch": 2.79, + "learning_rate": 2.3457171390440346e-06, + "loss": 0.0019, + "step": 1964 + }, + { + "epoch": 2.8, + "learning_rate": 2.3135989005904747e-06, + "loss": 0.0029, + "step": 1965 + }, + { + "epoch": 2.8, + "learning_rate": 2.2816994931631473e-06, + "loss": 0.0083, + "step": 1966 + }, + { + "epoch": 2.8, + "learning_rate": 2.2500189882211475e-06, + "loss": 0.0076, + "step": 1967 + }, + { + "epoch": 2.8, + "learning_rate": 2.2185574567331723e-06, + "loss": 0.0011, + "step": 1968 + }, + { + "epoch": 2.8, + "learning_rate": 2.1873149691773897e-06, + "loss": 0.0054, + "step": 1969 + }, + { + "epoch": 2.8, + "learning_rate": 2.156291595541282e-06, + "loss": 0.0127, + "step": 1970 + }, + { + "epoch": 2.8, + "learning_rate": 2.1254874053215136e-06, + "loss": 0.0093, + "step": 1971 + }, + { + "epoch": 2.81, + "learning_rate": 2.094902467523696e-06, + "loss": 0.0051, + "step": 1972 + }, + { + "epoch": 2.81, + "learning_rate": 2.0645368506623242e-06, + "loss": 0.0065, + "step": 1973 + }, + { + "epoch": 2.81, + "learning_rate": 2.0343906227605623e-06, + "loss": 0.0049, + "step": 1974 + }, + { + "epoch": 2.81, + "learning_rate": 2.0044638513501468e-06, + "loss": 0.0026, + "step": 1975 + }, + { + "epoch": 2.81, + "learning_rate": 1.9747566034711616e-06, + "loss": 0.0078, + "step": 1976 + }, + { + "epoch": 2.81, + "learning_rate": 1.945268945671941e-06, + "loss": 0.011, + "step": 1977 + }, + { + "epoch": 2.81, + "learning_rate": 1.916000944008911e-06, + "loss": 0.0192, + "step": 1978 + }, + { + "epoch": 2.82, + "learning_rate": 1.8869526640464597e-06, + "loss": 0.0036, + "step": 1979 + }, + { + "epoch": 2.82, + "learning_rate": 1.858124170856712e-06, + "loss": 0.0034, + "step": 1980 + }, + { + "epoch": 2.82, + "eval_loss": 0.034716423600912094, + "eval_runtime": 23.183, + "eval_samples_per_second": 43.135, + "eval_steps_per_second": 10.784, + "step": 1980 + }, + { + "epoch": 2.82, + "learning_rate": 1.8295155290194876e-06, + "loss": 0.0066, + "step": 1981 + }, + { + "epoch": 2.82, + "learning_rate": 1.8011268026220883e-06, + "loss": 0.0025, + "step": 1982 + }, + { + "epoch": 2.82, + "learning_rate": 1.7729580552591885e-06, + "loss": 0.0017, + "step": 1983 + }, + { + "epoch": 2.82, + "learning_rate": 1.745009350032667e-06, + "loss": 0.0033, + "step": 1984 + }, + { + "epoch": 2.82, + "learning_rate": 1.7172807495514974e-06, + "loss": 0.0019, + "step": 1985 + }, + { + "epoch": 2.83, + "learning_rate": 1.6897723159315371e-06, + "loss": 0.0052, + "step": 1986 + }, + { + "epoch": 2.83, + "learning_rate": 1.6624841107955259e-06, + "loss": 0.0016, + "step": 1987 + }, + { + "epoch": 2.83, + "learning_rate": 1.6354161952727654e-06, + "loss": 0.0087, + "step": 1988 + }, + { + "epoch": 2.83, + "learning_rate": 1.608568629999152e-06, + "loss": 0.0031, + "step": 1989 + }, + { + "epoch": 2.83, + "learning_rate": 1.5819414751169215e-06, + "loss": 0.0063, + "step": 1990 + }, + { + "epoch": 2.83, + "learning_rate": 1.5555347902745932e-06, + "loss": 0.0131, + "step": 1991 + }, + { + "epoch": 2.83, + "learning_rate": 1.5293486346267704e-06, + "loss": 0.0, + "step": 1992 + }, + { + "epoch": 2.83, + "learning_rate": 1.5033830668340743e-06, + "loss": 0.0009, + "step": 1993 + }, + { + "epoch": 2.84, + "learning_rate": 1.4776381450629318e-06, + "loss": 0.0185, + "step": 1994 + }, + { + "epoch": 2.84, + "learning_rate": 1.4521139269855766e-06, + "loss": 0.0084, + "step": 1995 + }, + { + "epoch": 2.84, + "learning_rate": 1.4268104697797269e-06, + "loss": 0.004, + "step": 1996 + }, + { + "epoch": 2.84, + "learning_rate": 1.4017278301286518e-06, + "loss": 0.0106, + "step": 1997 + }, + { + "epoch": 2.84, + "learning_rate": 1.3768660642209052e-06, + "loss": 0.0005, + "step": 1998 + }, + { + "epoch": 2.84, + "learning_rate": 1.3522252277503033e-06, + "loss": 0.0006, + "step": 1999 + }, + { + "epoch": 2.84, + "learning_rate": 1.3278053759156917e-06, + "loss": 0.0054, + "step": 2000 + }, + { + "epoch": 2.84, + "eval_loss": 0.0348287858068943, + "eval_runtime": 23.349, + "eval_samples_per_second": 42.828, + "eval_steps_per_second": 10.707, + "step": 2000 + }, + { + "epoch": 2.85, + "learning_rate": 1.303606563420945e-06, + "loss": 0.0105, + "step": 2001 + }, + { + "epoch": 2.85, + "learning_rate": 1.2796288444747006e-06, + "loss": 0.0167, + "step": 2002 + }, + { + "epoch": 2.85, + "learning_rate": 1.2558722727904038e-06, + "loss": 0.0011, + "step": 2003 + }, + { + "epoch": 2.85, + "learning_rate": 1.2323369015860286e-06, + "loss": 0.0097, + "step": 2004 + }, + { + "epoch": 2.85, + "learning_rate": 1.2090227835840683e-06, + "loss": 0.0066, + "step": 2005 + }, + { + "epoch": 2.85, + "learning_rate": 1.185929971011346e-06, + "loss": 0.0024, + "step": 2006 + }, + { + "epoch": 2.85, + "learning_rate": 1.1630585155989803e-06, + "loss": 0.0067, + "step": 2007 + }, + { + "epoch": 2.86, + "learning_rate": 1.1404084685821659e-06, + "loss": 0.0114, + "step": 2008 + }, + { + "epoch": 2.86, + "learning_rate": 1.1179798807001485e-06, + "loss": 0.0225, + "step": 2009 + }, + { + "epoch": 2.86, + "learning_rate": 1.095772802196049e-06, + "loss": 0.0092, + "step": 2010 + }, + { + "epoch": 2.86, + "learning_rate": 1.073787282816796e-06, + "loss": 0.0074, + "step": 2011 + }, + { + "epoch": 2.86, + "learning_rate": 1.0520233718129935e-06, + "loss": 0.005, + "step": 2012 + }, + { + "epoch": 2.86, + "learning_rate": 1.0304811179388085e-06, + "loss": 0.0007, + "step": 2013 + }, + { + "epoch": 2.86, + "learning_rate": 1.0091605694518502e-06, + "loss": 0.0059, + "step": 2014 + }, + { + "epoch": 2.87, + "learning_rate": 9.880617741131026e-07, + "loss": 0.0035, + "step": 2015 + }, + { + "epoch": 2.87, + "learning_rate": 9.671847791867805e-07, + "loss": 0.0097, + "step": 2016 + }, + { + "epoch": 2.87, + "learning_rate": 9.465296314402295e-07, + "loss": 0.0053, + "step": 2017 + }, + { + "epoch": 2.87, + "learning_rate": 9.260963771438369e-07, + "loss": 0.0055, + "step": 2018 + }, + { + "epoch": 2.87, + "learning_rate": 9.058850620709103e-07, + "loss": 0.0073, + "step": 2019 + }, + { + "epoch": 2.87, + "learning_rate": 8.85895731497588e-07, + "loss": 0.0045, + "step": 2020 + }, + { + "epoch": 2.87, + "eval_loss": 0.034411560744047165, + "eval_runtime": 23.339, + "eval_samples_per_second": 42.847, + "eval_steps_per_second": 10.712, + "step": 2020 + }, + { + "epoch": 2.87, + "learning_rate": 8.661284302027284e-07, + "loss": 0.0053, + "step": 2021 + }, + { + "epoch": 2.88, + "learning_rate": 8.465832024678211e-07, + "loss": 0.0016, + "step": 2022 + }, + { + "epoch": 2.88, + "learning_rate": 8.272600920768647e-07, + "loss": 0.0018, + "step": 2023 + }, + { + "epoch": 2.88, + "learning_rate": 8.081591423163115e-07, + "loss": 0.0093, + "step": 2024 + }, + { + "epoch": 2.88, + "learning_rate": 7.892803959749007e-07, + "loss": 0.0202, + "step": 2025 + }, + { + "epoch": 2.88, + "learning_rate": 7.706238953436585e-07, + "loss": 0.007, + "step": 2026 + }, + { + "epoch": 2.88, + "learning_rate": 7.521896822156982e-07, + "loss": 0.008, + "step": 2027 + }, + { + "epoch": 2.88, + "learning_rate": 7.339777978862205e-07, + "loss": 0.004, + "step": 2028 + }, + { + "epoch": 2.89, + "learning_rate": 7.159882831523468e-07, + "loss": 0.0092, + "step": 2029 + }, + { + "epoch": 2.89, + "learning_rate": 6.982211783130854e-07, + "loss": 0.0089, + "step": 2030 + }, + { + "epoch": 2.89, + "learning_rate": 6.806765231692103e-07, + "loss": 0.003, + "step": 2031 + }, + { + "epoch": 2.89, + "learning_rate": 6.633543570231493e-07, + "loss": 0.0141, + "step": 2032 + }, + { + "epoch": 2.89, + "learning_rate": 6.462547186789625e-07, + "loss": 0.0043, + "step": 2033 + }, + { + "epoch": 2.89, + "learning_rate": 6.293776464421974e-07, + "loss": 0.0032, + "step": 2034 + }, + { + "epoch": 2.89, + "learning_rate": 6.127231781198117e-07, + "loss": 0.0046, + "step": 2035 + }, + { + "epoch": 2.9, + "learning_rate": 5.962913510201396e-07, + "loss": 0.0046, + "step": 2036 + }, + { + "epoch": 2.9, + "learning_rate": 5.800822019527253e-07, + "loss": 0.0051, + "step": 2037 + }, + { + "epoch": 2.9, + "learning_rate": 5.64095767228312e-07, + "loss": 0.0064, + "step": 2038 + }, + { + "epoch": 2.9, + "learning_rate": 5.483320826587091e-07, + "loss": 0.0088, + "step": 2039 + }, + { + "epoch": 2.9, + "learning_rate": 5.327911835567578e-07, + "loss": 0.0032, + "step": 2040 + }, + { + "epoch": 2.9, + "eval_loss": 0.03432368114590645, + "eval_runtime": 23.1099, + "eval_samples_per_second": 43.272, + "eval_steps_per_second": 10.818, + "step": 2040 + }, + { + "epoch": 2.9, + "learning_rate": 5.174731047362102e-07, + "loss": 0.0096, + "step": 2041 + }, + { + "epoch": 2.9, + "learning_rate": 5.02377880511673e-07, + "loss": 0.004, + "step": 2042 + }, + { + "epoch": 2.91, + "learning_rate": 4.875055446985411e-07, + "loss": 0.0045, + "step": 2043 + }, + { + "epoch": 2.91, + "learning_rate": 4.7285613061290865e-07, + "loss": 0.0064, + "step": 2044 + }, + { + "epoch": 2.91, + "learning_rate": 4.5842967107145864e-07, + "loss": 0.0069, + "step": 2045 + }, + { + "epoch": 2.91, + "learning_rate": 4.4422619839148415e-07, + "loss": 0.0121, + "step": 2046 + }, + { + "epoch": 2.91, + "learning_rate": 4.302457443907004e-07, + "loss": 0.0106, + "step": 2047 + }, + { + "epoch": 2.91, + "learning_rate": 4.164883403872777e-07, + "loss": 0.0014, + "step": 2048 + }, + { + "epoch": 2.91, + "learning_rate": 4.0295401719967484e-07, + "loss": 0.0046, + "step": 2049 + }, + { + "epoch": 2.92, + "learning_rate": 3.8964280514666164e-07, + "loss": 0.0028, + "step": 2050 + }, + { + "epoch": 2.92, + "learning_rate": 3.7655473404718533e-07, + "loss": 0.0048, + "step": 2051 + }, + { + "epoch": 2.92, + "learning_rate": 3.6368983322033754e-07, + "loss": 0.0113, + "step": 2052 + }, + { + "epoch": 2.92, + "learning_rate": 3.510481314852543e-07, + "loss": 0.0023, + "step": 2053 + }, + { + "epoch": 2.92, + "learning_rate": 3.3862965716112693e-07, + "loss": 0.0013, + "step": 2054 + }, + { + "epoch": 2.92, + "learning_rate": 3.264344380670137e-07, + "loss": 0.0157, + "step": 2055 + }, + { + "epoch": 2.92, + "learning_rate": 3.1446250152191713e-07, + "loss": 0.006, + "step": 2056 + }, + { + "epoch": 2.93, + "learning_rate": 3.0271387434461786e-07, + "loss": 0.0157, + "step": 2057 + }, + { + "epoch": 2.93, + "learning_rate": 2.9118858285366314e-07, + "loss": 0.0046, + "step": 2058 + }, + { + "epoch": 2.93, + "learning_rate": 2.798866528673005e-07, + "loss": 0.0014, + "step": 2059 + }, + { + "epoch": 2.93, + "learning_rate": 2.688081097034112e-07, + "loss": 0.0072, + "step": 2060 + }, + { + "epoch": 2.93, + "eval_loss": 0.03417177498340607, + "eval_runtime": 23.2162, + "eval_samples_per_second": 43.073, + "eval_steps_per_second": 10.768, + "step": 2060 + }, + { + "epoch": 2.93, + "learning_rate": 2.579529781794543e-07, + "loss": 0.0029, + "step": 2061 + }, + { + "epoch": 2.93, + "learning_rate": 2.473212826124449e-07, + "loss": 0.0035, + "step": 2062 + }, + { + "epoch": 2.93, + "learning_rate": 2.3691304681882075e-07, + "loss": 0.0086, + "step": 2063 + }, + { + "epoch": 2.94, + "learning_rate": 2.2672829411448659e-07, + "loss": 0.0062, + "step": 2064 + }, + { + "epoch": 2.94, + "learning_rate": 2.167670473146921e-07, + "loss": 0.012, + "step": 2065 + }, + { + "epoch": 2.94, + "learning_rate": 2.0702932873399862e-07, + "loss": 0.0079, + "step": 2066 + }, + { + "epoch": 2.94, + "learning_rate": 1.9751516018624571e-07, + "loss": 0.0031, + "step": 2067 + }, + { + "epoch": 2.94, + "learning_rate": 1.8822456298447366e-07, + "loss": 0.0015, + "step": 2068 + }, + { + "epoch": 2.94, + "learning_rate": 1.7915755794092327e-07, + "loss": 0.0096, + "step": 2069 + }, + { + "epoch": 2.94, + "learning_rate": 1.703141653669249e-07, + "loss": 0.0126, + "step": 2070 + }, + { + "epoch": 2.95, + "learning_rate": 1.6169440507289857e-07, + "loss": 0.0015, + "step": 2071 + }, + { + "epoch": 2.95, + "learning_rate": 1.5329829636829828e-07, + "loss": 0.0046, + "step": 2072 + }, + { + "epoch": 2.95, + "learning_rate": 1.4512585806157886e-07, + "loss": 0.0042, + "step": 2073 + }, + { + "epoch": 2.95, + "learning_rate": 1.37177108460107e-07, + "loss": 0.022, + "step": 2074 + }, + { + "epoch": 2.95, + "learning_rate": 1.2945206537019473e-07, + "loss": 0.0106, + "step": 2075 + }, + { + "epoch": 2.95, + "learning_rate": 1.219507460969993e-07, + "loss": 0.0055, + "step": 2076 + }, + { + "epoch": 2.95, + "learning_rate": 1.1467316744452339e-07, + "loss": 0.0025, + "step": 2077 + }, + { + "epoch": 2.96, + "learning_rate": 1.0761934571553722e-07, + "loss": 0.0048, + "step": 2078 + }, + { + "epoch": 2.96, + "learning_rate": 1.0078929671157866e-07, + "loss": 0.0214, + "step": 2079 + }, + { + "epoch": 2.96, + "learning_rate": 9.418303573289766e-08, + "loss": 0.0074, + "step": 2080 + }, + { + "epoch": 2.96, + "eval_loss": 0.03440214321017265, + "eval_runtime": 22.8935, + "eval_samples_per_second": 43.68, + "eval_steps_per_second": 10.92, + "step": 2080 + }, + { + "epoch": 2.96, + "learning_rate": 8.780057757843407e-08, + "loss": 0.0023, + "step": 2081 + }, + { + "epoch": 2.96, + "learning_rate": 8.164193654577323e-08, + "loss": 0.0149, + "step": 2082 + }, + { + "epoch": 2.96, + "learning_rate": 7.570712643111266e-08, + "loss": 0.0147, + "step": 2083 + }, + { + "epoch": 2.96, + "learning_rate": 6.999616052927315e-08, + "loss": 0.0029, + "step": 2084 + }, + { + "epoch": 2.97, + "learning_rate": 6.450905163357667e-08, + "loss": 0.001, + "step": 2085 + }, + { + "epoch": 2.97, + "learning_rate": 5.924581203592405e-08, + "loss": 0.0134, + "step": 2086 + }, + { + "epoch": 2.97, + "learning_rate": 5.4206453526695064e-08, + "loss": 0.0013, + "step": 2087 + }, + { + "epoch": 2.97, + "learning_rate": 4.939098739473735e-08, + "loss": 0.0095, + "step": 2088 + }, + { + "epoch": 2.97, + "learning_rate": 4.47994244273775e-08, + "loss": 0.0057, + "step": 2089 + }, + { + "epoch": 2.97, + "learning_rate": 4.043177491033223e-08, + "loss": 0.0032, + "step": 2090 + }, + { + "epoch": 2.97, + "learning_rate": 3.628804862776391e-08, + "loss": 0.0046, + "step": 2091 + }, + { + "epoch": 2.98, + "learning_rate": 3.2368254862158445e-08, + "loss": 0.0035, + "step": 2092 + }, + { + "epoch": 2.98, + "learning_rate": 2.8672402394425146e-08, + "loss": 0.0073, + "step": 2093 + }, + { + "epoch": 2.98, + "learning_rate": 2.5200499503774676e-08, + "loss": 0.0099, + "step": 2094 + }, + { + "epoch": 2.98, + "learning_rate": 2.1952553967741208e-08, + "loss": 0.0023, + "step": 2095 + }, + { + "epoch": 2.98, + "learning_rate": 1.8928573062171328e-08, + "loss": 0.014, + "step": 2096 + }, + { + "epoch": 2.98, + "learning_rate": 1.612856356121295e-08, + "loss": 0.0084, + "step": 2097 + }, + { + "epoch": 2.98, + "learning_rate": 1.3552531737259788e-08, + "loss": 0.0127, + "step": 2098 + }, + { + "epoch": 2.99, + "learning_rate": 1.1200483360984671e-08, + "loss": 0.0065, + "step": 2099 + }, + { + "epoch": 2.99, + "learning_rate": 9.07242370129513e-09, + "loss": 0.0111, + "step": 2100 + }, + { + "epoch": 2.99, + "eval_loss": 0.03416823968291283, + "eval_runtime": 23.1725, + "eval_samples_per_second": 43.155, + "eval_steps_per_second": 10.789, + "step": 2100 + }, + { + "epoch": 2.99, + "learning_rate": 7.168357525355607e-09, + "loss": 0.0019, + "step": 2101 + }, + { + "epoch": 2.99, + "learning_rate": 5.488289098520838e-09, + "loss": 0.0062, + "step": 2102 + }, + { + "epoch": 2.99, + "learning_rate": 4.032222184380263e-09, + "loss": 0.0008, + "step": 2103 + }, + { + "epoch": 2.99, + "learning_rate": 2.80016004472472e-09, + "loss": 0.0086, + "step": 2104 + }, + { + "epoch": 2.99, + "learning_rate": 1.792105439546443e-09, + "loss": 0.0015, + "step": 2105 + }, + { + "epoch": 3.0, + "learning_rate": 1.0080606270279624e-09, + "loss": 0.0039, + "step": 2106 + }, + { + "epoch": 3.0, + "learning_rate": 4.480273635310006e-10, + "loss": 0.0042, + "step": 2107 + }, + { + "epoch": 3.0, + "learning_rate": 1.1200690360757548e-10, + "loss": 0.0081, + "step": 2108 + }, + { + "epoch": 3.0, + "learning_rate": 0.0, + "loss": 0.0024, + "step": 2109 + } + ], + "logging_steps": 1, + "max_steps": 2109, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 2.772862993057186e+18, + "trial_name": null, + "trial_params": null +}