{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 22.986753463745117,
"learning_rate": 0.00198,
"loss": 1.7003,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 8.073963165283203,
"learning_rate": 0.00196,
"loss": 1.9785,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 7.6197099685668945,
"learning_rate": 0.0019399999999999999,
"loss": 1.9353,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 6.2731146812438965,
"learning_rate": 0.00192,
"loss": 1.5962,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 5.425559043884277,
"learning_rate": 0.0019,
"loss": 1.389,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 4.872774600982666,
"learning_rate": 0.00188,
"loss": 1.4156,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 5.199490070343018,
"learning_rate": 0.00186,
"loss": 1.6583,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 4.413191318511963,
"learning_rate": 0.00184,
"loss": 1.4334,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 5.90674352645874,
"learning_rate": 0.00182,
"loss": 1.6046,
"step": 90
},
{
"epoch": 0.02,
"grad_norm": 6.410930633544922,
"learning_rate": 0.0018000000000000002,
"loss": 1.5504,
"step": 100
},
{
"epoch": 0.02,
"grad_norm": 3.529223680496216,
"learning_rate": 0.0017800000000000001,
"loss": 1.6463,
"step": 110
},
{
"epoch": 0.02,
"grad_norm": 4.781284332275391,
"learning_rate": 0.00176,
"loss": 1.6136,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 5.6382951736450195,
"learning_rate": 0.00174,
"loss": 1.5105,
"step": 130
},
{
"epoch": 0.03,
"grad_norm": 4.392839431762695,
"learning_rate": 0.00172,
"loss": 1.6061,
"step": 140
},
{
"epoch": 0.03,
"grad_norm": 3.9011926651000977,
"learning_rate": 0.0017,
"loss": 1.6188,
"step": 150
},
{
"epoch": 0.03,
"grad_norm": 4.002920627593994,
"learning_rate": 0.00168,
"loss": 1.4177,
"step": 160
},
{
"epoch": 0.03,
"grad_norm": 4.34838342666626,
"learning_rate": 0.00166,
"loss": 1.5689,
"step": 170
},
{
"epoch": 0.04,
"grad_norm": 8.142854690551758,
"learning_rate": 0.00164,
"loss": 1.5804,
"step": 180
},
{
"epoch": 0.04,
"grad_norm": 5.837989330291748,
"learning_rate": 0.0016200000000000001,
"loss": 1.5981,
"step": 190
},
{
"epoch": 0.04,
"grad_norm": 2.33852219581604,
"learning_rate": 0.0016,
"loss": 1.4347,
"step": 200
},
{
"epoch": 0.04,
"grad_norm": 3.069826602935791,
"learning_rate": 0.00158,
"loss": 1.4809,
"step": 210
},
{
"epoch": 0.04,
"grad_norm": 2.71095609664917,
"learning_rate": 0.0015600000000000002,
"loss": 1.388,
"step": 220
},
{
"epoch": 0.05,
"grad_norm": 4.450407981872559,
"learning_rate": 0.0015400000000000001,
"loss": 1.603,
"step": 230
},
{
"epoch": 0.05,
"grad_norm": 4.475738048553467,
"learning_rate": 0.00152,
"loss": 1.5731,
"step": 240
},
{
"epoch": 0.05,
"grad_norm": 3.051819086074829,
"learning_rate": 0.0015,
"loss": 1.5133,
"step": 250
},
{
"epoch": 0.05,
"grad_norm": 5.014269828796387,
"learning_rate": 0.00148,
"loss": 1.5458,
"step": 260
},
{
"epoch": 0.05,
"grad_norm": 2.558957815170288,
"learning_rate": 0.00146,
"loss": 1.4918,
"step": 270
},
{
"epoch": 0.06,
"grad_norm": 4.6234660148620605,
"learning_rate": 0.0014399999999999999,
"loss": 1.5247,
"step": 280
},
{
"epoch": 0.06,
"grad_norm": 2.9923095703125,
"learning_rate": 0.00142,
"loss": 1.6671,
"step": 290
},
{
"epoch": 0.06,
"grad_norm": 7.883978366851807,
"learning_rate": 0.0014,
"loss": 1.5732,
"step": 300
},
{
"epoch": 0.06,
"grad_norm": 3.3218066692352295,
"learning_rate": 0.00138,
"loss": 1.6297,
"step": 310
},
{
"epoch": 0.06,
"grad_norm": 9.045559883117676,
"learning_rate": 0.00136,
"loss": 1.6581,
"step": 320
},
{
"epoch": 0.07,
"grad_norm": 2.832301139831543,
"learning_rate": 0.00134,
"loss": 1.6966,
"step": 330
},
{
"epoch": 0.07,
"grad_norm": 3.6719107627868652,
"learning_rate": 0.00132,
"loss": 1.5904,
"step": 340
},
{
"epoch": 0.07,
"grad_norm": 5.4335455894470215,
"learning_rate": 0.0013000000000000002,
"loss": 1.6643,
"step": 350
},
{
"epoch": 0.07,
"grad_norm": 3.2848339080810547,
"learning_rate": 0.00128,
"loss": 1.4174,
"step": 360
},
{
"epoch": 0.07,
"grad_norm": 2.8206841945648193,
"learning_rate": 0.00126,
"loss": 1.7362,
"step": 370
},
{
"epoch": 0.08,
"grad_norm": 3.389599084854126,
"learning_rate": 0.00124,
"loss": 1.6058,
"step": 380
},
{
"epoch": 0.08,
"grad_norm": 4.887266159057617,
"learning_rate": 0.00122,
"loss": 1.4604,
"step": 390
},
{
"epoch": 0.08,
"grad_norm": 2.9653384685516357,
"learning_rate": 0.0012,
"loss": 1.5152,
"step": 400
},
{
"epoch": 0.08,
"grad_norm": 2.5362136363983154,
"learning_rate": 0.00118,
"loss": 1.469,
"step": 410
},
{
"epoch": 0.08,
"grad_norm": 2.7318670749664307,
"learning_rate": 0.00116,
"loss": 1.4136,
"step": 420
},
{
"epoch": 0.09,
"grad_norm": 3.6364078521728516,
"learning_rate": 0.00114,
"loss": 1.6937,
"step": 430
},
{
"epoch": 0.09,
"grad_norm": 1.9428081512451172,
"learning_rate": 0.0011200000000000001,
"loss": 1.4825,
"step": 440
},
{
"epoch": 0.09,
"grad_norm": 2.1813700199127197,
"learning_rate": 0.0011,
"loss": 1.4593,
"step": 450
},
{
"epoch": 0.09,
"grad_norm": 4.612652778625488,
"learning_rate": 0.00108,
"loss": 1.389,
"step": 460
},
{
"epoch": 0.09,
"grad_norm": 2.5145719051361084,
"learning_rate": 0.0010600000000000002,
"loss": 1.3896,
"step": 470
},
{
"epoch": 0.1,
"grad_norm": 2.4980382919311523,
"learning_rate": 0.0010400000000000001,
"loss": 1.3725,
"step": 480
},
{
"epoch": 0.1,
"grad_norm": 2.6995227336883545,
"learning_rate": 0.00102,
"loss": 1.4769,
"step": 490
},
{
"epoch": 0.1,
"grad_norm": 2.1483154296875,
"learning_rate": 0.001,
"loss": 1.5983,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 4.0788232684018176e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}