TokFSM_k1_codebook_model / trainer_state.json
taufeeque's picture
Add model
d2a7b67
{
"best_metric": 0.6364516129032258,
"best_model_checkpoint": "output_toy/checkpoint-15500",
"epoch": 0.775,
"global_step": 15500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.0,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 3.5788,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 13.754842758178711,
"max_norm/layer2": 14.107428550720215,
"max_norm/layer3": 14.075852394104004,
"mean_norm": 8.452343925833702,
"mean_norm/layer0": 8.452010810375214,
"mean_norm/layer1": 8.451448321342468,
"mean_norm/layer2": 8.451122224330902,
"mean_norm/layer3": 8.454794347286224,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 1
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.03,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 2.2465,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 13.313672065734863,
"max_norm/layer2": 13.402200698852539,
"max_norm/layer3": 13.972790718078613,
"mean_norm": 8.24767641723156,
"mean_norm/layer0": 8.446629762649536,
"mean_norm/layer1": 8.26882529258728,
"mean_norm/layer2": 8.018843710422516,
"mean_norm/layer3": 8.256406903266907,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 500
},
{
"epoch": 0.03,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.3564852016178642,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.31,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.8386362791061401,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.7699,
"eval_samples_per_second": 803.731,
"eval_steps_per_second": 1.57,
"eval_transition_accuracy": 0.3554838709677419,
"step": 500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.05,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.5981,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 13.26521110534668,
"max_norm/layer2": 13.196029663085938,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.097165614366531,
"mean_norm/layer0": 8.447738409042358,
"mean_norm/layer1": 8.100942492485046,
"mean_norm/layer2": 7.6819451451301575,
"mean_norm/layer3": 8.158036410808563,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 1000
},
{
"epoch": 0.05,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4204176054226132,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.58,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.4651859998703003,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.3952,
"eval_samples_per_second": 811.185,
"eval_steps_per_second": 1.584,
"eval_transition_accuracy": 0.5014516129032258,
"step": 1000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.07,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.3928,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 13.196317672729492,
"max_norm/layer2": 12.821307182312012,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.05762867629528,
"mean_norm/layer0": 8.449561476707458,
"mean_norm/layer1": 8.045644223690033,
"mean_norm/layer2": 7.585634410381317,
"mean_norm/layer3": 8.149674594402313,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 1500
},
{
"epoch": 0.07,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4378030131182333,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.79,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.354053258895874,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.3967,
"eval_samples_per_second": 811.156,
"eval_steps_per_second": 1.584,
"eval_transition_accuracy": 0.555,
"step": 1500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.1,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.3405,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 12.928963661193848,
"max_norm/layer2": 12.629460334777832,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.056178480386734,
"mean_norm/layer0": 8.451180398464203,
"mean_norm/layer1": 8.046804785728455,
"mean_norm/layer2": 7.5705525279045105,
"mean_norm/layer3": 8.156176209449768,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 2000
},
{
"epoch": 0.1,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4426980807086614,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.82,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.3263764381408691,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.4538,
"eval_samples_per_second": 810.011,
"eval_steps_per_second": 1.582,
"eval_transition_accuracy": 0.5756451612903226,
"step": 2000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.12,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.3189,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 12.928963661193848,
"max_norm/layer2": 12.610198974609375,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.065410792827606,
"mean_norm/layer0": 8.452252388000488,
"mean_norm/layer1": 8.05913120508194,
"mean_norm/layer2": 7.583977818489075,
"mean_norm/layer3": 8.166281759738922,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 2500
},
{
"epoch": 0.12,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4445690245140256,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.86,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.3187371492385864,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.0204,
"eval_samples_per_second": 818.782,
"eval_steps_per_second": 1.599,
"eval_transition_accuracy": 0.5575806451612904,
"step": 2500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.15,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.308,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 12.928963661193848,
"max_norm/layer2": 12.610198974609375,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.076771080493927,
"mean_norm/layer0": 8.453002035617828,
"mean_norm/layer1": 8.070474624633789,
"mean_norm/layer2": 7.606890320777893,
"mean_norm/layer3": 8.176717340946198,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 3000
},
{
"epoch": 0.15,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.44684558778297245,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.82,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.3064292669296265,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.4873,
"eval_samples_per_second": 809.341,
"eval_steps_per_second": 1.581,
"eval_transition_accuracy": 0.557258064516129,
"step": 3000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.17,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.3009,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 12.928963661193848,
"max_norm/layer2": 12.480420112609863,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.087428167462349,
"mean_norm/layer0": 8.45363199710846,
"mean_norm/layer1": 8.081151723861694,
"mean_norm/layer2": 7.628681242465973,
"mean_norm/layer3": 8.186247706413269,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 3500
},
{
"epoch": 0.17,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.44931342658095474,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.87,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2963054180145264,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.5536,
"eval_samples_per_second": 808.018,
"eval_steps_per_second": 1.578,
"eval_transition_accuracy": 0.5762903225806452,
"step": 3500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.2,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2965,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 12.945528030395508,
"max_norm/layer2": 13.130059242248535,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.097855687141418,
"mean_norm/layer0": 8.454240918159485,
"mean_norm/layer1": 8.09092777967453,
"mean_norm/layer2": 7.649804890155792,
"mean_norm/layer3": 8.196449160575867,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 4000
},
{
"epoch": 0.2,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4493698961152805,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.9,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2922283411026,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.3081,
"eval_samples_per_second": 812.939,
"eval_steps_per_second": 1.588,
"eval_transition_accuracy": 0.567741935483871,
"step": 4000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.23,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2919,
"max_norm": 14.111713409423828,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 13.611448287963867,
"max_norm/layer2": 13.752634048461914,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.10820010304451,
"mean_norm/layer0": 8.454896569252014,
"mean_norm/layer1": 8.099043369293213,
"mean_norm/layer2": 7.671496093273163,
"mean_norm/layer3": 8.20736438035965,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 4500
},
{
"epoch": 0.23,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.449889175535187,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.91,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2880299091339111,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.5303,
"eval_samples_per_second": 808.482,
"eval_steps_per_second": 1.579,
"eval_transition_accuracy": 0.5820967741935484,
"step": 4500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.25,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2889,
"max_norm": 14.629088401794434,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 14.212156295776367,
"max_norm/layer2": 14.629088401794434,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.117734983563423,
"mean_norm/layer0": 8.455368101596832,
"mean_norm/layer1": 8.104798018932343,
"mean_norm/layer2": 7.693721830844879,
"mean_norm/layer3": 8.217051982879639,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 5000
},
{
"epoch": 0.25,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45008597786970966,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.9,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2855565547943115,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.1444,
"eval_samples_per_second": 816.252,
"eval_steps_per_second": 1.594,
"eval_transition_accuracy": 0.56,
"step": 5000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.28,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2855,
"max_norm": 15.594667434692383,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 14.633105278015137,
"max_norm/layer2": 15.594667434692383,
"max_norm/layer3": 13.539477348327637,
"mean_norm": 8.12691231071949,
"mean_norm/layer0": 8.45590054988861,
"mean_norm/layer1": 8.110510230064392,
"mean_norm/layer2": 7.71501624584198,
"mean_norm/layer3": 8.226222217082977,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 5500
},
{
"epoch": 0.28,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4503207469549705,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.9,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2815940380096436,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.4723,
"eval_samples_per_second": 809.64,
"eval_steps_per_second": 1.581,
"eval_transition_accuracy": 0.6016129032258064,
"step": 5500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.3,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2828,
"max_norm": 16.523221969604492,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 15.140296936035156,
"max_norm/layer2": 16.523221969604492,
"max_norm/layer3": 15.213438987731934,
"mean_norm": 8.136623591184616,
"mean_norm/layer0": 8.456406712532043,
"mean_norm/layer1": 8.11625623703003,
"mean_norm/layer2": 7.738001704216003,
"mean_norm/layer3": 8.235829710960388,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 6000
},
{
"epoch": 0.3,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45019699457123524,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.87,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2844185829162598,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.4554,
"eval_samples_per_second": 809.979,
"eval_steps_per_second": 1.582,
"eval_transition_accuracy": 0.5733870967741935,
"step": 6000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.33,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2805,
"max_norm": 17.3682861328125,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 15.587692260742188,
"max_norm/layer2": 17.3682861328125,
"max_norm/layer3": 16.75887107849121,
"mean_norm": 8.145677655935287,
"mean_norm/layer0": 8.456890940666199,
"mean_norm/layer1": 8.120486974716187,
"mean_norm/layer2": 7.760386824607849,
"mean_norm/layer3": 8.244945883750916,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 6500
},
{
"epoch": 0.33,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45155730960875984,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.95,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.277733325958252,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.4778,
"eval_samples_per_second": 809.529,
"eval_steps_per_second": 1.581,
"eval_transition_accuracy": 0.6083870967741936,
"step": 6500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.35,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2793,
"max_norm": 18.367448806762695,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 16.05299949645996,
"max_norm/layer2": 18.142847061157227,
"max_norm/layer3": 18.367448806762695,
"mean_norm": 8.15514886379242,
"mean_norm/layer0": 8.45737361907959,
"mean_norm/layer1": 8.127346932888031,
"mean_norm/layer2": 7.781527459621429,
"mean_norm/layer3": 8.254347443580627,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 7000
},
{
"epoch": 0.35,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45114255890132876,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.93,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2795634269714355,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.6598,
"eval_samples_per_second": 805.906,
"eval_steps_per_second": 1.574,
"eval_transition_accuracy": 0.5680645161290323,
"step": 7000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.38,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2785,
"max_norm": 20.019760131835938,
"max_norm/layer0": 14.111713409423828,
"max_norm/layer1": 16.42165756225586,
"max_norm/layer2": 19.017709732055664,
"max_norm/layer3": 20.019760131835938,
"mean_norm": 8.164311796426773,
"mean_norm/layer0": 8.457763373851776,
"mean_norm/layer1": 8.13219028711319,
"mean_norm/layer2": 7.803518235683441,
"mean_norm/layer3": 8.263775289058685,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 7500
},
{
"epoch": 0.38,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45188651497908466,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.95,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2748253345489502,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.4299,
"eval_samples_per_second": 810.489,
"eval_steps_per_second": 1.583,
"eval_transition_accuracy": 0.5919354838709677,
"step": 7500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.4,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2764,
"max_norm": 21.821395874023438,
"max_norm/layer0": 14.158592224121094,
"max_norm/layer1": 16.7973690032959,
"max_norm/layer2": 19.965808868408203,
"max_norm/layer3": 21.821395874023438,
"mean_norm": 8.173083677887917,
"mean_norm/layer0": 8.458155512809753,
"mean_norm/layer1": 8.137401163578033,
"mean_norm/layer2": 7.8241875767707825,
"mean_norm/layer3": 8.272590458393097,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 8000
},
{
"epoch": 0.4,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.451844222902313,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.9,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.276716709136963,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.3344,
"eval_samples_per_second": 812.407,
"eval_steps_per_second": 1.587,
"eval_transition_accuracy": 0.5759677419354838,
"step": 8000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.42,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2763,
"max_norm": 23.506425857543945,
"max_norm/layer0": 14.451132774353027,
"max_norm/layer1": 17.121503829956055,
"max_norm/layer2": 20.906761169433594,
"max_norm/layer3": 23.506425857543945,
"mean_norm": 8.181086376309395,
"mean_norm/layer0": 8.458472549915314,
"mean_norm/layer1": 8.140588343143463,
"mean_norm/layer2": 7.84406965970993,
"mean_norm/layer3": 8.281214952468872,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 8500
},
{
"epoch": 0.42,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45069200410617616,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.94,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2800538539886475,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.8942,
"eval_samples_per_second": 801.287,
"eval_steps_per_second": 1.565,
"eval_transition_accuracy": 0.582741935483871,
"step": 8500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.45,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2755,
"max_norm": 25.07321548461914,
"max_norm/layer0": 14.874269485473633,
"max_norm/layer1": 17.39398956298828,
"max_norm/layer2": 21.67055892944336,
"max_norm/layer3": 25.07321548461914,
"mean_norm": 8.188907638192177,
"mean_norm/layer0": 8.458899140357971,
"mean_norm/layer1": 8.143711388111115,
"mean_norm/layer2": 7.863752484321594,
"mean_norm/layer3": 8.289267539978027,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 9000
},
{
"epoch": 0.45,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4516255536417323,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.9,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2754778861999512,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.3576,
"eval_samples_per_second": 811.942,
"eval_steps_per_second": 1.586,
"eval_transition_accuracy": 0.5764516129032258,
"step": 9000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.47,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2746,
"max_norm": 26.98938751220703,
"max_norm/layer0": 15.152251243591309,
"max_norm/layer1": 17.66726303100586,
"max_norm/layer2": 22.432802200317383,
"max_norm/layer3": 26.98938751220703,
"mean_norm": 8.197539746761322,
"mean_norm/layer0": 8.4593066573143,
"mean_norm/layer1": 8.148526132106781,
"mean_norm/layer2": 7.884801626205444,
"mean_norm/layer3": 8.297524571418762,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 9500
},
{
"epoch": 0.47,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45229261503444884,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.9,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2736179828643799,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.4635,
"eval_samples_per_second": 809.817,
"eval_steps_per_second": 1.582,
"eval_transition_accuracy": 0.5864516129032258,
"step": 9500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.5,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2734,
"max_norm": 28.549026489257812,
"max_norm/layer0": 15.429606437683105,
"max_norm/layer1": 17.9191837310791,
"max_norm/layer2": 23.421247482299805,
"max_norm/layer3": 28.549026489257812,
"mean_norm": 8.206039026379585,
"mean_norm/layer0": 8.459603905677795,
"mean_norm/layer1": 8.153472065925598,
"mean_norm/layer2": 7.905435502529144,
"mean_norm/layer3": 8.305644631385803,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 10000
},
{
"epoch": 0.5,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4518793061023622,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.91,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2739558219909668,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.3957,
"eval_samples_per_second": 811.175,
"eval_steps_per_second": 1.584,
"eval_transition_accuracy": 0.5779032258064516,
"step": 10000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.53,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2732,
"max_norm": 30.3062801361084,
"max_norm/layer0": 15.69118595123291,
"max_norm/layer1": 18.259578704833984,
"max_norm/layer2": 24.377281188964844,
"max_norm/layer3": 30.3062801361084,
"mean_norm": 8.214424923062325,
"mean_norm/layer0": 8.459942817687988,
"mean_norm/layer1": 8.156991243362427,
"mean_norm/layer2": 7.927173614501953,
"mean_norm/layer3": 8.31359201669693,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 10500
},
{
"epoch": 0.53,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45159791961429624,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.89,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2743829488754272,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.5803,
"eval_samples_per_second": 807.485,
"eval_steps_per_second": 1.577,
"eval_transition_accuracy": 0.5879032258064516,
"step": 10500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.55,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2723,
"max_norm": 31.752639770507812,
"max_norm/layer0": 16.03121566772461,
"max_norm/layer1": 18.51296043395996,
"max_norm/layer2": 25.478057861328125,
"max_norm/layer3": 31.752639770507812,
"mean_norm": 8.222758993506432,
"mean_norm/layer0": 8.460269570350647,
"mean_norm/layer1": 8.1603884100914,
"mean_norm/layer2": 7.949049711227417,
"mean_norm/layer3": 8.321328282356262,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 11000
},
{
"epoch": 0.55,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45252594234436516,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.89,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.269043207168579,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.1138,
"eval_samples_per_second": 816.875,
"eval_steps_per_second": 1.595,
"eval_transition_accuracy": 0.5811290322580646,
"step": 11000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.57,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2712,
"max_norm": 33.44036102294922,
"max_norm/layer0": 16.178028106689453,
"max_norm/layer1": 18.79450798034668,
"max_norm/layer2": 26.372129440307617,
"max_norm/layer3": 33.44036102294922,
"mean_norm": 8.230609133839607,
"mean_norm/layer0": 8.460545778274536,
"mean_norm/layer1": 8.163512825965881,
"mean_norm/layer2": 7.969985008239746,
"mean_norm/layer3": 8.328392922878265,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 11500
},
{
"epoch": 0.57,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45260740265132876,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.93,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2705051898956299,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.2988,
"eval_samples_per_second": 813.126,
"eval_steps_per_second": 1.588,
"eval_transition_accuracy": 0.5779032258064516,
"step": 11500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.6,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2716,
"max_norm": 35.007076263427734,
"max_norm/layer0": 16.497554779052734,
"max_norm/layer1": 18.986597061157227,
"max_norm/layer2": 27.318687438964844,
"max_norm/layer3": 35.007076263427734,
"mean_norm": 8.238363325595856,
"mean_norm/layer0": 8.460874915122986,
"mean_norm/layer1": 8.166603803634644,
"mean_norm/layer2": 7.990897297859192,
"mean_norm/layer3": 8.335077285766602,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 12000
},
{
"epoch": 0.6,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4526636318897638,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.89,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2700704336166382,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.2954,
"eval_samples_per_second": 813.195,
"eval_steps_per_second": 1.588,
"eval_transition_accuracy": 0.5759677419354838,
"step": 12000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.62,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2708,
"max_norm": 36.530174255371094,
"max_norm/layer0": 16.774707794189453,
"max_norm/layer1": 19.286115646362305,
"max_norm/layer2": 28.3411808013916,
"max_norm/layer3": 36.530174255371094,
"mean_norm": 8.246006086468697,
"mean_norm/layer0": 8.461170196533203,
"mean_norm/layer1": 8.169004082679749,
"mean_norm/layer2": 8.012158274650574,
"mean_norm/layer3": 8.341691792011261,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 12500
},
{
"epoch": 0.62,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45218808632197344,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.95,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2715603113174438,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.6419,
"eval_samples_per_second": 806.261,
"eval_steps_per_second": 1.575,
"eval_transition_accuracy": 0.5485483870967742,
"step": 12500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.65,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2705,
"max_norm": 38.14279556274414,
"max_norm/layer0": 16.91411590576172,
"max_norm/layer1": 19.68568229675293,
"max_norm/layer2": 29.167861938476562,
"max_norm/layer3": 38.14279556274414,
"mean_norm": 8.253673061728477,
"mean_norm/layer0": 8.461421430110931,
"mean_norm/layer1": 8.171639680862427,
"mean_norm/layer2": 8.033495247364044,
"mean_norm/layer3": 8.348135888576508,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 13000
},
{
"epoch": 0.65,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4528599536325049,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.93,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2675950527191162,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.2422,
"eval_samples_per_second": 814.27,
"eval_steps_per_second": 1.59,
"eval_transition_accuracy": 0.5733870967741935,
"step": 13000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.68,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2696,
"max_norm": 39.6273307800293,
"max_norm/layer0": 17.14823341369629,
"max_norm/layer1": 19.711994171142578,
"max_norm/layer2": 30.05324363708496,
"max_norm/layer3": 39.6273307800293,
"mean_norm": 8.261498123407364,
"mean_norm/layer0": 8.46165120601654,
"mean_norm/layer1": 8.17492812871933,
"mean_norm/layer2": 8.055188477039337,
"mean_norm/layer3": 8.354224681854248,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 13500
},
{
"epoch": 0.68,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4518833911325049,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.91,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2716896533966064,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.327,
"eval_samples_per_second": 812.557,
"eval_steps_per_second": 1.587,
"eval_transition_accuracy": 0.5993548387096774,
"step": 13500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.7,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2687,
"max_norm": 41.1092643737793,
"max_norm/layer0": 17.324541091918945,
"max_norm/layer1": 19.974578857421875,
"max_norm/layer2": 30.87788200378418,
"max_norm/layer3": 41.1092643737793,
"mean_norm": 8.26871033012867,
"mean_norm/layer0": 8.46183955669403,
"mean_norm/layer1": 8.177218735218048,
"mean_norm/layer2": 8.076003432273865,
"mean_norm/layer3": 8.359779596328735,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 14000
},
{
"epoch": 0.7,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45239882581815943,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.9,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2687005996704102,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.5627,
"eval_samples_per_second": 807.837,
"eval_steps_per_second": 1.578,
"eval_transition_accuracy": 0.5756451612903226,
"step": 14000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.72,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2685,
"max_norm": 42.766014099121094,
"max_norm/layer0": 17.42141342163086,
"max_norm/layer1": 20.071725845336914,
"max_norm/layer2": 31.913164138793945,
"max_norm/layer3": 42.766014099121094,
"mean_norm": 8.276251748204231,
"mean_norm/layer0": 8.46206510066986,
"mean_norm/layer1": 8.179498374462128,
"mean_norm/layer2": 8.097876787185669,
"mean_norm/layer3": 8.365566730499268,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 14500
},
{
"epoch": 0.72,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45213714359313484,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.89,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2709327936172485,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.2398,
"eval_samples_per_second": 814.319,
"eval_steps_per_second": 1.59,
"eval_transition_accuracy": 0.612741935483871,
"step": 14500
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.75,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2685,
"max_norm": 44.61077117919922,
"max_norm/layer0": 17.665727615356445,
"max_norm/layer1": 20.15743637084961,
"max_norm/layer2": 32.966209411621094,
"max_norm/layer3": 44.61077117919922,
"mean_norm": 8.283790707588196,
"mean_norm/layer0": 8.462257981300354,
"mean_norm/layer1": 8.181303024291992,
"mean_norm/layer2": 8.120182931423187,
"mean_norm/layer3": 8.37141889333725,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 15000
},
{
"epoch": 0.75,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.4519187146284449,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.91,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2706036567687988,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 40.0952,
"eval_samples_per_second": 817.256,
"eval_steps_per_second": 1.596,
"eval_transition_accuracy": 0.587258064516129,
"step": 15000
},
{
"MSE": 0.0,
"MSE/layer0": 0.0,
"MSE/layer1": 0.0,
"MSE/layer2": 0.0,
"MSE/layer3": 0.0,
"dead_code_fraction": 1.0,
"dead_code_fraction/layer0": 1.0,
"dead_code_fraction/layer1": 1.0,
"dead_code_fraction/layer2": 1.0,
"dead_code_fraction/layer3": 1.0,
"epoch": 0.78,
"input_norm": 0.0,
"input_norm/layer0": 0.0,
"input_norm/layer1": 0.0,
"input_norm/layer2": 0.0,
"input_norm/layer3": 0.0,
"learning_rate": 0.001,
"loss": 1.2675,
"max_norm": 46.33829879760742,
"max_norm/layer0": 17.856664657592773,
"max_norm/layer1": 20.084186553955078,
"max_norm/layer2": 33.940242767333984,
"max_norm/layer3": 46.33829879760742,
"mean_norm": 8.291451185941696,
"mean_norm/layer0": 8.462452054023743,
"mean_norm/layer1": 8.18280303478241,
"mean_norm/layer2": 8.143204748630524,
"mean_norm/layer3": 8.377344906330109,
"multicode_k": 1,
"output_norm": 0.0,
"output_norm/layer0": 0.0,
"output_norm/layer1": 0.0,
"output_norm/layer2": 0.0,
"output_norm/layer3": 0.0,
"step": 15500
},
{
"epoch": 0.78,
"eval_MSE/layer0": 0.0,
"eval_MSE/layer1": 0.0,
"eval_MSE/layer2": 0.0,
"eval_MSE/layer3": 0.0,
"eval_accuracy": 0.45268790177472934,
"eval_dead_code_fraction/layer0": 1.0,
"eval_dead_code_fraction/layer1": 1.0,
"eval_dead_code_fraction/layer2": 1.0,
"eval_dead_code_fraction/layer3": 1.0,
"eval_first_transition_accuracy": 0.96,
"eval_input_norm/layer0": 0.0,
"eval_input_norm/layer1": 0.0,
"eval_input_norm/layer2": 0.0,
"eval_input_norm/layer3": 0.0,
"eval_loss": 1.2690919637680054,
"eval_multicode_k": 1,
"eval_output_norm/layer0": 0.0,
"eval_output_norm/layer1": 0.0,
"eval_output_norm/layer2": 0.0,
"eval_output_norm/layer3": 0.0,
"eval_runtime": 39.8501,
"eval_samples_per_second": 822.281,
"eval_steps_per_second": 1.606,
"eval_transition_accuracy": 0.6364516129032258,
"step": 15500
}
],
"max_steps": 20000,
"num_train_epochs": 9223372036854775807,
"total_flos": 9712749772800000.0,
"trial_name": null,
"trial_params": null
}