wandb_version: 1 _n_gpu: desc: null value: 1 _name_or_path: desc: null value: ./ _wandb: desc: null value: cli_version: 0.12.10 framework: huggingface huggingface_version: 4.17.0.dev0 is_jupyter_run: false is_kaggle_kernel: false m: - 1: train/global_step 6: - 3 - 1: train/loss 5: 1 6: - 1 - 1: train/learning_rate 5: 1 6: - 1 - 1: train/epoch 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.ln_f\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.ln_f\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.ln_f\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.ln_f\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.ln_f\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.ln_f\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.wpe\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.wpe\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.wpe\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.wte\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.wte\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.transformer\.wte\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g.bins 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: train/train_runtime 5: 1 6: - 1 - 1: train/train_samples_per_second 5: 1 6: - 1 - 1: train/train_steps_per_second 5: 1 6: - 1 - 1: train/total_flos 5: 1 6: - 1 - 1: train/train_loss 5: 1 6: - 1 - 1: eval/loss 5: 1 6: - 1 - 1: eval/wer 5: 1 6: - 1 - 1: eval/runtime 5: 1 6: - 1 - 1: eval/samples_per_second 5: 1 6: - 1 - 1: eval/steps_per_second 5: 1 6: - 1 python_version: 3.9.5 start_time: 1646194412 t: 1: - 1 - 5 - 11 2: - 1 - 5 - 11 - 12 3: - 1 - 7 - 13 4: 3.9.5 5: 0.12.10 6: 4.17.0.dev0 8: - 5 adafactor: desc: null value: false adam_beta1: desc: null value: 0.9 adam_beta2: desc: null value: 0.999 adam_epsilon: desc: null value: 1.0e-08 add_cross_attention: desc: null value: false architectures: desc: null value: - SpeechEncoderDecoderModel bad_words_ids: desc: null value: null bf16: desc: null value: false bf16_full_eval: desc: null value: false bos_token_id: desc: null value: null chunk_size_feed_forward: desc: null value: 0 cross_attention_hidden_size: desc: null value: null dataloader_drop_last: desc: null value: false dataloader_num_workers: desc: null value: 0 dataloader_pin_memory: desc: null value: true ddp_bucket_cap_mb: desc: null value: None ddp_find_unused_parameters: desc: null value: None debug: desc: null value: '[]' decoder: desc: null value: _name_or_path: gpt2-medium activation_function: gelu_new add_cross_attention: true architectures: - GPT2LMHeadModel attn_pdrop: 0.0 bad_words_ids: null bos_token_id: 50256 chunk_size_feed_forward: 0 cross_attention_hidden_size: null decoder_start_token_id: null diversity_penalty: 0.0 do_sample: false early_stopping: false embd_pdrop: 0.0 encoder_no_repeat_ngram_size: 0 eos_token_id: 50256 finetuning_task: null forced_bos_token_id: null forced_eos_token_id: null id2label: '0': LABEL_0 '1': LABEL_1 initializer_range: 0.02 is_decoder: true is_encoder_decoder: false label2id: LABEL_0: 0 LABEL_1: 1 layer_norm_epsilon: 1.0e-05 length_penalty: 1.0 max_length: 20 min_length: 0 model_type: gpt2 n_ctx: 1024 n_embd: 1024 n_head: 16 n_inner: null n_layer: 24 n_positions: 1024 n_special: 0 no_repeat_ngram_size: 0 num_beam_groups: 1 num_beams: 1 num_return_sequences: 1 output_attentions: false output_hidden_states: false output_scores: false pad_token_id: null predict_special_tokens: true prefix: null problem_type: null pruned_heads: {} remove_invalid_values: false reorder_and_upcast_attn: false repetition_penalty: 1.0 resid_pdrop: 0.0 return_dict: true return_dict_in_generate: false scale_attn_by_inverse_layer_idx: false scale_attn_weights: true sep_token_id: null summary_activation: null summary_first_dropout: 0.0 summary_proj_to_labels: true summary_type: cls_index summary_use_proj: true task_specific_params: text-generation: do_sample: true max_length: 50 temperature: 1.0 tie_encoder_decoder: false tie_word_embeddings: true tokenizer_class: null top_k: 50 top_p: 1.0 torch_dtype: null torchscript: false transformers_version: 4.17.0.dev0 use_bfloat16: false use_cache: false vocab_size: 50257 decoder_start_token_id: desc: null value: 50256 deepspeed: desc: null value: None disable_tqdm: desc: null value: false diversity_penalty: desc: null value: 0.0 do_eval: desc: null value: true do_predict: desc: null value: false do_sample: desc: null value: false do_train: desc: null value: true early_stopping: desc: null value: false encoder: desc: null value: _name_or_path: facebook/wav2vec2-large-lv60 activation_dropout: 0.0 adapter_kernel_size: 3 adapter_stride: 2 add_adapter: true add_cross_attention: false apply_spec_augment: false architectures: - Wav2Vec2ForPreTraining attention_dropout: 0.0 bad_words_ids: null bos_token_id: 1 chunk_size_feed_forward: 0 classifier_proj_size: 256 codevector_dim: 768 contrastive_logits_temperature: 0.1 conv_bias: true conv_dim: - 512 - 512 - 512 - 512 - 512 - 512 - 512 conv_kernel: - 10 - 3 - 3 - 3 - 3 - 2 - 2 conv_stride: - 5 - 2 - 2 - 2 - 2 - 2 - 2 cross_attention_hidden_size: null ctc_loss_reduction: sum ctc_zero_infinity: false decoder_start_token_id: null diversity_loss_weight: 0.1 diversity_penalty: 0.0 do_sample: false do_stable_layer_norm: true early_stopping: false encoder_no_repeat_ngram_size: 0 eos_token_id: 2 feat_extract_activation: gelu feat_extract_dropout: 0.0 feat_extract_norm: layer feat_proj_dropout: 0.0 feat_quantizer_dropout: 0.0 final_dropout: 0.0 finetuning_task: null forced_bos_token_id: null forced_eos_token_id: null gradient_checkpointing: false hidden_act: gelu hidden_dropout: 0.0 hidden_dropout_prob: 0.0 hidden_size: 1024 id2label: '0': LABEL_0 '1': LABEL_1 initializer_range: 0.02 intermediate_size: 4096 is_decoder: false is_encoder_decoder: false label2id: LABEL_0: 0 LABEL_1: 1 layer_norm_eps: 1.0e-05 layerdrop: 0.0 length_penalty: 1.0 mask_feature_length: 10 mask_feature_min_masks: 0 mask_feature_prob: 0.0 mask_time_length: 10 mask_time_min_masks: 2 mask_time_prob: 0.0 max_length: 20 min_length: 0 model_type: wav2vec2 no_repeat_ngram_size: 0 num_adapter_layers: 3 num_attention_heads: 16 num_beam_groups: 1 num_beams: 1 num_codevector_groups: 2 num_codevectors_per_group: 320 num_conv_pos_embedding_groups: 16 num_conv_pos_embeddings: 128 num_feat_extract_layers: 7 num_hidden_layers: 24 num_negatives: 100 num_return_sequences: 1 output_attentions: false output_hidden_size: 1024 output_hidden_states: false output_scores: false pad_token_id: 0 prefix: null problem_type: null proj_codevector_dim: 768 pruned_heads: {} remove_invalid_values: false repetition_penalty: 1.0 return_dict: true return_dict_in_generate: false sep_token_id: null task_specific_params: null tdnn_dilation: - 1 - 2 - 3 - 1 - 1 tdnn_dim: - 512 - 512 - 512 - 512 - 1500 tdnn_kernel: - 5 - 3 - 3 - 1 - 1 temperature: 1.0 tie_encoder_decoder: false tie_word_embeddings: true tokenizer_class: null top_k: 50 top_p: 1.0 torch_dtype: null torchscript: false transformers_version: 4.17.0.dev0 use_bfloat16: false use_weighted_layer_sum: false vocab_size: 32 xvector_output_dim: 512 encoder_no_repeat_ngram_size: desc: null value: 0 eos_token_id: desc: null value: 50256 eval_accumulation_steps: desc: null value: None eval_batch_size: desc: null value: 12 eval_steps: desc: null value: 500 evaluation_strategy: desc: null value: steps finetuning_task: desc: null value: null forced_bos_token_id: desc: null value: null forced_eos_token_id: desc: null value: null fp16: desc: null value: true fp16_backend: desc: null value: auto fp16_full_eval: desc: null value: false fp16_opt_level: desc: null value: O1 generation_max_length: desc: null value: 40 generation_num_beams: desc: null value: 1 gradient_accumulation_steps: desc: null value: 8 gradient_checkpointing: desc: null value: true greater_is_better: desc: null value: None group_by_length: desc: null value: true half_precision_backend: desc: null value: amp hub_model_id: desc: null value: None hub_strategy: desc: null value: every_save hub_token: desc: null value: id2label: desc: null value: '0': LABEL_0 '1': LABEL_1 ignore_data_skip: desc: null value: false is_decoder: desc: null value: false is_encoder_decoder: desc: null value: true label2id: desc: null value: LABEL_0: 0 LABEL_1: 1 label_names: desc: null value: None label_smoothing_factor: desc: null value: 0.0 learning_rate: desc: null value: 0.0001 length_column_name: desc: null value: input_length length_penalty: desc: null value: 1.0 load_best_model_at_end: desc: null value: false local_rank: desc: null value: -1 log_level: desc: null value: -1 log_level_replica: desc: null value: -1 log_on_each_node: desc: null value: true logging_dir: desc: null value: ./runs/Mar02_04-12-50_sanchit--v100 logging_first_step: desc: null value: false logging_nan_inf_filter: desc: null value: true logging_steps: desc: null value: 1 logging_strategy: desc: null value: steps lr_scheduler_type: desc: null value: linear max_grad_norm: desc: null value: 1.0 max_length: desc: null value: 50 max_steps: desc: null value: -1 metric_for_best_model: desc: null value: None min_length: desc: null value: 0 model_type: desc: null value: speech-encoder-decoder mp_parameters: desc: null value: '' no_cuda: desc: null value: false no_repeat_ngram_size: desc: null value: 0 num_beam_groups: desc: null value: 1 num_beams: desc: null value: 1 num_return_sequences: desc: null value: 1 num_train_epochs: desc: null value: 1.0 optim: desc: null value: adamw_hf output_attentions: desc: null value: false output_dir: desc: null value: ./ output_hidden_states: desc: null value: false output_scores: desc: null value: false overwrite_output_dir: desc: null value: true pad_token_id: desc: null value: 50256 past_index: desc: null value: -1 per_device_eval_batch_size: desc: null value: 12 per_device_train_batch_size: desc: null value: 12 per_gpu_eval_batch_size: desc: null value: None per_gpu_train_batch_size: desc: null value: None predict_with_generate: desc: null value: true prediction_loss_only: desc: null value: false prefix: desc: null value: null problem_type: desc: null value: null processor_class: desc: null value: Wav2Vec2Processor pruned_heads: desc: null value: {} push_to_hub: desc: null value: true push_to_hub_model_id: desc: null value: None push_to_hub_organization: desc: null value: None push_to_hub_token: desc: null value: remove_invalid_values: desc: null value: false remove_unused_columns: desc: null value: true repetition_penalty: desc: null value: 1.0 report_to: desc: null value: '[''wandb'']' resume_from_checkpoint: desc: null value: None return_dict: desc: null value: true return_dict_in_generate: desc: null value: false run_name: desc: null value: ./ save_on_each_node: desc: null value: false save_steps: desc: null value: 500 save_strategy: desc: null value: steps save_total_limit: desc: null value: 1 seed: desc: null value: 42 sep_token_id: desc: null value: null sharded_ddp: desc: null value: '[]' skip_memory_metrics: desc: null value: true sortish_sampler: desc: null value: false task_specific_params: desc: null value: null temperature: desc: null value: 1.0 tf32: desc: null value: None tie_encoder_decoder: desc: null value: false tie_word_embeddings: desc: null value: false tokenizer_class: desc: null value: null top_k: desc: null value: 50 top_p: desc: null value: 1.0 torch_dtype: desc: null value: torch.float32 torchscript: desc: null value: false tpu_metrics_debug: desc: null value: false tpu_num_cores: desc: null value: None train_batch_size: desc: null value: 12 transformers_version: desc: null value: null use_bfloat16: desc: null value: false use_cache: desc: null value: false use_legacy_prediction_loop: desc: null value: false warmup_ratio: desc: null value: 0.0 warmup_steps: desc: null value: 500 weight_decay: desc: null value: 0.0 xpu_backend: desc: null value: None