diff --git "a/wandb/run-20220228_122826-3vepvy3m/files/config.yaml" "b/wandb/run-20220228_122826-3vepvy3m/files/config.yaml" new file mode 100644--- /dev/null +++ "b/wandb/run-20220228_122826-3vepvy3m/files/config.yaml" @@ -0,0 +1,11335 @@ +wandb_version: 1 + +_n_gpu: + desc: null + value: 1 +_name_or_path: + desc: null + value: ./ +_wandb: + desc: null + value: + cli_version: 0.12.10 + framework: huggingface + huggingface_version: 4.17.0.dev0 + is_jupyter_run: false + is_kaggle_kernel: false + m: + - 1: train/global_step + 6: + - 3 + - 1: train/loss + 5: 1 + 6: + - 1 + - 1: train/learning_rate + 5: 1 + 6: + - 1 + - 1: train/epoch + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.ln_f\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.ln_f\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.ln_f\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.ln_f\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.ln_f\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.ln_f\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.23\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.22\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.21\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.20\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.19\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.18\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.17\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.16\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.15\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.14\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.13\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.12\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.11\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.10\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.9\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.8\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.7\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.6\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.5\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.4\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.3\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.2\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.1\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.mlp\.c_fc\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.crossattention\.q_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_cross_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.attn\.c_attn\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.h\.0\.ln_1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.wpe\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.wpe\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.wpe\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.wte\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.wte\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.transformer\.wte\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: eval/loss + 5: 1 + 6: + - 1 + - 1: eval/wer + 5: 1 + 6: + - 1 + - 1: eval/runtime + 5: 1 + 6: + - 1 + - 1: eval/samples_per_second + 5: 1 + 6: + - 1 + - 1: eval/steps_per_second + 5: 1 + 6: + - 1 + python_version: 3.9.5 + start_time: 1646051306 + t: + 1: + - 1 + - 5 + - 11 + 3: + - 13 + 4: 3.9.5 + 5: 0.12.10 + 6: 4.17.0.dev0 + 8: + - 5 +adafactor: + desc: null + value: false +adam_beta1: + desc: null + value: 0.9 +adam_beta2: + desc: null + value: 0.999 +adam_epsilon: + desc: null + value: 1.0e-08 +add_cross_attention: + desc: null + value: false +architectures: + desc: null + value: + - SpeechEncoderDecoderModel +bad_words_ids: + desc: null + value: null +bf16: + desc: null + value: false +bf16_full_eval: + desc: null + value: false +bos_token_id: + desc: null + value: null +chunk_size_feed_forward: + desc: null + value: 0 +cross_attention_hidden_size: + desc: null + value: null +dataloader_drop_last: + desc: null + value: false +dataloader_num_workers: + desc: null + value: 0 +dataloader_pin_memory: + desc: null + value: true +ddp_bucket_cap_mb: + desc: null + value: None +ddp_find_unused_parameters: + desc: null + value: None +debug: + desc: null + value: '[]' +decoder: + desc: null + value: + _name_or_path: gpt2-medium + activation_function: gelu_new + add_cross_attention: true + architectures: + - GPT2LMHeadModel + attn_pdrop: 0.0 + bad_words_ids: null + bos_token_id: 50256 + chunk_size_feed_forward: 0 + cross_attention_hidden_size: null + decoder_start_token_id: null + diversity_penalty: 0.0 + do_sample: false + early_stopping: false + embd_pdrop: 0.0 + encoder_no_repeat_ngram_size: 0 + eos_token_id: 50256 + finetuning_task: null + forced_bos_token_id: null + forced_eos_token_id: null + id2label: + '0': LABEL_0 + '1': LABEL_1 + initializer_range: 0.02 + is_decoder: true + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + layer_norm_epsilon: 1.0e-05 + length_penalty: 1.0 + max_length: 20 + min_length: 0 + model_type: gpt2 + n_ctx: 1024 + n_embd: 1024 + n_head: 16 + n_inner: null + n_layer: 24 + n_positions: 1024 + n_special: 0 + no_repeat_ngram_size: 0 + num_beam_groups: 1 + num_beams: 1 + num_return_sequences: 1 + output_attentions: false + output_hidden_states: false + output_scores: false + pad_token_id: null + predict_special_tokens: true + prefix: null + problem_type: null + pruned_heads: {} + remove_invalid_values: false + reorder_and_upcast_attn: false + repetition_penalty: 1.0 + resid_pdrop: 0.0 + return_dict: true + return_dict_in_generate: false + scale_attn_by_inverse_layer_idx: false + scale_attn_weights: true + sep_token_id: null + summary_activation: null + summary_first_dropout: 0.0 + summary_proj_to_labels: true + summary_type: cls_index + summary_use_proj: true + task_specific_params: + text-generation: + do_sample: true + max_length: 50 + temperature: 1.0 + tie_encoder_decoder: false + tie_word_embeddings: true + tokenizer_class: null + top_k: 50 + top_p: 1.0 + torch_dtype: null + torchscript: false + transformers_version: 4.17.0.dev0 + use_bfloat16: false + use_cache: false + vocab_size: 50257 +decoder_start_token_id: + desc: null + value: 50256 +deepspeed: + desc: null + value: None +disable_tqdm: + desc: null + value: false +diversity_penalty: + desc: null + value: 0.0 +do_eval: + desc: null + value: true +do_predict: + desc: null + value: false +do_sample: + desc: null + value: false +do_train: + desc: null + value: true +early_stopping: + desc: null + value: false +encoder: + desc: null + value: + _name_or_path: facebook/wav2vec2-large-lv60 + activation_dropout: 0.0 + adapter_kernel_size: 3 + adapter_stride: 2 + add_adapter: true + add_cross_attention: false + apply_spec_augment: false + architectures: + - Wav2Vec2ForPreTraining + attention_dropout: 0.0 + bad_words_ids: null + bos_token_id: 1 + chunk_size_feed_forward: 0 + classifier_proj_size: 256 + codevector_dim: 768 + contrastive_logits_temperature: 0.1 + conv_bias: true + conv_dim: + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + conv_kernel: + - 10 + - 3 + - 3 + - 3 + - 3 + - 2 + - 2 + conv_stride: + - 5 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + cross_attention_hidden_size: null + ctc_loss_reduction: sum + ctc_zero_infinity: false + decoder_start_token_id: null + diversity_loss_weight: 0.1 + diversity_penalty: 0.0 + do_sample: false + do_stable_layer_norm: true + early_stopping: false + encoder_no_repeat_ngram_size: 0 + eos_token_id: 2 + feat_extract_activation: gelu + feat_extract_dropout: 0.0 + feat_extract_norm: layer + feat_proj_dropout: 0.0 + feat_quantizer_dropout: 0.0 + final_dropout: 0.0 + finetuning_task: null + forced_bos_token_id: null + forced_eos_token_id: null + gradient_checkpointing: false + hidden_act: gelu + hidden_dropout: 0.0 + hidden_dropout_prob: 0.0 + hidden_size: 1024 + id2label: + '0': LABEL_0 + '1': LABEL_1 + initializer_range: 0.02 + intermediate_size: 4096 + is_decoder: false + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + layer_norm_eps: 1.0e-05 + layerdrop: 0.0 + length_penalty: 1.0 + mask_feature_length: 10 + mask_feature_min_masks: 0 + mask_feature_prob: 0.0 + mask_time_length: 10 + mask_time_min_masks: 2 + mask_time_prob: 0.0 + max_length: 20 + min_length: 0 + model_type: wav2vec2 + no_repeat_ngram_size: 0 + num_adapter_layers: 3 + num_attention_heads: 16 + num_beam_groups: 1 + num_beams: 1 + num_codevector_groups: 2 + num_codevectors_per_group: 320 + num_conv_pos_embedding_groups: 16 + num_conv_pos_embeddings: 128 + num_feat_extract_layers: 7 + num_hidden_layers: 24 + num_negatives: 100 + num_return_sequences: 1 + output_attentions: false + output_hidden_size: 1024 + output_hidden_states: false + output_scores: false + pad_token_id: 0 + prefix: null + problem_type: null + proj_codevector_dim: 768 + pruned_heads: {} + remove_invalid_values: false + repetition_penalty: 1.0 + return_dict: true + return_dict_in_generate: false + sep_token_id: null + task_specific_params: null + tdnn_dilation: + - 1 + - 2 + - 3 + - 1 + - 1 + tdnn_dim: + - 512 + - 512 + - 512 + - 512 + - 1500 + tdnn_kernel: + - 5 + - 3 + - 3 + - 1 + - 1 + temperature: 1.0 + tie_encoder_decoder: false + tie_word_embeddings: true + tokenizer_class: null + top_k: 50 + top_p: 1.0 + torch_dtype: null + torchscript: false + transformers_version: 4.17.0.dev0 + use_bfloat16: false + use_weighted_layer_sum: false + vocab_size: 32 + xvector_output_dim: 512 +encoder_no_repeat_ngram_size: + desc: null + value: 0 +eos_token_id: + desc: null + value: 50256 +eval_accumulation_steps: + desc: null + value: None +eval_batch_size: + desc: null + value: 8 +eval_steps: + desc: null + value: 500 +evaluation_strategy: + desc: null + value: steps +finetuning_task: + desc: null + value: null +forced_bos_token_id: + desc: null + value: null +forced_eos_token_id: + desc: null + value: null +fp16: + desc: null + value: true +fp16_backend: + desc: null + value: auto +fp16_full_eval: + desc: null + value: false +fp16_opt_level: + desc: null + value: O1 +generation_max_length: + desc: null + value: 40 +generation_num_beams: + desc: null + value: 1 +gradient_accumulation_steps: + desc: null + value: 4 +gradient_checkpointing: + desc: null + value: true +greater_is_better: + desc: null + value: None +group_by_length: + desc: null + value: true +half_precision_backend: + desc: null + value: amp +hub_model_id: + desc: null + value: None +hub_strategy: + desc: null + value: every_save +hub_token: + desc: null + value: +id2label: + desc: null + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: null + value: false +is_decoder: + desc: null + value: false +is_encoder_decoder: + desc: null + value: true +label2id: + desc: null + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: null + value: None +label_smoothing_factor: + desc: null + value: 0.0 +learning_rate: + desc: null + value: 1.0e-05 +length_column_name: + desc: null + value: input_length +length_penalty: + desc: null + value: 1.0 +load_best_model_at_end: + desc: null + value: false +local_rank: + desc: null + value: -1 +log_level: + desc: null + value: -1 +log_level_replica: + desc: null + value: -1 +log_on_each_node: + desc: null + value: true +logging_dir: + desc: null + value: ./runs/Feb28_12-27-43_sanchit--v100 +logging_first_step: + desc: null + value: false +logging_nan_inf_filter: + desc: null + value: true +logging_steps: + desc: null + value: 1 +logging_strategy: + desc: null + value: steps +lr_scheduler_type: + desc: null + value: linear +max_grad_norm: + desc: null + value: 1.0 +max_length: + desc: null + value: 50 +max_steps: + desc: null + value: -1 +metric_for_best_model: + desc: null + value: None +min_length: + desc: null + value: 0 +model_type: + desc: null + value: speech-encoder-decoder +mp_parameters: + desc: null + value: '' +no_cuda: + desc: null + value: false +no_repeat_ngram_size: + desc: null + value: 0 +num_beam_groups: + desc: null + value: 1 +num_beams: + desc: null + value: 1 +num_return_sequences: + desc: null + value: 1 +num_train_epochs: + desc: null + value: 1.0 +optim: + desc: null + value: adamw_hf +output_attentions: + desc: null + value: false +output_dir: + desc: null + value: ./ +output_hidden_states: + desc: null + value: false +output_scores: + desc: null + value: false +overwrite_output_dir: + desc: null + value: true +pad_token_id: + desc: null + value: 50256 +past_index: + desc: null + value: -1 +per_device_eval_batch_size: + desc: null + value: 8 +per_device_train_batch_size: + desc: null + value: 8 +per_gpu_eval_batch_size: + desc: null + value: None +per_gpu_train_batch_size: + desc: null + value: None +predict_with_generate: + desc: null + value: true +prediction_loss_only: + desc: null + value: false +prefix: + desc: null + value: null +problem_type: + desc: null + value: null +processor_class: + desc: null + value: Wav2Vec2Processor +pruned_heads: + desc: null + value: {} +push_to_hub: + desc: null + value: true +push_to_hub_model_id: + desc: null + value: None +push_to_hub_organization: + desc: null + value: None +push_to_hub_token: + desc: null + value: +remove_invalid_values: + desc: null + value: false +remove_unused_columns: + desc: null + value: true +repetition_penalty: + desc: null + value: 1.0 +report_to: + desc: null + value: '[''wandb'']' +resume_from_checkpoint: + desc: null + value: None +return_dict: + desc: null + value: true +return_dict_in_generate: + desc: null + value: false +run_name: + desc: null + value: ./ +save_on_each_node: + desc: null + value: false +save_steps: + desc: null + value: 500 +save_strategy: + desc: null + value: steps +save_total_limit: + desc: null + value: 1 +seed: + desc: null + value: 42 +sep_token_id: + desc: null + value: null +sharded_ddp: + desc: null + value: '[]' +skip_memory_metrics: + desc: null + value: true +sortish_sampler: + desc: null + value: false +task_specific_params: + desc: null + value: null +temperature: + desc: null + value: 1.0 +tf32: + desc: null + value: None +tie_encoder_decoder: + desc: null + value: false +tie_word_embeddings: + desc: null + value: false +tokenizer_class: + desc: null + value: null +top_k: + desc: null + value: 50 +top_p: + desc: null + value: 1.0 +torch_dtype: + desc: null + value: torch.float32 +torchscript: + desc: null + value: false +tpu_metrics_debug: + desc: null + value: false +tpu_num_cores: + desc: null + value: None +train_batch_size: + desc: null + value: 8 +transformers_version: + desc: null + value: null +use_bfloat16: + desc: null + value: false +use_cache: + desc: null + value: false +use_legacy_prediction_loop: + desc: null + value: false +warmup_ratio: + desc: null + value: 0.0 +warmup_steps: + desc: null + value: 500 +weight_decay: + desc: null + value: 0.0 +xpu_backend: + desc: null + value: None