diff --git "a/wandb/run-20220325_193848-1sz5964i/files/config.yaml" "b/wandb/run-20220325_193848-1sz5964i/files/config.yaml" new file mode 100644--- /dev/null +++ "b/wandb/run-20220325_193848-1sz5964i/files/config.yaml" @@ -0,0 +1,9340 @@ +wandb_version: 1 + +_n_gpu: + desc: null + value: 1 +_name_or_path: + desc: null + value: ./ +_wandb: + desc: null + value: + cli_version: 0.12.10 + framework: huggingface + huggingface_version: 4.17.0.dev0 + is_jupyter_run: false + is_kaggle_kernel: false + m: + - 1: train/global_step + 6: + - 3 + - 1: train/loss + 5: 1 + 6: + - 1 + - 1: train/learning_rate + 5: 1 + 6: + - 1 + - 1: train/epoch + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.8\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.7\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.6\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.5\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.4\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.3\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.2\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.1\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_positions\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_positions\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_positions\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_tokens\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_tokens\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_tokens\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.masked_spec_embed._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.masked_spec_embed.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.masked_spec_embed.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.projection\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.feature_projection\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: eval/loss + 5: 1 + 6: + - 1 + - 1: eval/wer + 5: 1 + 6: + - 1 + - 1: eval/runtime + 5: 1 + 6: + - 1 + - 1: eval/samples_per_second + 5: 1 + 6: + - 1 + - 1: eval/steps_per_second + 5: 1 + 6: + - 1 + python_version: 3.9.5 + start_time: 1648237128 + t: + 1: + - 1 + - 5 + - 11 + 3: + - 13 + 4: 3.9.5 + 5: 0.12.10 + 6: 4.17.0.dev0 + 8: + - 5 +adafactor: + desc: null + value: false +adam_beta1: + desc: null + value: 0.9 +adam_beta2: + desc: null + value: 0.999 +adam_epsilon: + desc: null + value: 1.0e-08 +add_cross_attention: + desc: null + value: false +architectures: + desc: null + value: + - SpeechEncoderDecoderModel +bad_words_ids: + desc: null + value: null +bf16: + desc: null + value: false +bf16_full_eval: + desc: null + value: false +bos_token_id: + desc: null + value: null +chunk_size_feed_forward: + desc: null + value: 0 +cross_attention_hidden_size: + desc: null + value: null +dataloader_drop_last: + desc: null + value: false +dataloader_num_workers: + desc: null + value: 0 +dataloader_pin_memory: + desc: null + value: true +ddp_bucket_cap_mb: + desc: null + value: None +ddp_find_unused_parameters: + desc: null + value: None +debug: + desc: null + value: '[]' +decoder: + desc: null + value: + _name_or_path: facebook/bart-large-cnn + _num_labels: 3 + activation_dropout: 0.0 + activation_function: gelu + add_cross_attention: true + add_final_layer_norm: false + architectures: + - BartForConditionalGeneration + attention_dropout: 0.0 + bad_words_ids: null + bos_token_id: 0 + chunk_size_feed_forward: 0 + classif_dropout: 0.0 + classifier_dropout: 0.0 + cross_attention_hidden_size: null + d_model: 1024 + decoder_attention_heads: 16 + decoder_ffn_dim: 4096 + decoder_layerdrop: 0.0 + decoder_layers: 12 + decoder_start_token_id: 2 + diversity_penalty: 0.0 + do_sample: false + dropout: 0.1 + early_stopping: true + encoder_attention_heads: 16 + encoder_ffn_dim: 4096 + encoder_layerdrop: 0.0 + encoder_layers: 12 + encoder_no_repeat_ngram_size: 0 + eos_token_id: 2 + finetuning_task: null + force_bos_token_to_be_generated: true + forced_bos_token_id: 0 + forced_eos_token_id: 2 + gradient_checkpointing: false + id2label: + '0': LABEL_0 + '1': LABEL_1 + '2': LABEL_2 + init_std: 0.02 + is_decoder: true + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + LABEL_2: 2 + length_penalty: 2.0 + max_length: 142 + max_position_embeddings: 1024 + min_length: 56 + model_type: bart + no_repeat_ngram_size: 3 + normalize_before: false + num_beam_groups: 1 + num_beams: 4 + num_hidden_layers: 12 + num_return_sequences: 1 + output_attentions: false + output_hidden_states: false + output_past: true + output_scores: false + pad_token_id: 1 + prefix: ' ' + problem_type: null + pruned_heads: {} + remove_invalid_values: false + repetition_penalty: 1.0 + return_dict: true + return_dict_in_generate: false + scale_embedding: false + sep_token_id: null + task_specific_params: + summarization: + early_stopping: true + length_penalty: 2.0 + max_length: 142 + min_length: 56 + no_repeat_ngram_size: 3 + num_beams: 4 + temperature: 1.0 + tie_encoder_decoder: false + tie_word_embeddings: true + tokenizer_class: null + top_k: 50 + top_p: 1.0 + torch_dtype: null + torchscript: false + transformers_version: 4.17.0.dev0 + use_bfloat16: false + use_cache: true + vocab_size: 50264 +decoder_start_token_id: + desc: null + value: 0 +deepspeed: + desc: null + value: None +disable_tqdm: + desc: null + value: false +diversity_penalty: + desc: null + value: 0.0 +do_eval: + desc: null + value: true +do_predict: + desc: null + value: false +do_sample: + desc: null + value: false +do_train: + desc: null + value: true +early_stopping: + desc: null + value: false +encoder: + desc: null + value: + _name_or_path: facebook/wav2vec2-large-lv60 + activation_dropout: 0.1 + adapter_kernel_size: 3 + adapter_stride: 2 + add_adapter: true + add_cross_attention: false + apply_spec_augment: true + architectures: + - Wav2Vec2ForPreTraining + attention_dropout: 0.1 + bad_words_ids: null + bos_token_id: 1 + chunk_size_feed_forward: 0 + classifier_proj_size: 256 + codevector_dim: 768 + contrastive_logits_temperature: 0.1 + conv_bias: true + conv_dim: + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + conv_kernel: + - 10 + - 3 + - 3 + - 3 + - 3 + - 2 + - 2 + conv_stride: + - 5 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + cross_attention_hidden_size: null + ctc_loss_reduction: sum + ctc_zero_infinity: false + decoder_start_token_id: null + diversity_loss_weight: 0.1 + diversity_penalty: 0.0 + do_sample: false + do_stable_layer_norm: true + early_stopping: false + encoder_no_repeat_ngram_size: 0 + eos_token_id: 2 + feat_extract_activation: gelu + feat_extract_dropout: 0.0 + feat_extract_norm: layer + feat_proj_dropout: 0.0 + feat_quantizer_dropout: 0.0 + final_dropout: 0.0 + finetuning_task: null + forced_bos_token_id: null + forced_eos_token_id: null + gradient_checkpointing: false + hidden_act: gelu + hidden_dropout: 0.1 + hidden_dropout_prob: 0.1 + hidden_size: 1024 + id2label: + '0': LABEL_0 + '1': LABEL_1 + initializer_range: 0.02 + intermediate_size: 4096 + is_decoder: false + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + layer_norm_eps: 1.0e-05 + layerdrop: 0.0 + length_penalty: 1.0 + mask_feature_length: 10 + mask_feature_min_masks: 0 + mask_feature_prob: 0.0 + mask_time_length: 10 + mask_time_min_masks: 2 + mask_time_prob: 0.1 + max_length: 20 + min_length: 0 + model_type: wav2vec2 + no_repeat_ngram_size: 0 + num_adapter_layers: 3 + num_attention_heads: 16 + num_beam_groups: 1 + num_beams: 1 + num_codevector_groups: 2 + num_codevectors_per_group: 320 + num_conv_pos_embedding_groups: 16 + num_conv_pos_embeddings: 128 + num_feat_extract_layers: 7 + num_hidden_layers: 24 + num_negatives: 100 + num_return_sequences: 1 + output_attentions: false + output_hidden_size: 1024 + output_hidden_states: false + output_scores: false + pad_token_id: 0 + prefix: null + problem_type: null + proj_codevector_dim: 768 + pruned_heads: {} + remove_invalid_values: false + repetition_penalty: 1.0 + return_dict: true + return_dict_in_generate: false + sep_token_id: null + task_specific_params: null + tdnn_dilation: + - 1 + - 2 + - 3 + - 1 + - 1 + tdnn_dim: + - 512 + - 512 + - 512 + - 512 + - 1500 + tdnn_kernel: + - 5 + - 3 + - 3 + - 1 + - 1 + temperature: 1.0 + tie_encoder_decoder: false + tie_word_embeddings: true + tokenizer_class: null + top_k: 50 + top_p: 1.0 + torch_dtype: null + torchscript: false + transformers_version: 4.17.0.dev0 + use_bfloat16: false + use_weighted_layer_sum: false + vocab_size: 32 + xvector_output_dim: 512 +encoder_no_repeat_ngram_size: + desc: null + value: 0 +eos_token_id: + desc: null + value: 2 +eval_accumulation_steps: + desc: null + value: None +eval_batch_size: + desc: null + value: 8 +eval_steps: + desc: null + value: 500 +evaluation_strategy: + desc: null + value: steps +finetuning_task: + desc: null + value: null +forced_bos_token_id: + desc: null + value: null +forced_eos_token_id: + desc: null + value: null +fp16: + desc: null + value: true +fp16_backend: + desc: null + value: auto +fp16_full_eval: + desc: null + value: false +fp16_opt_level: + desc: null + value: O1 +generation_max_length: + desc: null + value: 40 +generation_num_beams: + desc: null + value: 1 +gradient_accumulation_steps: + desc: null + value: 16 +gradient_checkpointing: + desc: null + value: true +greater_is_better: + desc: null + value: None +group_by_length: + desc: null + value: true +half_precision_backend: + desc: null + value: amp +hub_model_id: + desc: null + value: None +hub_strategy: + desc: null + value: every_save +hub_token: + desc: null + value: +id2label: + desc: null + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: null + value: false +is_decoder: + desc: null + value: false +is_encoder_decoder: + desc: null + value: true +label2id: + desc: null + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: null + value: None +label_smoothing_factor: + desc: null + value: 0.0 +learning_rate: + desc: null + value: 0.0003 +length_column_name: + desc: null + value: input_length +length_penalty: + desc: null + value: 1.0 +load_best_model_at_end: + desc: null + value: false +local_rank: + desc: null + value: -1 +log_level: + desc: null + value: -1 +log_level_replica: + desc: null + value: -1 +log_on_each_node: + desc: null + value: true +logging_dir: + desc: null + value: ./runs/Mar25_19-38-20_sanchit--v100 +logging_first_step: + desc: null + value: false +logging_nan_inf_filter: + desc: null + value: true +logging_steps: + desc: null + value: 1 +logging_strategy: + desc: null + value: steps +lr_scheduler_type: + desc: null + value: linear +max_grad_norm: + desc: null + value: 1.0 +max_length: + desc: null + value: 40 +max_steps: + desc: null + value: -1 +metric_for_best_model: + desc: null + value: None +min_length: + desc: null + value: 0 +model_type: + desc: null + value: speech-encoder-decoder +mp_parameters: + desc: null + value: '' +no_cuda: + desc: null + value: false +no_repeat_ngram_size: + desc: null + value: 0 +num_beam_groups: + desc: null + value: 1 +num_beams: + desc: null + value: 5 +num_return_sequences: + desc: null + value: 1 +num_train_epochs: + desc: null + value: 5.0 +optim: + desc: null + value: adamw_hf +output_attentions: + desc: null + value: false +output_dir: + desc: null + value: ./ +output_hidden_states: + desc: null + value: false +output_scores: + desc: null + value: false +overwrite_output_dir: + desc: null + value: true +pad_token_id: + desc: null + value: 1 +past_index: + desc: null + value: -1 +per_device_eval_batch_size: + desc: null + value: 8 +per_device_train_batch_size: + desc: null + value: 8 +per_gpu_eval_batch_size: + desc: null + value: None +per_gpu_train_batch_size: + desc: null + value: None +predict_with_generate: + desc: null + value: true +prediction_loss_only: + desc: null + value: false +prefix: + desc: null + value: null +problem_type: + desc: null + value: null +processor_class: + desc: null + value: Wav2Vec2Processor +pruned_heads: + desc: null + value: {} +push_to_hub: + desc: null + value: true +push_to_hub_model_id: + desc: null + value: None +push_to_hub_organization: + desc: null + value: None +push_to_hub_token: + desc: null + value: +remove_invalid_values: + desc: null + value: false +remove_unused_columns: + desc: null + value: true +repetition_penalty: + desc: null + value: 1.0 +report_to: + desc: null + value: '[''tensorboard'', ''wandb'']' +resume_from_checkpoint: + desc: null + value: None +return_dict: + desc: null + value: true +return_dict_in_generate: + desc: null + value: false +run_name: + desc: null + value: ./ +save_on_each_node: + desc: null + value: false +save_steps: + desc: null + value: 500 +save_strategy: + desc: null + value: steps +save_total_limit: + desc: null + value: 1 +seed: + desc: null + value: 42 +sep_token_id: + desc: null + value: null +sharded_ddp: + desc: null + value: '[]' +skip_memory_metrics: + desc: null + value: true +sortish_sampler: + desc: null + value: false +task_specific_params: + desc: null + value: null +temperature: + desc: null + value: 1.0 +tf32: + desc: null + value: None +tie_encoder_decoder: + desc: null + value: false +tie_word_embeddings: + desc: null + value: false +tokenizer_class: + desc: null + value: null +top_k: + desc: null + value: 50 +top_p: + desc: null + value: 1.0 +torch_dtype: + desc: null + value: torch.float32 +torchscript: + desc: null + value: false +tpu_metrics_debug: + desc: null + value: false +tpu_num_cores: + desc: null + value: None +train_batch_size: + desc: null + value: 8 +transformers_version: + desc: null + value: null +use_bfloat16: + desc: null + value: false +use_cache: + desc: null + value: false +use_legacy_prediction_loop: + desc: null + value: false +warmup_ratio: + desc: null + value: 0.0 +warmup_steps: + desc: null + value: 500 +weight_decay: + desc: null + value: 0.0 +xpu_backend: + desc: null + value: None