MoLM-700M-8B / pytorch_model.bin.index.json
YikangS's picture
First model version
2d4540e
{
"metadata": {
"total_size": 16515088384
},
"weight_map": {
"lm_head.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.0.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.0.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.1.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.10.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.11.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.12.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.13.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.14.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.15.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.16.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.17.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.18.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.19.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.2.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.20.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.21.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.22.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.23.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.24.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.25.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.26.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.27.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.28.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.29.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.29.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.29.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.29.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.29.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.29.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.29.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.3.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.3.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.30.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.30.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.31.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.32.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.33.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.34.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.35.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.36.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.37.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.38.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.39.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.4.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.4.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.40.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.40.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.41.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.42.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.43.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.44.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.45.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.46.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.attn.cum_weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.attn.mask": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.attn.q_proj.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.attn.q_proj.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.ln_1.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.ln_1.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.ln_2.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.ln_2.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.mlpf.experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.mlpf.gate.w_gate.0.bias": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.mlpf.gate.w_gate.0.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.mlpf.gate.w_gate.3.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.47.mlpf.output_experts.weight": "pytorch_model-00002-of-00002.bin",
"transformer.h.5.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.5.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.6.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.7.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.8.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.attn.cum_weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.attn.mask": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.attn.q_proj.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.attn.q_proj.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.attn.q_proj.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.attn.q_proj.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.attn.q_proj.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.ln_1.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.ln_2.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.mlpf.experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.mlpf.gate.w_gate.0.bias": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.mlpf.gate.w_gate.0.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.mlpf.gate.w_gate.3.weight": "pytorch_model-00001-of-00002.bin",
"transformer.h.9.mlpf.output_experts.weight": "pytorch_model-00001-of-00002.bin",
"transformer.ln_f.bias": "pytorch_model-00002-of-00002.bin",
"transformer.ln_f.weight": "pytorch_model-00002-of-00002.bin",
"transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
}
}