Upload LeanAlbertForPreTraining

Browse files

Files changed (6) hide show

config.json +42 -0
pytorch_model-00001-of-00004.bin +3 -0
pytorch_model-00002-of-00004.bin +3 -0
pytorch_model-00003-of-00004.bin +3 -0
pytorch_model-00004-of-00004.bin +3 -0
pytorch_model.bin.index.json +410 -0

config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "adapter_dim": 0,
+  "architectures": [
+    "LeanAlbertForPreTraining"
+  ],
+  "attention_probs_dropout_prob": 0,
+  "attn_qkv_bias": true,
+  "block_size": 0,
+  "bos_token_id": 2,
+  "classifier_dropout_prob": 0.1,
+  "embedding_size": 128,
+  "eos_token_id": 3,
+  "hidden_act": "gelu_fused",
+  "hidden_act_gated": true,
+  "hidden_dropout_prob": 0,
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "inner_group_num": 1,
+  "intermediate_size": 16384,
+  "layer_norm_eps": 1e-12,
+  "lowrank_dim": 0,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 64,
+  "num_hidden_groups": 32,
+  "num_hidden_layers": 32,
+  "num_inner_groups": 1,
+  "num_inner_matrices": 1,
+  "num_memory_blocks": 0,
+  "num_shared_matrices": 1,
+  "out_proj_bias": true,
+  "pad_token_id": 0,
+  "position_embedding_type": "rotary",
+  "rotary_embedding_base": 10000,
+  "sandwich_norm": false,
+  "share_large_matrices": true,
+  "torch_dtype": "float32",
+  "total_num_layer_groups": 32,
+  "total_shared_matrix_sets": 1,
+  "transformers_version": "4.24.0",
+  "type_vocab_size": 2,
+  "vocab_size": 50005
+}

pytorch_model-00001-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20855d68e88294c3a3c16ba6a79e8373509cceb85c0a5396adc80bfb188a3037
+size 1104232557

pytorch_model-00002-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14405235c03af56e11049a02633e0cc113b29e7ddaa70402c05a24152ab87a53
+size 1076280999

pytorch_model-00003-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7c671915cdd977db5ca86852e249483958c8e8032de06487c23609df63a15f1
+size 1076280999

pytorch_model-00004-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:303cd9f0601ab7774d7c2eb7b3f903408035ed6afa800786380c8138b1c97484
+size 1170017317

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,410 @@

+{
+  "metadata": {
+    "total_size": 34491661488
+  },
+  "weight_map": {
+    "albert.embeddings.embedding_hidden_mapping.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.embeddings.embedding_hidden_mapping.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.embeddings.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.embeddings.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.embeddings.token_type_embeddings.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.embeddings.word_embeddings.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.pooler.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.pooler.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.0.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.1.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.10.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.11.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.12.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.13.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.14.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.15.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.16.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.17.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.attention.dense_out.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.attention.dense_qkv.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.attention.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.attention.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.ffn.dense_i2h.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.18.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.19.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.2.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.20.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.21.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.22.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.23.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.24.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.25.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.ffn.dense_h2o.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.ffn.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.26.layers.0.ffn.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.attention.dense_out.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.attention.dense_qkv.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.attention.layer_norm.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.attention.layer_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.ffn.dense_h2o.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.ffn.dense_i2h.bias": "pytorch_model-00003-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.ffn.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.27.layers.0.ffn.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.attention.dense_out.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.attention.dense_qkv.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.attention.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.attention.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.ffn.dense_h2o.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.ffn.dense_i2h.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.ffn.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.28.layers.0.ffn.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.attention.dense_out.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.attention.dense_qkv.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.attention.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.attention.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.ffn.dense_h2o.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.ffn.dense_i2h.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.ffn.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.29.layers.0.ffn.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.3.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.attention.dense_out.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.attention.dense_qkv.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.attention.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.attention.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.ffn.dense_h2o.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.ffn.dense_i2h.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.ffn.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.30.layers.0.ffn.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.attention.dense_out.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.attention.dense_qkv.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.attention.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.attention.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.ffn.dense_h2o.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.ffn.dense_i2h.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.ffn.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.31.layers.0.ffn.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.4.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.5.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.6.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.7.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.ffn.dense_h2o.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.ffn.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.8.layers.0.ffn.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.attention.dense_out.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.attention.dense_out.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.attention.dense_qkv.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.attention.dense_qkv.shared_matrix.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.attention.layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.attention.layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.ffn.dense_h2o.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.ffn.dense_h2o.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.ffn.dense_i2h.bias": "pytorch_model-00001-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.ffn.dense_i2h.shared_matrix.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.ffn.layer_norm.bias": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.layer_groups.9.layers.0.ffn.layer_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "albert.transformer.post_layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "albert.transformer.post_layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "predictions.bias": "pytorch_model-00004-of-00004.bin",
+    "predictions.decoder.bias": "pytorch_model-00004-of-00004.bin",
+    "predictions.decoder.weight": "pytorch_model-00004-of-00004.bin",
+    "predictions.dense.bias": "pytorch_model-00004-of-00004.bin",
+    "predictions.dense.weight": "pytorch_model-00004-of-00004.bin",
+    "predictions.layer_norm.bias": "pytorch_model-00004-of-00004.bin",
+    "predictions.layer_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "sop_classifier.classifier.bias": "pytorch_model-00004-of-00004.bin",
+    "sop_classifier.classifier.weight": "pytorch_model-00004-of-00004.bin"
+  }
+}