thiomajid
/

fineweb-distil-xlstm-v2

@@ -5,7 +5,7 @@
   "model_type": "xlstm",
   "pad_token_id": 151643,
   "torch_dtype": "float32",
-  "transformers_version": "4.47.0",
   "xlstm_cfg": {
     "_block_map": "1,0,1,0,1,0",
     "add_embedding_dropout": false,
@@ -13,19 +13,19 @@
     "bias": false,
     "context_length": 512,
     "dropout": 0.0,
-    "embedding_dim": 896,
     "mlstm_block": {
       "_block_idx": null,
       "_num_blocks": 6,
       "mlstm": {
-        "_inner_embedding_dim": 1792,
         "_num_blocks": 6,
-        "_proj_up_dim": 1792,
         "bias": false,
         "context_length": 512,
         "conv1d_kernel_size": 4,
         "dropout": 0.0,
-        "embedding_dim": 896,
         "num_heads": 16,
         "proj_factor": 2.0,
         "qkv_proj_blocksize": 32,
@@ -70,14 +70,14 @@
         "dtype_r": "bfloat16",
         "dtype_s": "bfloat16",
         "dtype_w": "bfloat16",
-        "embedding_dim": 896,
         "enable_automatic_mixed_precision": true,
         "forward_clipval": null,
         "function": "slstm",
         "gradient_recurrent_clipval": null,
         "gradient_recurrent_cut": false,
         "group_norm_weight": true,
-        "hidden_size": 896,
         "initial_val": 0.0,
         "input_shape": "BSGNH",
         "internal_input_shape": "SBNGH",

   "model_type": "xlstm",
   "pad_token_id": 151643,
   "torch_dtype": "float32",
+  "transformers_version": "4.47.1",
   "xlstm_cfg": {
     "_block_map": "1,0,1,0,1,0",
     "add_embedding_dropout": false,
     "bias": false,
     "context_length": 512,
     "dropout": 0.0,
+    "embedding_dim": 1536,
     "mlstm_block": {
       "_block_idx": null,
       "_num_blocks": 6,
       "mlstm": {
+        "_inner_embedding_dim": 3072,
         "_num_blocks": 6,
+        "_proj_up_dim": 3072,
         "bias": false,
         "context_length": 512,
         "conv1d_kernel_size": 4,
         "dropout": 0.0,
+        "embedding_dim": 1536,
         "num_heads": 16,
         "proj_factor": 2.0,
         "qkv_proj_blocksize": 32,
         "dtype_r": "bfloat16",
         "dtype_s": "bfloat16",
         "dtype_w": "bfloat16",
+        "embedding_dim": 1536,
         "enable_automatic_mixed_precision": true,
         "forward_clipval": null,
         "function": "slstm",
         "gradient_recurrent_clipval": null,
         "gradient_recurrent_cut": false,
         "group_norm_weight": true,
+        "hidden_size": 1536,
         "initial_val": 0.0,
         "input_shape": "BSGNH",
         "internal_input_shape": "SBNGH",

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:14cb0d529e133417c713de6a2e5940e6ae10b9d52f8aecf6f5ee024dcf12a716
-size 1205675536

 version https://git-lfs.github.com/spec/v1
+oid sha256:eb464cae802ff426ddc9d8c1d7c65397362f6a3beb1a388450de538fa85ae06d
+size 2203703856

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a5116cf1f58f4f76d8917953ec2e6ea1f47662dd090bb5f003c1951ff5bc2d50
 size 5944

 version https://git-lfs.github.com/spec/v1
+oid sha256:96653b90ef5c8a1a6f8678ce98752b85d28dbfd1477d9efa92ece25be0e3c0e8
 size 5944