thiomajid committed on
Commit
a66aa1e
·
verified ·
1 Parent(s): 0d86300

Training in progress, step 100

Browse files
Files changed (3) hide show
  1. config.json +7 -7
  2. model.safetensors +2 -2
  3. training_args.bin +1 -1
config.json CHANGED
@@ -5,7 +5,7 @@
5
  "model_type": "xlstm",
6
  "pad_token_id": 151643,
7
  "torch_dtype": "float32",
8
- "transformers_version": "4.47.0",
9
  "xlstm_cfg": {
10
  "_block_map": "1,0,1,0,1,0",
11
  "add_embedding_dropout": false,
@@ -13,19 +13,19 @@
13
  "bias": false,
14
  "context_length": 512,
15
  "dropout": 0.0,
16
- "embedding_dim": 896,
17
  "mlstm_block": {
18
  "_block_idx": null,
19
  "_num_blocks": 6,
20
  "mlstm": {
21
- "_inner_embedding_dim": 1792,
22
  "_num_blocks": 6,
23
- "_proj_up_dim": 1792,
24
  "bias": false,
25
  "context_length": 512,
26
  "conv1d_kernel_size": 4,
27
  "dropout": 0.0,
28
- "embedding_dim": 896,
29
  "num_heads": 16,
30
  "proj_factor": 2.0,
31
  "qkv_proj_blocksize": 32,
@@ -70,14 +70,14 @@
70
  "dtype_r": "bfloat16",
71
  "dtype_s": "bfloat16",
72
  "dtype_w": "bfloat16",
73
- "embedding_dim": 896,
74
  "enable_automatic_mixed_precision": true,
75
  "forward_clipval": null,
76
  "function": "slstm",
77
  "gradient_recurrent_clipval": null,
78
  "gradient_recurrent_cut": false,
79
  "group_norm_weight": true,
80
- "hidden_size": 896,
81
  "initial_val": 0.0,
82
  "input_shape": "BSGNH",
83
  "internal_input_shape": "SBNGH",
 
5
  "model_type": "xlstm",
6
  "pad_token_id": 151643,
7
  "torch_dtype": "float32",
8
+ "transformers_version": "4.47.1",
9
  "xlstm_cfg": {
10
  "_block_map": "1,0,1,0,1,0",
11
  "add_embedding_dropout": false,
 
13
  "bias": false,
14
  "context_length": 512,
15
  "dropout": 0.0,
16
+ "embedding_dim": 1536,
17
  "mlstm_block": {
18
  "_block_idx": null,
19
  "_num_blocks": 6,
20
  "mlstm": {
21
+ "_inner_embedding_dim": 3072,
22
  "_num_blocks": 6,
23
+ "_proj_up_dim": 3072,
24
  "bias": false,
25
  "context_length": 512,
26
  "conv1d_kernel_size": 4,
27
  "dropout": 0.0,
28
+ "embedding_dim": 1536,
29
  "num_heads": 16,
30
  "proj_factor": 2.0,
31
  "qkv_proj_blocksize": 32,
 
70
  "dtype_r": "bfloat16",
71
  "dtype_s": "bfloat16",
72
  "dtype_w": "bfloat16",
73
+ "embedding_dim": 1536,
74
  "enable_automatic_mixed_precision": true,
75
  "forward_clipval": null,
76
  "function": "slstm",
77
  "gradient_recurrent_clipval": null,
78
  "gradient_recurrent_cut": false,
79
  "group_norm_weight": true,
80
+ "hidden_size": 1536,
81
  "initial_val": 0.0,
82
  "input_shape": "BSGNH",
83
  "internal_input_shape": "SBNGH",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14cb0d529e133417c713de6a2e5940e6ae10b9d52f8aecf6f5ee024dcf12a716
3
- size 1205675536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb464cae802ff426ddc9d8c1d7c65397362f6a3beb1a388450de538fa85ae06d
3
+ size 2203703856
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5116cf1f58f4f76d8917953ec2e6ea1f47662dd090bb5f003c1951ff5bc2d50
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96653b90ef5c8a1a6f8678ce98752b85d28dbfd1477d9efa92ece25be0e3c0e8
3
  size 5944