kaizerBox committed on
Commit
3a2238a
1 Parent(s): 862d641

retnet-xsum_model_70k-28_1M

Browse files
README.md CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [](https://huggingface.co/) on the xsum dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 4.5288
19
 
20
  ## Model description
21
 
@@ -35,31 +35,29 @@ More information needed
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 0.0006
38
- - train_batch_size: 1
39
- - eval_batch_size: 1
40
  - seed: 42
41
  - gradient_accumulation_steps: 4
42
- - total_train_batch_size: 4
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: cosine
45
  - lr_scheduler_warmup_steps: 10
46
- - num_epochs: 10
47
  - mixed_precision_training: Native AMP
48
 
49
  ### Training results
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:-----:|:---------------:|
53
- | 5.5013 | 1.0 | 2500 | 4.8933 |
54
- | 4.5772 | 2.0 | 5000 | 4.5701 |
55
- | 4.1583 | 3.0 | 7500 | 4.4502 |
56
- | 3.8363 | 4.0 | 10000 | 4.3856 |
57
- | 3.5506 | 5.0 | 12500 | 4.3816 |
58
- | 3.2924 | 6.0 | 15000 | 4.4053 |
59
- | 3.0678 | 7.0 | 17500 | 4.4400 |
60
- | 2.8893 | 8.0 | 20000 | 4.4869 |
61
- | 2.7668 | 9.0 | 22500 | 4.5172 |
62
- | 2.7036 | 10.0 | 25000 | 4.5288 |
63
 
64
 
65
  ### Framework versions
 
15
 
16
  This model is a fine-tuned version of [](https://huggingface.co/) on the xsum dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 4.0200
19
 
20
  ## Model description
21
 
 
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 0.0006
38
+ - train_batch_size: 8
39
+ - eval_batch_size: 8
40
  - seed: 42
41
  - gradient_accumulation_steps: 4
42
+ - total_train_batch_size: 32
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: cosine
45
  - lr_scheduler_warmup_steps: 10
46
+ - num_epochs: 8
47
  - mixed_precision_training: Native AMP
48
 
49
  ### Training results
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:-----:|:---------------:|
53
+ | 5.3257 | 1.0 | 2187 | 4.6412 |
54
+ | 4.5863 | 2.0 | 4375 | 4.3474 |
55
+ | 4.3703 | 3.0 | 6562 | 4.2111 |
56
+ | 4.2404 | 4.0 | 8750 | 4.1213 |
57
+ | 4.1568 | 5.0 | 10937 | 4.0673 |
58
+ | 4.0975 | 6.0 | 13125 | 4.0371 |
59
+ | 4.0618 | 7.0 | 15312 | 4.0219 |
60
+ | 4.045 | 8.0 | 17496 | 4.0200 |
 
 
61
 
62
 
63
  ### Framework versions
config.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
- "activation_dropout": 0.0,
3
  "activation_fn": "swish",
4
  "architectures": [
5
  "RetNetForCausalLM"
6
  ],
7
- "decoder_embed_dim": 512,
8
- "decoder_ffn_embed_dim": 864,
9
- "decoder_layers": 4,
10
  "decoder_normalize_before": true,
11
  "decoder_retention_heads": 2,
12
- "decoder_value_embed_dim": 864,
13
  "deepnorm": false,
14
- "drop_path_rate": 0.0,
15
  "dropout": 0.1,
16
  "eos_token_id": 50256,
17
  "forward_impl": "parallel",
@@ -23,7 +23,7 @@
23
  "no_scale_embedding": false,
24
  "output_retentions": false,
25
  "pad_token_id": 50257,
26
- "recurrent_chunk_size": 512,
27
  "subln": true,
28
  "tie_word_embeddings": false,
29
  "torch_dtype": "float32",
 
1
  {
2
+ "activation_dropout": 0.1,
3
  "activation_fn": "swish",
4
  "architectures": [
5
  "RetNetForCausalLM"
6
  ],
7
+ "decoder_embed_dim": 256,
8
+ "decoder_ffn_embed_dim": 432,
9
+ "decoder_layers": 3,
10
  "decoder_normalize_before": true,
11
  "decoder_retention_heads": 2,
12
+ "decoder_value_embed_dim": 432,
13
  "deepnorm": false,
14
+ "drop_path_rate": 0.1,
15
  "dropout": 0.1,
16
  "eos_token_id": 50256,
17
  "forward_impl": "parallel",
 
23
  "no_scale_embedding": false,
24
  "output_retentions": false,
25
  "pad_token_id": 50257,
26
+ "recurrent_chunk_size": 256,
27
  "subln": true,
28
  "tie_word_embeddings": false,
29
  "torch_dtype": "float32",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26e8b5ce8a75937cbe2bfedd62db58c48a622baaf6948a10dfc5afd7e4d65e2c
3
- size 256743288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec44e3271173c6e75a66acc989aa19a83850d775155a5a968a248b07ee0cd3da
3
+ size 112478520
runs/Nov03_22-11-48_cbf29f69c7ee/events.out.tfevents.1699049509.cbf29f69c7ee.2454.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c8681c92acdca48d5c00bced2a5e295a7e8f1e94892319125a30623875b8a0d
3
+ size 8207
runs/Nov03_22-11-48_cbf29f69c7ee/events.out.tfevents.1699067684.cbf29f69c7ee.2454.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0462915650419b4b9e2fe38a407d605f91531f643c1f577bc213d80f880a3cf
3
+ size 364
tokenizer.json CHANGED
@@ -1,7 +1,16 @@
1
  {
2
  "version": "1.0",
3
  "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 50256,
 
1
  {
2
  "version": "1.0",
3
  "truncation": null,
4
+ "padding": {
5
+ "strategy": {
6
+ "Fixed": 1024
7
+ },
8
+ "direction": "Right",
9
+ "pad_to_multiple_of": null,
10
+ "pad_id": 50257,
11
+ "pad_type_id": 0,
12
+ "pad_token": "<|pad|>"
13
+ },
14
  "added_tokens": [
15
  {
16
  "id": 50256,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f079f397e15b5a36b5952f2298243ba98bc70f542dda8e1bebe1c8869c260cc
3
  size 4536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:011be80e171330f986658f62208876e42ea8718c3e05fb87230fcb064c1711b8
3
  size 4536