wzhouad committed on
Commit
99a462b
1 Parent(s): 40ca381

Model save

README.md CHANGED
@@ -35,14 +35,14 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 5e-07
-- train_batch_size: 8
-- eval_batch_size: 8
-- seed: 1
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
 - distributed_type: multi-GPU
 - num_devices: 8
-- gradient_accumulation_steps: 2
+- gradient_accumulation_steps: 16
 - total_train_batch_size: 128
-- total_eval_batch_size: 64
+- total_eval_batch_size: 8
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
@@ -54,7 +54,7 @@ The following hyperparameters were used during training:
 
 ### Framework versions
 
-- Transformers 4.35.2
+- Transformers 4.41.1
 - Pytorch 2.1.2+cu121
 - Datasets 2.14.6
-- Tokenizers 0.14.1
+- Tokenizers 0.19.1
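The new per-device settings still multiply out to the same effective batch: 1 per device x 16 accumulation steps x 8 GPUs = 128, matching the unchanged total_train_batch_size. A minimal sketch of TrainingArguments matching the updated card, assuming the stock transformers API (the repo's actual training script is not part of this commit; output_dir is a placeholder, and transformers' default optimizer is AdamW rather than plain Adam):

from transformers import TrainingArguments

# Effective train batch: 1 per device x 16 accumulation steps x 8 GPUs = 128.
args = TrainingArguments(
    output_dir="out",                # placeholder path
    learning_rate=5e-7,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    seed=42,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=1,              # implied by the reported epoch of ~1.0
)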
all_results.json CHANGED
@@ -1,8 +1,9 @@
 {
-    "epoch": 1.0,
-    "train_loss": 0.29227719434256694,
-    "train_runtime": 2368.6921,
-    "train_samples": 39494,
-    "train_samples_per_second": 16.673,
-    "train_steps_per_second": 0.13
+    "epoch": 0.9998009950248756,
+    "total_flos": 0.0,
+    "train_loss": 0.017644011282071378,
+    "train_runtime": 64691.5343,
+    "train_samples": 160800,
+    "train_samples_per_second": 2.486,
+    "train_steps_per_second": 0.019
 }
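The new throughput numbers are self-consistent (a quick arithmetic check, not taken from the repo):

# 160800 samples over 64691.5343 s of training:
print(160800 / 64691.5343)        # ~2.486 samples/s, as reported
print(160800 / 64691.5343 / 128)  # ~0.019 steps/s at the effective batch of 128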
config.json CHANGED
@@ -3,6 +3,7 @@
   "architectures": [
     "MistralForCausalLM"
   ],
+  "attention_dropout": 0.0,
   "bos_token_id": 1,
   "eos_token_id": 2,
   "hidden_act": "silu",
@@ -19,7 +20,7 @@
   "sliding_window": 4096,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.35.2",
+  "transformers_version": "4.41.1",
   "use_cache": false,
   "vocab_size": 32000
 }
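Both additions come from re-serializing the config under the newer library: 4.41.x's MistralConfig writes an explicit attention_dropout and stamps its own version string. A sketch of reading the updated config back, assuming the file sits in the working directory:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(".")  # directory containing config.json
print(cfg.attention_dropout)           # 0.0
print(cfg.transformers_version)        # "4.41.1"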
generation_config.json CHANGED
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 1,
   "eos_token_id": 2,
-  "transformers_version": "4.35.2"
+  "transformers_version": "4.41.1"
 }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6c2ba5f0a961585b40d16135c17eafa4fdf03f5a42f890844186a18b238c4ca6
+oid sha256:94741663b8c22d9d63dc103e16021d4a795bebc3806bc8a3a28df4389a2a3436
 size 4943162336
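Each safetensors entry is a three-line Git LFS pointer, so the diff only swaps the oid; the shard itself (same byte size, new contents) lives in LFS storage. A generic way to verify a downloaded shard against the pointer, assuming the file is present locally:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream a large file through SHA-256 without loading it into memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            h.update(block)
    return h.hexdigest()

# Should print the oid from the new pointer (94741663b8c2...):
# print(sha256_of("model-00001-of-00003.safetensors"))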
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ff12e41f45a36c3f68890ef475dcd6c05be44dedb814cf20a0f21a884a55dbdd
+oid sha256:3c7ea3821a24d6f5d661d04d83b9368480e46efa1906566d522b643a56e54714
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af1132bbacd4b520075a385096f1f152b8e4ca6f3f46aec4c5c4824b996b83f0
+oid sha256:a3d5ac37fcc5e44bdbcf64441154710502567ad3af48f65a66a7f9644cc8630d
 size 4540516344
tokenizer.json CHANGED
@@ -134,6 +134,7 @@
   "end_of_word_suffix": null,
   "fuse_unk": true,
   "byte_fallback": true,
+  "ignore_merges": false,
   "vocab": {
     "<unk>": 0,
     "<s>": 1,
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
 {
+  "add_bos_token": true,
+  "add_eos_token": false,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
train_results.json CHANGED
@@ -1,8 +1,9 @@
 {
-    "epoch": 1.0,
-    "train_loss": 0.29227719434256694,
-    "train_runtime": 2368.6921,
-    "train_samples": 39494,
-    "train_samples_per_second": 16.673,
-    "train_steps_per_second": 0.13
+    "epoch": 0.9998009950248756,
+    "total_flos": 0.0,
+    "train_loss": 0.017644011282071378,
+    "train_runtime": 64691.5343,
+    "train_samples": 160800,
+    "train_samples_per_second": 2.486,
+    "train_steps_per_second": 0.019
 }
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d65697231b98f7e888eecc68068bced2937dd398ec5f61cc77ca94851e4af926
-size 5944
+oid sha256:9c944456dbc3ebf108bccf6fc2316ab03d0876e88963b264d4065848df494725
+size 6520
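training_args.bin is a pickled TrainingArguments object, so the size bump (5944 to 6520 bytes) most likely just reflects fields added between transformers 4.35.2 and 4.41.1. It can be inspected directly, assuming a matching transformers install:

import torch

# Unpickling needs transformers importable; on torch >= 2.6 also pass
# weights_only=False, since the default no longer allows arbitrary objects.
args = torch.load("training_args.bin")
print(type(args).__name__)               # TrainingArguments
print(args.gradient_accumulation_steps)  # 16, matching the model card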