AlekseyKorshuk committed on
Commit
7e3d7f5
1 Parent(s): 29f97ae

End of training

Browse files
README.md CHANGED
@@ -35,7 +35,7 @@ datasets:
35
 
36
  dataset_prepared_path:
37
  val_set_size: 0.001
38
- output_dir: ./phi-sft-out
39
 
40
  sequence_len: 2048
41
  sample_packing: false # currently unsupported
@@ -63,8 +63,8 @@ adam_beta2: 0.95
63
  adam_epsilon: 0.00001
64
  #max_grad_norm: 1.0
65
  lr_scheduler: cosine
66
- learning_rate: 1e-5
67
- warmup_ratio: 0.03
68
  weight_decay: 0.01
69
 
70
  train_on_inputs: false
@@ -85,7 +85,7 @@ flash_attention: true
85
 
86
  evals_per_epoch: 1
87
  eval_table_size: 8 # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
88
- eval_table_max_new_tokens: 512 # Total number of tokens generated for predictions sent to wandb. Default is 128
89
 
90
  saves_per_epoch: 1
91
  save_total_limit: 1
@@ -109,7 +109,7 @@ tokens:
109
 
110
  This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on the None dataset.
111
  It achieves the following results on the evaluation set:
112
- - Loss: 1.0374
113
 
114
  ## Model description
115
 
@@ -128,7 +128,7 @@ More information needed
128
  ### Training hyperparameters
129
 
130
  The following hyperparameters were used during training:
131
- - learning_rate: 1e-05
132
  - train_batch_size: 16
133
  - eval_batch_size: 16
134
  - seed: 42
@@ -138,16 +138,17 @@ The following hyperparameters were used during training:
138
  - total_eval_batch_size: 128
139
  - optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-05
140
  - lr_scheduler_type: cosine
 
141
  - num_epochs: 3
142
 
143
  ### Training results
144
 
145
  | Training Loss | Epoch | Step | Validation Loss |
146
  |:-------------:|:-----:|:----:|:---------------:|
147
- | 1.0571 | 0.01 | 1 | 1.2056 |
148
- | 0.8271 | 1.0 | 82 | 1.0443 |
149
- | 0.7871 | 2.0 | 164 | 1.0378 |
150
- | 0.8198 | 3.0 | 246 | 1.0374 |
151
 
152
 
153
  ### Framework versions
 
35
 
36
  dataset_prepared_path:
37
  val_set_size: 0.001
38
+ output_dir: ./output
39
 
40
  sequence_len: 2048
41
  sample_packing: false # currently unsupported
 
63
  adam_epsilon: 0.00001
64
  #max_grad_norm: 1.0
65
  lr_scheduler: cosine
66
+ learning_rate: 2e-5
67
+ warmup_steps: 4
68
  weight_decay: 0.01
69
 
70
  train_on_inputs: false
 
85
 
86
  evals_per_epoch: 1
87
  eval_table_size: 8 # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
88
+ eval_table_max_new_tokens: 768 # Total number of tokens generated for predictions sent to wandb. Default is 128
89
 
90
  saves_per_epoch: 1
91
  save_total_limit: 1
 
109
 
110
  This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on the None dataset.
111
  It achieves the following results on the evaluation set:
112
+ - Loss: 1.0121
113
 
114
  ## Model description
115
 
 
128
  ### Training hyperparameters
129
 
130
  The following hyperparameters were used during training:
131
+ - learning_rate: 2e-05
132
  - train_batch_size: 16
133
  - eval_batch_size: 16
134
  - seed: 42
 
138
  - total_eval_batch_size: 128
139
  - optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-05
140
  - lr_scheduler_type: cosine
141
+ - lr_scheduler_warmup_steps: 4
142
  - num_epochs: 3
143
 
144
  ### Training results
145
 
146
  | Training Loss | Epoch | Step | Validation Loss |
147
  |:-------------:|:-----:|:----:|:---------------:|
148
+ | 1.0571 | 0.01 | 1 | 1.3648 |
149
+ | 0.8044 | 1.0 | 82 | 1.0212 |
150
+ | 0.7486 | 2.0 | 164 | 1.0126 |
151
+ | 0.7745 | 3.0 | 246 | 1.0121 |
152
 
153
 
154
  ### Framework versions
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b8e440ba098c6693e1970dbfdf8684e3a14da4bbbdade6d0ad2cd0a952c2b46
3
  size 4995584424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e6b2ac5cd4fca2335a391e321a6ed3737b759803f20b91b91e19d1fa1e95c08
3
  size 4995584424
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dacdf3eaf603bfb78a59db4f250e37abcabf85e593fd346c5fd74fc7e76839b6
3
  size 563832976
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8b5a250a1c279dc3032236b9641e76e72c47a57ab50db7beed09bb9615f1789
3
  size 563832976
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4ca6187ba5a05330be169c2ba5b686eb828e7ee70bbbc417f00527486bc017d
3
  size 4995685160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b89297bf572d4668437fed3b2e66f4ad7def0a4f5e99d8ea0d8db73ac1927a0
3
  size 4995685160
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a1598764727976664df4ce877a1b746d5eafb550412192922e7bac0695011e1
3
  size 563839915
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9babe694436407ae9a4421b5e2e3438fa875bcfd0c3437877df2b5c83b15e810
3
  size 563839915