lukeleeai committed on
Commit 8d4f869
1 Parent(s): 1322f51

End of training

README.md CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 8.6108
+ - Loss: 9.8230
 
  ## Model description
 
@@ -45,15 +45,20 @@ The following hyperparameters were used during training:
  - total_eval_batch_size: 2
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: linear
- - training_steps: 50
+ - training_steps: 200
 
  ### Training results
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
- | 2.1687 | 0.0 | 10 | 2.4190 |
- | 9.3886 | 0.0 | 25 | 9.1091 |
- | 8.3891 | 0.01 | 50 | 8.3752 |
+ | 3.7949 | 0.0 | 25 | 2.4027 |
+ | 3.6416 | 0.01 | 50 | 2.3898 |
+ | 3.6954 | 0.01 | 75 | 2.3849 |
+ | 3.583 | 0.02 | 100 | 2.3945 |
+ | 3.547 | 0.02 | 125 | 2.4562 |
+ | 3.5568 | 0.02 | 150 | 2.5076 |
+ | 3.4658 | 0.03 | 175 | 2.5108 |
+ | 3.4684 | 0.03 | 200 | 2.5249 |
 
 
  ### Framework versions
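For context on the hyperparameters listed in the hunk above (200 training steps, linear scheduler, Adam with betas=(0.9,0.999) and epsilon=1e-08, evaluation every 25 steps per the results table), here is a minimal sketch of an equivalent `transformers` `TrainingArguments`. This is an illustration only: the launch script is not part of this commit, and `output_dir`, the learning rate, and the per-device batch sizes are placeholders rather than values taken from the repo.

```python
from transformers import TrainingArguments

# Sketch only -- mirrors the hyperparameters shown in the README diff above.
# output_dir and per_device_eval_batch_size are placeholders, not repo values.
training_args = TrainingArguments(
    output_dir="mistral-sparse-sft",
    max_steps=200,                 # training_steps: 200 after this commit (was 50)
    lr_scheduler_type="linear",    # lr_scheduler_type: linear
    adam_beta1=0.9,                # optimizer: Adam with betas=(0.9,0.999)
    adam_beta2=0.999,
    adam_epsilon=1e-8,             # and epsilon=1e-08
    per_device_eval_batch_size=1,  # total_eval_batch_size: 2 (assumes 2 eval processes)
    evaluation_strategy="steps",
    eval_steps=25,                 # matches the 25-step cadence of the results table
)
```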
config.json CHANGED
@@ -59,7 +59,8 @@
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.37.2",
- "use_cache": false,
+ "us_sparse_regularization": true,
+ "use_cache": true,
   "use_graceful_regularization": true,
   "use_relu": false,
   "use_sparse_model": true,
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:89be71f4726756540a7e92925ea6cc4884466952294d8ce42bcb108d45be4494
+ oid sha256:ace92b47f66a1e5a1371c04c29f95dbe246e2ba01fc87d3410c04e9152acaeab
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:75f4b1e531515c7c971bb268f5bceb2c045e9cf6a76ebed4fddd2c5353c795c2
+ oid sha256:c4ffb3a1220bc47d6078225df316c9fc9161206859ea9120a9f35c406ff347f4
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:78921f2c3c1c21893a7694d97bd2652764de23a1a58fb43cfbd717bf1aa6fdbd
+ oid sha256:6d85be4d4fd64d8cbd6e88d4ae76f147705308dfbe4b0ece5f0d94693b2c4c59
  size 4540516344
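All three weight shards were replaced; only the Git LFS pointer files appear in the diff, so the `oid sha256:` lines above are the digests of the new shards. A small sketch for verifying locally downloaded shards against those digests (it assumes the three `.safetensors` files sit in the current directory):

```python
import hashlib
from pathlib import Path

# Sketch only -- compare each shard's digest against the new oid sha256 values above.
for shard in sorted(Path(".").glob("model-*-of-00003.safetensors")):
    digest = hashlib.sha256()
    with shard.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    print(shard.name, digest.hexdigest())
```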
sparsification_sftt.py CHANGED
@@ -78,6 +78,14 @@ class SparseSFTTTrainer(SFTTrainer):
              if loss is not None:
                  self.accelerator.backward(loss, retain_graph=False)
 
+         if self.use_spm_loss:
+             spm_loss = self.compute_spm_loss(model)
+             if self.args.n_gpu > 1:
+                 spm_loss = spm_loss.mean()
+             if spm_loss is not None:
+                 self.accelerator.backward(spm_loss, retain_graph=False)
+             loss += spm_loss
+
          if self.use_sparse_regularization:
              regularization_loss = self.compute_regularization(model)
              if self.args.n_gpu > 1:
@@ -86,13 +94,8 @@ class SparseSFTTTrainer(SFTTrainer):
                  self.accelerator.backward(regularization_loss, retain_graph=True)
              loss += regularization_loss
 
-         if self.use_spm_loss:
-             spm_loss = self.compute_spm_loss(model)
-             if self.args.n_gpu > 1:
-                 spm_loss = spm_loss.mean()
-             if spm_loss is not None:
-                 self.accelerator.backward(spm_loss, retain_graph=False)
-             loss += spm_loss
+             if self.state.global_step % 5 == 0:
+                 ds_print("Regularization loss: ", regularization_loss.item())
 
          return loss.detach() / self.args.gradient_accumulation_steps
 
@@ -198,9 +201,10 @@ class SparseTrainer(Trainer):
 
          if self.args.n_gpu > 1:
              loss = loss.mean()  # mean() to average on multi-gpu parallel training
+
          if not self.freeze_original_weights:
              if loss is not None:
-                 self.accelerator.backward(loss, retain_graph=False)
+                 self.accelerator.backward(loss, retain_graph=True)
 
          if self.use_sparse_regularization:
              regularization_loss = self.compute_regularization(model)
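Taken together, the sparsification_sftt.py changes move the SPM-loss branch ahead of the sparse-regularization branch in SparseSFTTTrainer, log the regularization loss every 5 steps, and switch the main backward call in SparseTrainer to `retain_graph=True`. The following is a minimal, self-contained sketch of that multi-backward `training_step` pattern, not the repo's implementation: `compute_spm_loss`, `compute_regularization`, and `ds_print` exist in the actual file but are replaced here with simple placeholders, and the flag defaults are assumptions.

```python
import torch
from transformers import Trainer


class SparseRegularizedTrainer(Trainer):
    """Sketch of the multi-backward training_step pattern used in sparsification_sftt.py."""

    # Assumed defaults; the real trainer configures these via its constructor.
    use_spm_loss = False
    use_sparse_regularization = True
    freeze_original_weights = False

    def compute_spm_loss(self, model):
        # Placeholder for the repo's SPM loss (a zero scalar that still supports backward).
        return torch.zeros((), device=self.args.device, requires_grad=True)

    def compute_regularization(self, model):
        # Placeholder L1-style sparsity penalty; the real regularizer lives in the repo's file.
        return 1e-4 * sum(p.abs().mean() for p in model.parameters() if p.requires_grad)

    def training_step(self, model, inputs):
        model.train()
        inputs = self._prepare_inputs(inputs)
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # average on multi-GPU parallel training

        if not self.freeze_original_weights and loss is not None:
            # SparseSFTTTrainer keeps retain_graph=False here; this commit switches
            # the corresponding call in SparseTrainer to retain_graph=True.
            self.accelerator.backward(loss, retain_graph=False)

        if self.use_spm_loss:
            spm_loss = self.compute_spm_loss(model)
            if self.args.n_gpu > 1:
                spm_loss = spm_loss.mean()
            self.accelerator.backward(spm_loss, retain_graph=False)
            loss = loss + spm_loss

        if self.use_sparse_regularization:
            regularization_loss = self.compute_regularization(model)
            if self.args.n_gpu > 1:
                regularization_loss = regularization_loss.mean()
            self.accelerator.backward(regularization_loss, retain_graph=True)
            loss = loss + regularization_loss
            if self.state.global_step % 5 == 0:
                print("Regularization loss:", regularization_loss.item())  # repo uses ds_print

        # Report the (detached) combined loss, scaled for gradient accumulation.
        return loss.detach() / self.args.gradient_accumulation_steps
```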