lukeleeai committed on
Commit 8d4f869
1 Parent(s): 1322f51

End of training

README.md CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 8.6108
+ - Loss: 9.8230
 
  ## Model description
 
@@ -45,15 +45,20 @@ The following hyperparameters were used during training:
  - total_eval_batch_size: 2
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: linear
- - training_steps: 50
+ - training_steps: 200
 
  ### Training results
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
- | 2.1687 | 0.0 | 10 | 2.4190 |
- | 9.3886 | 0.0 | 25 | 9.1091 |
- | 8.3891 | 0.01 | 50 | 8.3752 |
+ | 3.7949 | 0.0 | 25 | 2.4027 |
+ | 3.6416 | 0.01 | 50 | 2.3898 |
+ | 3.6954 | 0.01 | 75 | 2.3849 |
+ | 3.583 | 0.02 | 100 | 2.3945 |
+ | 3.547 | 0.02 | 125 | 2.4562 |
+ | 3.5568 | 0.02 | 150 | 2.5076 |
+ | 3.4658 | 0.03 | 175 | 2.5108 |
+ | 3.4684 | 0.03 | 200 | 2.5249 |
 
 
  ### Framework versions
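For context on the hyperparameters listed in the hunk above (200 training steps, linear scheduler, Adam with betas=(0.9,0.999) and epsilon=1e-08, evaluation every 25 steps per the results table), here is a minimal sketch of an equivalent `transformers` `TrainingArguments`. This is an illustration only: the launch script is not part of this commit, and `output_dir`, the learning rate, and the per-device batch sizes are placeholders rather than values taken from the repo.

```python
from transformers import TrainingArguments

# Sketch only -- mirrors the hyperparameters shown in the README diff above.
# output_dir and per_device_eval_batch_size are placeholders, not repo values.
training_args = TrainingArguments(
    output_dir="mistral-sparse-sft",
    max_steps=200,                 # training_steps: 200 after this commit (was 50)
    lr_scheduler_type="linear",    # lr_scheduler_type: linear
    adam_beta1=0.9,                # optimizer: Adam with betas=(0.9,0.999)
    adam_beta2=0.999,
    adam_epsilon=1e-8,             # and epsilon=1e-08
    per_device_eval_batch_size=1,  # total_eval_batch_size: 2 (assumes 2 eval processes)
    evaluation_strategy="steps",
    eval_steps=25,                 # matches the 25-step cadence of the results table
)
```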
config.json CHANGED
@@ -59,7 +59,8 @@
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.37.2",
- "use_cache": false,
+ "us_sparse_regularization": true,
+ "use_cache": true,
   "use_graceful_regularization": true,
   "use_relu": false,
   "use_sparse_model": true,
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:89be71f4726756540a7e92925ea6cc4884466952294d8ce42bcb108d45be4494
+ oid sha256:ace92b47f66a1e5a1371c04c29f95dbe246e2ba01fc87d3410c04e9152acaeab
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:75f4b1e531515c7c971bb268f5bceb2c045e9cf6a76ebed4fddd2c5353c795c2
+ oid sha256:c4ffb3a1220bc47d6078225df316c9fc9161206859ea9120a9f35c406ff347f4
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:78921f2c3c1c21893a7694d97bd2652764de23a1a58fb43cfbd717bf1aa6fdbd
+ oid sha256:6d85be4d4fd64d8cbd6e88d4ae76f147705308dfbe4b0ece5f0d94693b2c4c59
  size 4540516344
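All three weight shards were replaced; only the Git LFS pointer files appear in the diff, so the `oid sha256:` lines above are the digests of the new shards. A small sketch for verifying locally downloaded shards against those digests (it assumes the three `.safetensors` files sit in the current directory):

```python
import hashlib
from pathlib import Path

# Sketch only -- compare each shard's digest against the new oid sha256 values above.
for shard in sorted(Path(".").glob("model-*-of-00003.safetensors")):
    digest = hashlib.sha256()
    with shard.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    print(shard.name, digest.hexdigest())
```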
sparsification_sftt.py CHANGED
@@ -78,6 +78,14 @@ class SparseSFTTTrainer(SFTTrainer):
              if loss is not None:
                  self.accelerator.backward(loss, retain_graph=False)
 
+         if self.use_spm_loss:
+             spm_loss = self.compute_spm_loss(model)
+             if self.args.n_gpu > 1:
+                 spm_loss = spm_loss.mean()
+             if spm_loss is not None:
+                 self.accelerator.backward(spm_loss, retain_graph=False)
+             loss += spm_loss
+
          if self.use_sparse_regularization:
              regularization_loss = self.compute_regularization(model)
              if self.args.n_gpu > 1:
@@ -86,13 +94,8 @@ class SparseSFTTTrainer(SFTTrainer):
                  self.accelerator.backward(regularization_loss, retain_graph=True)
              loss += regularization_loss
 
-         if self.use_spm_loss:
-             spm_loss = self.compute_spm_loss(model)
-             if self.args.n_gpu > 1:
-                 spm_loss = spm_loss.mean()
-             if spm_loss is not None:
-                 self.accelerator.backward(spm_loss, retain_graph=False)
-             loss += spm_loss
+             if self.state.global_step % 5 == 0:
+                 ds_print("Regularization loss: ", regularization_loss.item())
 
          return loss.detach() / self.args.gradient_accumulation_steps
 
@@ -198,9 +201,10 @@ class SparseTrainer(Trainer):
 
          if self.args.n_gpu > 1:
              loss = loss.mean()  # mean() to average on multi-gpu parallel training
+
          if not self.freeze_original_weights:
              if loss is not None:
-                 self.accelerator.backward(loss, retain_graph=False)
+                 self.accelerator.backward(loss, retain_graph=True)
 
          if self.use_sparse_regularization:
              regularization_loss = self.compute_regularization(model)
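Taken together, the sparsification_sftt.py changes move the SPM-loss branch ahead of the sparse-regularization branch in SparseSFTTTrainer, log the regularization loss every 5 steps, and switch the main backward call in SparseTrainer to `retain_graph=True`. The following is a minimal, self-contained sketch of that multi-backward `training_step` pattern, not the repo's implementation: `compute_spm_loss`, `compute_regularization`, and `ds_print` exist in the actual file but are replaced here with simple placeholders, and the flag defaults are assumptions.

```python
import torch
from transformers import Trainer


class SparseRegularizedTrainer(Trainer):
    """Sketch of the multi-backward training_step pattern used in sparsification_sftt.py."""

    # Assumed defaults; the real trainer configures these via its constructor.
    use_spm_loss = False
    use_sparse_regularization = True
    freeze_original_weights = False

    def compute_spm_loss(self, model):
        # Placeholder for the repo's SPM loss (a zero scalar that still supports backward).
        return torch.zeros((), device=self.args.device, requires_grad=True)

    def compute_regularization(self, model):
        # Placeholder L1-style sparsity penalty; the real regularizer lives in the repo's file.
        return 1e-4 * sum(p.abs().mean() for p in model.parameters() if p.requires_grad)

    def training_step(self, model, inputs):
        model.train()
        inputs = self._prepare_inputs(inputs)
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # average on multi-GPU parallel training

        if not self.freeze_original_weights and loss is not None:
            # SparseSFTTTrainer keeps retain_graph=False here; this commit switches
            # the corresponding call in SparseTrainer to retain_graph=True.
            self.accelerator.backward(loss, retain_graph=False)

        if self.use_spm_loss:
            spm_loss = self.compute_spm_loss(model)
            if self.args.n_gpu > 1:
                spm_loss = spm_loss.mean()
            self.accelerator.backward(spm_loss, retain_graph=False)
            loss = loss + spm_loss

        if self.use_sparse_regularization:
            regularization_loss = self.compute_regularization(model)
            if self.args.n_gpu > 1:
                regularization_loss = regularization_loss.mean()
            self.accelerator.backward(regularization_loss, retain_graph=True)
            loss = loss + regularization_loss
            if self.state.global_step % 5 == 0:
                print("Regularization loss:", regularization_loss.item())  # repo uses ds_print

        # Report the (detached) combined loss, scaled for gradient accumulation.
        return loss.detach() / self.args.gradient_accumulation_steps
```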