vxbrandon committed
Commit 197e6c6
1 Parent(s): 34c87da

End of training

README.md CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 4.4216
+- Loss: 4.7548
 
 ## Model description
 
@@ -51,10 +51,10 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 8.8887        | 0.0   | 25   | 8.7331          |
-| 7.8731        | 0.01  | 50   | 7.7943          |
-| 6.9763        | 0.01  | 75   | 6.7960          |
-| 4.476         | 0.02  | 100  | 4.1595          |
+| 8.8795        | 0.0   | 25   | 8.7468          |
+| 7.9016        | 0.01  | 50   | 7.8344          |
+| 7.1601        | 0.01  | 75   | 7.0248          |
+| 4.8571        | 0.02  | 100  | 4.4464          |
 
 
 ### Framework versions
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49847005ec5a257745afccd4cf9d08a06ef02bdc2f9fde40c053c1a6aa3173c6
+oid sha256:ce3ddfe8fe10bdb06f7a9e1e4076eda4e932f468ea80030311ddd175ea0d9e0e
 size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:34b0604748b7547d8d2f8485621b8d499aab05dfb83c41df4f5ca3f8d04aa609
+oid sha256:b50857b2b7c48d86020e09a2153d23e884faa29ca259623f1288381109fa6b3b
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f3b355c1e62de71e20058b29c536fa1d0ba05d4ecc5dcc168a2888c0df6fbac
+oid sha256:c5ef6f3146013e20ebe3e9cb4a52e10a9385a21aa2a7c5f838f15f830ee2cabd
 size 4540516344
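The three shard updates above change only the Git LFS pointers: each shard gets a new sha256 oid while its byte size stays identical. As a quick local sanity check (a standard-library sketch, not something shipped in this repository), a downloaded shard can be hashed and compared against the oid in its pointer:

```python
import hashlib

# Hash a safetensors shard in chunks and compare it with the sha256 oid from
# its LFS pointer (oid copied from the model-00001-of-00003.safetensors diff).
def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "ce3ddfe8fe10bdb06f7a9e1e4076eda4e932f468ea80030311ddd175ea0d9e0e"
assert sha256_of("model-00001-of-00003.safetensors") == expected
```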
sparsification_sftt.py CHANGED
@@ -378,9 +378,7 @@ class SparseMistralFlashAttention(MistralFlashAttention2):
378
  self.num_bins = num_bins
379
  self.hist_min = -2
380
  self.hist_max = 2
381
- self.histogram_bins = torch.linspace(
382
- self.hist_min, self.hist_max, num_bins - 2
383
- )
384
  self.histogram_bins = torch.cat(
385
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
386
  )
@@ -749,9 +747,7 @@ class SparseMistralAttention(MistralAttention):
749
  self.num_bins = num_bins
750
  self.hist_min = -2
751
  self.hist_max = 2
752
- self.histogram_bins = torch.linspace(
753
- self.hist_min, self.hist_max, num_bins - 2
754
- )
755
  self.histogram_bins = torch.cat(
756
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
757
  )
@@ -935,9 +931,7 @@ class MistralSparseSiluMLP(MistralMLP):
935
  self.num_bins = num_bins
936
  self.hist_min = -2
937
  self.hist_max = 2
938
- self.histogram_bins = torch.linspace(
939
- self.hist_min, self.hist_max, num_bins - 2
940
- )
941
  self.histogram_bins = torch.cat(
942
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
943
  )
@@ -1676,10 +1670,13 @@ def plot_histogram(
1676
  histogram_counts: torch.tensor,
1677
  title: str = "Activation Distribution",
1678
  fig_dir: str = "figures",
 
1679
  ):
1680
  plt.bar(
1681
  bin_edges[:-1], histogram_counts, width=np.diff(bin_edges), edgecolor="black"
1682
  )
 
 
1683
  plt.title(title)
1684
  plt.xlabel("Activation Value")
1685
  plt.ylabel("Frequency")
@@ -1710,13 +1707,21 @@ def plot_act(model, fig_dir: str = "figures"):
1710
  plot_histogram(
1711
  layer.mlp.histogram_bins, layer.mlp.post_act_hist_counts, plot_title
1712
  )
 
 
 
 
 
1713
  for i, layer in enumerate(model.model.layers):
1714
  if (
1715
- isinstance(layer.self_attn, SparseMistralAttention) and layer.self_attn.is_stats
 
1716
  ): # Can set the threshold only the relevant statistics is collected.
1717
  plot_title = f"Layer: {i} Pre-attention Distribution"
1718
  plot_histogram(
1719
- layer.self_attn.histogram_bins, layer.self_attn.pre_attn_hist_counts, plot_title
 
 
1720
  )
1721
 
1722
  plot_title = f"Layer: {i} Post QK_T Distribution"
@@ -1724,8 +1729,10 @@ def plot_act(model, fig_dir: str = "figures"):
1724
  layer.self_attn.histogram_bins,
1725
  layer.self_attn.post_qk_hist_counts,
1726
  plot_title,
 
1727
  )
1728
 
 
1729
  def save_act_hist(model, dirname="/scr/jay/models/mistral/pre_finetune/cola_act_hist"):
1730
  os.makedirs(dirname, exist_ok=True)
1731
  act_dict = {}
 
378
  self.num_bins = num_bins
379
  self.hist_min = -2
380
  self.hist_max = 2
381
+ self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
 
 
382
  self.histogram_bins = torch.cat(
383
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
384
  )
 
747
  self.num_bins = num_bins
748
  self.hist_min = -2
749
  self.hist_max = 2
750
+ self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
 
 
751
  self.histogram_bins = torch.cat(
752
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
753
  )
 
931
  self.num_bins = num_bins
932
  self.hist_min = -2
933
  self.hist_max = 2
934
+ self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
 
 
935
  self.histogram_bins = torch.cat(
936
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
937
  )
 
1670
  histogram_counts: torch.tensor,
1671
  title: str = "Activation Distribution",
1672
  fig_dir: str = "figures",
1673
+ y_logscale:bool = False,
1674
  ):
1675
  plt.bar(
1676
  bin_edges[:-1], histogram_counts, width=np.diff(bin_edges), edgecolor="black"
1677
  )
1678
+ if y_logscale:
1679
+ plt.yscale("log")
1680
  plt.title(title)
1681
  plt.xlabel("Activation Value")
1682
  plt.ylabel("Frequency")
 
1707
  plot_histogram(
1708
  layer.mlp.histogram_bins, layer.mlp.post_act_hist_counts, plot_title
1709
  )
1710
+
1711
+ plot_title = f"Layer: {i} Pre-MLP Absolute Distribution"
1712
+ plot_histogram(
1713
+ layer.mlp.histogram_bins, layer.mlp.pre_mlp_hist_counts, plot_title
1714
+ )
1715
  for i, layer in enumerate(model.model.layers):
1716
  if (
1717
+ isinstance(layer.self_attn, SparseMistralAttention)
1718
+ and layer.self_attn.is_stats
1719
  ): # Can set the threshold only the relevant statistics is collected.
1720
  plot_title = f"Layer: {i} Pre-attention Distribution"
1721
  plot_histogram(
1722
+ layer.self_attn.histogram_bins,
1723
+ layer.self_attn.pre_attn_hist_counts,
1724
+ plot_title,
1725
  )
1726
 
1727
  plot_title = f"Layer: {i} Post QK_T Distribution"
 
1729
  layer.self_attn.histogram_bins,
1730
  layer.self_attn.post_qk_hist_counts,
1731
  plot_title,
1732
+ y_logscale=True,
1733
  )
1734
 
1735
+
1736
  def save_act_hist(model, dirname="/scr/jay/models/mistral/pre_finetune/cola_act_hist"):
1737
  os.makedirs(dirname, exist_ok=True)
1738
  act_dict = {}
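In the three attention/MLP `__init__` blocks, the edit is purely cosmetic: the `torch.linspace` call is collapsed onto one line, and the resulting edges are still padded with `-inf`/`+inf` so out-of-range activations fall into two open-ended outer bins. A minimal sketch of that construction (the `num_bins = 64` value and the `torch.bucketize`-based counting are illustrative assumptions, not the repository's actual collection code):

```python
import torch

num_bins = 64                    # hypothetical; each module receives its own num_bins
hist_min, hist_max = -2, 2

# num_bins - 2 evenly spaced edges on [hist_min, hist_max] ...
histogram_bins = torch.linspace(hist_min, hist_max, num_bins - 2)
# ... padded with -inf / +inf so outliers land in the two outer, open-ended bins
histogram_bins = torch.cat(
    [torch.tensor([-torch.inf]), histogram_bins, torch.tensor([torch.inf])]
)
assert histogram_bins.numel() == num_bins

# One way to accumulate counts against these edges (illustration only):
acts = torch.randn(100_000)                  # stand-in for captured activations
idx = torch.bucketize(acts, histogram_bins)  # i such that bins[i-1] < x <= bins[i]
counts = torch.bincount(idx, minlength=num_bins)[1:]  # one count per interval
assert counts.numel() == num_bins - 1
assert int(counts.sum()) == acts.numel()     # every activation is binned
```

The functional changes are confined to the plotting helpers: `plot_histogram` gains a `y_logscale` flag that applies `plt.yscale("log")` after drawing the bars, `plot_act` adds a per-layer "Pre-MLP Absolute Distribution" plot, and only the heavy-tailed post QK_T plot opts into the log scale.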