vxbrandon committed
Commit 197e6c6
1 Parent(s): 34c87da

End of training

README.md CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 4.4216
+- Loss: 4.7548
 
 ## Model description
 
@@ -51,10 +51,10 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 8.8887        | 0.0   | 25   | 8.7331          |
-| 7.8731        | 0.01  | 50   | 7.7943          |
-| 6.9763        | 0.01  | 75   | 6.7960          |
-| 4.476         | 0.02  | 100  | 4.1595          |
+| 8.8795        | 0.0   | 25   | 8.7468          |
+| 7.9016        | 0.01  | 50   | 7.8344          |
+| 7.1601        | 0.01  | 75   | 7.0248          |
+| 4.8571        | 0.02  | 100  | 4.4464          |
 
 
 ### Framework versions
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49847005ec5a257745afccd4cf9d08a06ef02bdc2f9fde40c053c1a6aa3173c6
+oid sha256:ce3ddfe8fe10bdb06f7a9e1e4076eda4e932f468ea80030311ddd175ea0d9e0e
 size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:34b0604748b7547d8d2f8485621b8d499aab05dfb83c41df4f5ca3f8d04aa609
+oid sha256:b50857b2b7c48d86020e09a2153d23e884faa29ca259623f1288381109fa6b3b
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f3b355c1e62de71e20058b29c536fa1d0ba05d4ecc5dcc168a2888c0df6fbac
+oid sha256:c5ef6f3146013e20ebe3e9cb4a52e10a9385a21aa2a7c5f838f15f830ee2cabd
 size 4540516344
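The three shard updates above change only the Git LFS pointers: each shard gets a new sha256 oid while its byte size stays identical. As a quick local sanity check (a standard-library sketch, not something shipped in this repository), a downloaded shard can be hashed and compared against the oid in its pointer:

```python
import hashlib

# Hash a safetensors shard in chunks and compare it with the sha256 oid from
# its LFS pointer (oid copied from the model-00001-of-00003.safetensors diff).
def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "ce3ddfe8fe10bdb06f7a9e1e4076eda4e932f468ea80030311ddd175ea0d9e0e"
assert sha256_of("model-00001-of-00003.safetensors") == expected
```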
sparsification_sftt.py CHANGED
@@ -378,9 +378,7 @@ class SparseMistralFlashAttention(MistralFlashAttention2):
378
  self.num_bins = num_bins
379
  self.hist_min = -2
380
  self.hist_max = 2
381
- self.histogram_bins = torch.linspace(
382
- self.hist_min, self.hist_max, num_bins - 2
383
- )
384
  self.histogram_bins = torch.cat(
385
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
386
  )
@@ -749,9 +747,7 @@ class SparseMistralAttention(MistralAttention):
749
  self.num_bins = num_bins
750
  self.hist_min = -2
751
  self.hist_max = 2
752
- self.histogram_bins = torch.linspace(
753
- self.hist_min, self.hist_max, num_bins - 2
754
- )
755
  self.histogram_bins = torch.cat(
756
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
757
  )
@@ -935,9 +931,7 @@ class MistralSparseSiluMLP(MistralMLP):
935
  self.num_bins = num_bins
936
  self.hist_min = -2
937
  self.hist_max = 2
938
- self.histogram_bins = torch.linspace(
939
- self.hist_min, self.hist_max, num_bins - 2
940
- )
941
  self.histogram_bins = torch.cat(
942
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
943
  )
@@ -1676,10 +1670,13 @@ def plot_histogram(
1676
  histogram_counts: torch.tensor,
1677
  title: str = "Activation Distribution",
1678
  fig_dir: str = "figures",
 
1679
  ):
1680
  plt.bar(
1681
  bin_edges[:-1], histogram_counts, width=np.diff(bin_edges), edgecolor="black"
1682
  )
 
 
1683
  plt.title(title)
1684
  plt.xlabel("Activation Value")
1685
  plt.ylabel("Frequency")
@@ -1710,13 +1707,21 @@ def plot_act(model, fig_dir: str = "figures"):
1710
  plot_histogram(
1711
  layer.mlp.histogram_bins, layer.mlp.post_act_hist_counts, plot_title
1712
  )
 
 
 
 
 
1713
  for i, layer in enumerate(model.model.layers):
1714
  if (
1715
- isinstance(layer.self_attn, SparseMistralAttention) and layer.self_attn.is_stats
 
1716
  ): # Can set the threshold only the relevant statistics is collected.
1717
  plot_title = f"Layer: {i} Pre-attention Distribution"
1718
  plot_histogram(
1719
- layer.self_attn.histogram_bins, layer.self_attn.pre_attn_hist_counts, plot_title
 
 
1720
  )
1721
 
1722
  plot_title = f"Layer: {i} Post QK_T Distribution"
@@ -1724,8 +1729,10 @@ def plot_act(model, fig_dir: str = "figures"):
1724
  layer.self_attn.histogram_bins,
1725
  layer.self_attn.post_qk_hist_counts,
1726
  plot_title,
 
1727
  )
1728
 
 
1729
  def save_act_hist(model, dirname="/scr/jay/models/mistral/pre_finetune/cola_act_hist"):
1730
  os.makedirs(dirname, exist_ok=True)
1731
  act_dict = {}
 
378
  self.num_bins = num_bins
379
  self.hist_min = -2
380
  self.hist_max = 2
381
+ self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
 
 
382
  self.histogram_bins = torch.cat(
383
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
384
  )
 
747
  self.num_bins = num_bins
748
  self.hist_min = -2
749
  self.hist_max = 2
750
+ self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
 
 
751
  self.histogram_bins = torch.cat(
752
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
753
  )
 
931
  self.num_bins = num_bins
932
  self.hist_min = -2
933
  self.hist_max = 2
934
+ self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
 
 
935
  self.histogram_bins = torch.cat(
936
  [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
937
  )
 
1670
  histogram_counts: torch.tensor,
1671
  title: str = "Activation Distribution",
1672
  fig_dir: str = "figures",
1673
+ y_logscale:bool = False,
1674
  ):
1675
  plt.bar(
1676
  bin_edges[:-1], histogram_counts, width=np.diff(bin_edges), edgecolor="black"
1677
  )
1678
+ if y_logscale:
1679
+ plt.yscale("log")
1680
  plt.title(title)
1681
  plt.xlabel("Activation Value")
1682
  plt.ylabel("Frequency")
 
1707
  plot_histogram(
1708
  layer.mlp.histogram_bins, layer.mlp.post_act_hist_counts, plot_title
1709
  )
1710
+
1711
+ plot_title = f"Layer: {i} Pre-MLP Absolute Distribution"
1712
+ plot_histogram(
1713
+ layer.mlp.histogram_bins, layer.mlp.pre_mlp_hist_counts, plot_title
1714
+ )
1715
  for i, layer in enumerate(model.model.layers):
1716
  if (
1717
+ isinstance(layer.self_attn, SparseMistralAttention)
1718
+ and layer.self_attn.is_stats
1719
  ): # Can set the threshold only the relevant statistics is collected.
1720
  plot_title = f"Layer: {i} Pre-attention Distribution"
1721
  plot_histogram(
1722
+ layer.self_attn.histogram_bins,
1723
+ layer.self_attn.pre_attn_hist_counts,
1724
+ plot_title,
1725
  )
1726
 
1727
  plot_title = f"Layer: {i} Post QK_T Distribution"
 
1729
  layer.self_attn.histogram_bins,
1730
  layer.self_attn.post_qk_hist_counts,
1731
  plot_title,
1732
+ y_logscale=True,
1733
  )
1734
 
1735
+
1736
  def save_act_hist(model, dirname="/scr/jay/models/mistral/pre_finetune/cola_act_hist"):
1737
  os.makedirs(dirname, exist_ok=True)
1738
  act_dict = {}
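In the three attention/MLP `__init__` blocks, the edit is purely cosmetic: the `torch.linspace` call is collapsed onto one line, and the resulting edges are still padded with `-inf`/`+inf` so out-of-range activations fall into two open-ended outer bins. A minimal sketch of that construction (the `num_bins = 64` value and the `torch.bucketize`-based counting are illustrative assumptions, not the repository's actual collection code):

```python
import torch

num_bins = 64                    # hypothetical; each module receives its own num_bins
hist_min, hist_max = -2, 2

# num_bins - 2 evenly spaced edges on [hist_min, hist_max] ...
histogram_bins = torch.linspace(hist_min, hist_max, num_bins - 2)
# ... padded with -inf / +inf so outliers land in the two outer, open-ended bins
histogram_bins = torch.cat(
    [torch.tensor([-torch.inf]), histogram_bins, torch.tensor([torch.inf])]
)
assert histogram_bins.numel() == num_bins

# One way to accumulate counts against these edges (illustration only):
acts = torch.randn(100_000)                  # stand-in for captured activations
idx = torch.bucketize(acts, histogram_bins)  # i such that bins[i-1] < x <= bins[i]
counts = torch.bincount(idx, minlength=num_bins)[1:]  # one count per interval
assert counts.numel() == num_bins - 1
assert int(counts.sum()) == acts.numel()     # every activation is binned
```

The functional changes are confined to the plotting helpers: `plot_histogram` gains a `y_logscale` flag that applies `plt.yscale("log")` after drawing the bars, `plot_act` adds a per-layer "Pre-MLP Absolute Distribution" plot, and only the heavy-tailed post QK_T plot opts into the log scale.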