End of training
Files changed:
- README.md (+5 -5)
- model-00001-of-00003.safetensors (+1 -1)
- model-00002-of-00003.safetensors (+1 -1)
- model-00003-of-00003.safetensors (+1 -1)
- sparsification_sftt.py (+18 -11)
README.md
CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 4.
+- Loss: 4.7548
 
 ## Model description
 
@@ -51,10 +51,10 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 8.
-| 7.
-| 
-| 4.
+| 8.8795        | 0.0   | 25   | 8.7468          |
+| 7.9016        | 0.01  | 50   | 7.8344          |
+| 7.1601        | 0.01  | 75   | 7.0248          |
+| 4.8571        | 0.02  | 100  | 4.4464          |
 
 
 ### Framework versions
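Since the reported value is a cross-entropy loss, the implied perplexity follows from exp(loss). A one-line check, assuming the value is mean per-token loss in nats (the usual `transformers` Trainer convention):

```python
import math

# perplexity implied by the final eval loss reported above
print(math.exp(4.7548))  # ~116.1
```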
model-00001-of-00003.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ce3ddfe8fe10bdb06f7a9e1e4076eda4e932f468ea80030311ddd175ea0d9e0e
 size 4943162336
model-00002-of-00003.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b50857b2b7c48d86020e09a2153d23e884faa29ca259623f1288381109fa6b3b
 size 4999819336
model-00003-of-00003.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c5ef6f3146013e20ebe3e9cb4a52e10a9385a21aa2a7c5f838f15f830ee2cabd
 size 4540516344
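All three shard pointers were rewritten with fresh SHA-256 oids while the sizes stayed the same. A minimal sketch for verifying a downloaded shard against its Git LFS pointer; the helper below is illustrative, not part of this repo:

```python
import hashlib

def sha256_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream in 1 MiB chunks so a ~5 GB shard never has to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid from the updated pointer above
expected = "c5ef6f3146013e20ebe3e9cb4a52e10a9385a21aa2a7c5f838f15f830ee2cabd"
assert sha256_of_file("model-00003-of-00003.safetensors") == expected
```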
sparsification_sftt.py
CHANGED
@@ -378,9 +378,7 @@ class SparseMistralFlashAttention(MistralFlashAttention2):
         self.num_bins = num_bins
         self.hist_min = -2
         self.hist_max = 2
-        self.histogram_bins = torch.linspace(
-            self.hist_min, self.hist_max, num_bins - 2
-        )
+        self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
         self.histogram_bins = torch.cat(
             [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
         )
@@ -749,9 +747,7 @@ class SparseMistralAttention(MistralAttention):
         self.num_bins = num_bins
         self.hist_min = -2
         self.hist_max = 2
-        self.histogram_bins = torch.linspace(
-            self.hist_min, self.hist_max, num_bins - 2
-        )
+        self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
         self.histogram_bins = torch.cat(
             [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
         )
@@ -935,9 +931,7 @@ class MistralSparseSiluMLP(MistralMLP):
         self.num_bins = num_bins
         self.hist_min = -2
         self.hist_max = 2
-        self.histogram_bins = torch.linspace(
-            self.hist_min, self.hist_max, num_bins - 2
-        )
+        self.histogram_bins = torch.linspace(self.hist_min, self.hist_max, num_bins - 2)
         self.histogram_bins = torch.cat(
             [torch.tensor([-torch.inf]), self.histogram_bins, torch.tensor([torch.inf])]
         )
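The same two-step construction now appears in all three modules: `torch.linspace` produces the `num_bins - 2` finite edges on one line, and `torch.cat` pads them with -inf/+inf so the outermost buckets absorb activations that fall outside [hist_min, hist_max]. A minimal sketch of how such edges bucket activations; the `torch.bucketize` counting below is illustrative and not taken from sparsification_sftt.py:

```python
import torch

num_bins = 8
hist_min, hist_max = -2.0, 2.0

# num_bins - 2 finite edges, padded with -inf/+inf outer edges
edges = torch.linspace(hist_min, hist_max, num_bins - 2)
edges = torch.cat([torch.tensor([-torch.inf]), edges, torch.tensor([torch.inf])])

acts = 1.5 * torch.randn(10_000)      # stand-in for collected activations
idx = torch.bucketize(acts, edges)    # index i such that edges[i-1] < v <= edges[i]
counts = torch.bincount(idx, minlength=num_bins)
# counts[0] stays 0 (nothing is <= -inf); outliers land in the first/last real bucket
```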
@@ -1676,10 +1670,13 @@ def plot_histogram(
     histogram_counts: torch.tensor,
     title: str = "Activation Distribution",
     fig_dir: str = "figures",
+    y_logscale: bool = False,
 ):
     plt.bar(
         bin_edges[:-1], histogram_counts, width=np.diff(bin_edges), edgecolor="black"
     )
+    if y_logscale:
+        plt.yscale("log")
     plt.title(title)
     plt.xlabel("Activation Value")
     plt.ylabel("Frequency")
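The new flag is opt-in and changes nothing else about the plot. A hypothetical call, assuming `plot_histogram` is importable from sparsification_sftt.py and that finite edges are passed (matplotlib cannot place a bar at +/-inf):

```python
import torch
from sparsification_sftt import plot_histogram  # assumes the module is on the path

edges = torch.linspace(-2, 2, 31)         # 31 finite edges -> 30 bars
counts = torch.randint(1, 1_000, (30,))   # stand-in histogram counts
plot_histogram(edges, counts, title="Layer: 0 Post QK_T Distribution",
               y_logscale=True)           # log y-axis suits heavy-tailed counts
```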
@@ -1710,13 +1707,21 @@ def plot_act(model, fig_dir: str = "figures"):
         plot_histogram(
             layer.mlp.histogram_bins, layer.mlp.post_act_hist_counts, plot_title
         )
+
+        plot_title = f"Layer: {i} Pre-MLP Absolute Distribution"
+        plot_histogram(
+            layer.mlp.histogram_bins, layer.mlp.pre_mlp_hist_counts, plot_title
+        )
     for i, layer in enumerate(model.model.layers):
         if (
-            isinstance(layer.self_attn, SparseMistralAttention)
+            isinstance(layer.self_attn, SparseMistralAttention)
+            and layer.self_attn.is_stats
         ):  # The threshold can be set only when the relevant statistics are collected.
             plot_title = f"Layer: {i} Pre-attention Distribution"
             plot_histogram(
-                layer.self_attn.histogram_bins,
+                layer.self_attn.histogram_bins,
+                layer.self_attn.pre_attn_hist_counts,
+                plot_title,
             )
 
             plot_title = f"Layer: {i} Post QK_T Distribution"
@@ -1724,8 +1729,10 @@ def plot_act(model, fig_dir: str = "figures"):
                 layer.self_attn.histogram_bins,
                 layer.self_attn.post_qk_hist_counts,
                 plot_title,
+                y_logscale=True,
             )
 
+
 def save_act_hist(model, dirname="/scr/jay/models/mistral/pre_finetune/cola_act_hist"):
     os.makedirs(dirname, exist_ok=True)
     act_dict = {}
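Taken together, `plot_act` now also plots a pre-MLP absolute-value histogram per layer, only touches attention modules whose statistics were actually collected (the new `is_stats` guard), and log-scales the post-QK_T plot. A minimal sketch of how it might be driven; the checkpoint path is a placeholder, and populating the histogram buffers during evaluation is assumed to happen elsewhere:

```python
from transformers import AutoModelForCausalLM
from sparsification_sftt import plot_act, save_act_hist

model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")
# ... run evaluation batches with activation-stats collection enabled,
# so the *_hist_counts buffers referenced by plot_act are populated ...
plot_act(model, fig_dir="figures")                 # per-layer histograms
save_act_hist(model, dirname="figures/act_hist")   # raw counts for later reuse
```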