Upload 4 files
- configuration_mistral.py +4 -5
- modeling_mistral.py +0 -0
configuration_mistral.py
CHANGED
@@ -14,8 +14,8 @@
 # limitations under the License.
 """Mistral model configuration"""

-from
-from
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging


 logger = logging.get_logger(__name__)
@@ -116,7 +116,7 @@ class MistralConfig(PretrainedConfig):
         rope_theta=10000.0,
         sliding_window=4096,
         attention_dropout=0.0,
-        max_thoughts=16,
+        max_thoughts=16,
         merged_talk_heads=True,
         merged_lm_and_talk_heads=False,
         merged_lm_and_think_heads=True,
@@ -148,7 +148,6 @@
         self.rope_theta = rope_theta
         self.attention_dropout = attention_dropout
         self.max_thoughts = max_thoughts
-        self.thought_length = thought_length
         self.merged_talk_heads = merged_talk_heads
         self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
         self.merged_lm_and_think_heads = merged_lm_and_think_heads
@@ -262,7 +261,7 @@ class MistralStarConfig(PretrainedConfig):
         sliding_window=4096,
         attention_dropout=0.0,
         max_thoughts=16,
-        thought_length =
+        thought_length = 10,
         merged_talk_heads=True,
         merged_lm_and_talk_heads=False,
         merged_lm_and_think_heads=True,
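For orientation, the changed fields above are keyword arguments of the two config classes. Below is a minimal sketch of how the updated defaults would be exercised; it assumes this patched configuration_mistral.py replaces transformers/models/mistral/configuration_mistral.py (the new relative imports imply that location) and that both classes store their keyword arguments as attributes in the usual PretrainedConfig way. MistralStarConfig and the import path come from this repo's copy of the file, not stock transformers.

# Minimal sketch: exercising the changed config defaults.
# Assumes the patched configuration_mistral.py sits at
# transformers/models/mistral/ (its new relative imports imply that)
# and that both classes store their keyword arguments as attributes.
from transformers.models.mistral.configuration_mistral import (
    MistralConfig,
    MistralStarConfig,  # defined only in this repo's copy of the file
)

# MistralConfig keeps max_thoughts but, after this commit, no longer
# stores a thought_length attribute.
base_cfg = MistralConfig(
    max_thoughts=16,
    merged_talk_heads=True,
    merged_lm_and_talk_heads=False,
    merged_lm_and_think_heads=True,
)
print(base_cfg.max_thoughts)  # 16

# MistralStarConfig now defaults thought_length to 10; it can still be
# overridden when the config is built.
star_cfg = MistralStarConfig(
    max_thoughts=16,
    thought_length=10,
    merged_talk_heads=True,
    merged_lm_and_talk_heads=False,
    merged_lm_and_think_heads=True,
)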
modeling_mistral.py
CHANGED
The diff for this file is too large to render.
See raw diff