TitanML Co
commited on
Commit
•
ab44456
1
Parent(s):
868656d
Upload folder using huggingface_hub
Browse files- .gitattributes +0 -1
- README.md +214 -0
- adapt_tokenizer.py +41 -0
- attention.py +300 -0
- blocks.py +41 -0
- config.json +52 -0
- configuration_mpt.py +118 -0
- ct_output_models/config.json +6 -0
- ct_output_models/model.bin +3 -0
- ct_output_models/vocabulary.json +0 -0
- custom_embedding.py +11 -0
- flash_attn_triton.py +484 -0
- generation_config.json +5 -0
- hf_prefixlm_converter.py +415 -0
- meta_init_context.py +94 -0
- modeling_mpt.py +323 -0
- norm.py +56 -0
- param_init_fns.py +181 -0
- requirements.txt +2 -0
- special_tokens_map.json +5 -0
- tokenizer.json +0 -0
- tokenizer_config.json +9 -0
.gitattributes
CHANGED
@@ -25,7 +25,6 @@
|
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
28 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
tags:
|
4 |
+
- Composer
|
5 |
+
- MosaicML
|
6 |
+
- llm-foundry
|
7 |
+
datasets:
|
8 |
+
- the_pile_books3
|
9 |
+
inference: false
|
10 |
+
---
|
11 |
+
|
12 |
+
# MPT-7B-StoryWriter-65k+
|
13 |
+
|
14 |
+
MPT-7B-StoryWriter-65k+ is a model designed to read and write fictional stories with super long context lengths.
|
15 |
+
It was built by finetuning MPT-7B with a context length of 65k tokens on a filtered fiction subset of the [books3 dataset](https://huggingface.co/datasets/the_pile_books3).
|
16 |
+
At inference time, thanks to [ALiBi](https://arxiv.org/abs/2108.12409), MPT-7B-StoryWriter-65k+ can extrapolate even beyond 65k tokens.
|
17 |
+
We demonstrate generations as long as 84k tokens on a single node of 8 A100-80GB GPUs in our [blogpost](https://www.mosaicml.com/blog/mpt-7b).
|
18 |
+
* License: Apache 2.0
|
19 |
+
|
20 |
+
This model was trained by [MosaicML](https://www.mosaicml.com) and follows a modified decoder-only transformer architecture.
|
21 |
+
|
22 |
+
## Model Date
|
23 |
+
|
24 |
+
May 5, 2023
|
25 |
+
|
26 |
+
## Model License
|
27 |
+
|
28 |
+
Apache 2.0
|
29 |
+
|
30 |
+
## Documentation
|
31 |
+
|
32 |
+
* [Blog post: Introducing MPT-7B: A New Standard for Open-Source, Commercially Usable LLMs](https://www.mosaicml.com/blog/mpt-7b)
|
33 |
+
* [Codebase (mosaicml/llm-foundry repo)](https://github.com/mosaicml/llm-foundry/)
|
34 |
+
* Questions: Feel free to contact us via the [MosaicML Community Slack](https://mosaicml.me/slack)!
|
35 |
+
|
36 |
+
|
37 |
+
## How to Use
|
38 |
+
|
39 |
+
Note: This model requires that `trust_remote_code=True` be passed to the `from_pretrained` method. This is because we use a custom model architecture that is not yet part of the `transformers` package.
|
40 |
+
|
41 |
+
It includes options for many training efficiency features such as [FlashAttention (Dao et al. 2022)](https://arxiv.org/pdf/2205.14135.pdf), [ALiBi](https://arxiv.org/abs/2108.12409), QK LayerNorm, and more.
|
42 |
+
|
43 |
+
```python
|
44 |
+
import transformers
|
45 |
+
model = transformers.AutoModelForCausalLM.from_pretrained(
|
46 |
+
'mosaicml/mpt-7b-storywriter',
|
47 |
+
trust_remote_code=True
|
48 |
+
)
|
49 |
+
```
|
50 |
+
|
51 |
+
To use the optimized [triton implementation](https://github.com/openai/triton) of FlashAttention, you can load the model on GPU (`cuda:0`) with `attn_impl='triton'` and with `bfloat16` precision:
|
52 |
+
```python
|
53 |
+
import torch
|
54 |
+
import transformers
|
55 |
+
|
56 |
+
name = 'mosaicml/mpt-7b-storywriter'
|
57 |
+
|
58 |
+
config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
|
59 |
+
config.attn_config['attn_impl'] = 'triton'
|
60 |
+
config.init_device = 'cuda:0' # For fast initialization directly on GPU!
|
61 |
+
|
62 |
+
model = transformers.AutoModelForCausalLM.from_pretrained(
|
63 |
+
name,
|
64 |
+
config=config,
|
65 |
+
torch_dtype=torch.bfloat16, # Load model weights in bfloat16
|
66 |
+
trust_remote_code=True
|
67 |
+
)
|
68 |
+
```
|
69 |
+
|
70 |
+
Although the model was trained with a sequence length of 2048 and finetuned with a sequence length of 65536,
|
71 |
+
ALiBi enables users to increase the maximum sequence length during finetuning and/or inference. For example:
|
72 |
+
```python
|
73 |
+
import transformers
|
74 |
+
|
75 |
+
name = 'mosaicml/mpt-7b'
|
76 |
+
|
77 |
+
config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
|
78 |
+
config.max_seq_len = 83968 # (input + output) tokens can now be up to 83968
|
79 |
+
|
80 |
+
model = transformers.AutoModelForCausalLM.from_pretrained(
|
81 |
+
name,
|
82 |
+
config=config,
|
83 |
+
trust_remote_code=True
|
84 |
+
)
|
85 |
+
```
|
86 |
+
|
87 |
+
This model was trained with the [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) tokenizer.
|
88 |
+
|
89 |
+
```python
|
90 |
+
from transformers import AutoTokenizer
|
91 |
+
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
|
92 |
+
```
|
93 |
+
|
94 |
+
The model can then be used, for example, within a text-generation pipeline.
|
95 |
+
Note: when running Torch modules in lower precision, it is best practice to use the [torch.autocast context manager](https://pytorch.org/docs/stable/amp.html).
|
96 |
+
|
97 |
+
```python
|
98 |
+
from transformers import pipeline
|
99 |
+
|
100 |
+
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device='cuda:0')
|
101 |
+
|
102 |
+
with torch.autocast('cuda', dtype=torch.bfloat16):
|
103 |
+
print(
|
104 |
+
pipe('Here is a recipe for vegan banana bread:\n',
|
105 |
+
max_new_tokens=100,
|
106 |
+
do_sample=True,
|
107 |
+
use_cache=True))
|
108 |
+
```
|
109 |
+
|
110 |
+
## Community-Created Resources
|
111 |
+
|
112 |
+
These were not created by MosaicML, but you may find them useful. These links are not an endorsement of the creators or their content.
|
113 |
+
|
114 |
+
- [Oobabooga Running MPT-7B-Storywriter](https://youtu.be/QVVb6Md6huA)
|
115 |
+
- [NEW MPT-7B-StoryWriter CRUSHES GPT-4!](https://www.youtube.com/watch?v=O9Y_ZdsuKWQ&t=649s) - Has a long section on running locally using Oobabooga
|
116 |
+
|
117 |
+
## Example Epilogue
|
118 |
+
|
119 |
+
The full text of the _The Great Gatsby_ (67873 tokens) was fed to the model, followed by the text "EPILOGUE"
|
120 |
+
|
121 |
+
**TheRealFitzgerald233**:
|
122 |
+
|
123 |
+
> ...
|
124 |
+
>
|
125 |
+
> So we beat on, boats against the current, borne back ceaselessly into the past
|
126 |
+
>
|
127 |
+
> EPILOGUE
|
128 |
+
|
129 |
+
**MPT-7B-StoryWriter**:
|
130 |
+
|
131 |
+
>I have lived so long in the shadow of this dream that I may never see the bright sunlight of reality. I may look up some morning and find that I am six feet under, and that Gatsby, who gave me the dream and everything else, will have vanished behind me.
|
132 |
+
>
|
133 |
+
>And so it was. I had hardly crossed the threshold of the old world when my eye caught the headlines of a newspaper, and I knew at once that my dream was over. I knew that Gatsby was dead.
|
134 |
+
>
|
135 |
+
>And I knew that day that I didn't love her.
|
136 |
+
>
|
137 |
+
>That night I took the last train for West Egg, and when I reached home I went directly to my attic and brought down all Gatsby's letters.
|
138 |
+
>
|
139 |
+
>I have sorted them by date and placed them in the order in which they were written. They are very few in number.
|
140 |
+
>
|
141 |
+
>They are dated between April of nineteen eighteen and January of nineteen nineteen, and the last one I have here is the one that told me he was dead.
|
142 |
+
>
|
143 |
+
>There is something in that last letter that frightens me. It is the sense of a finality, of a completeness. I never heard of him after that, and I was glad.
|
144 |
+
>
|
145 |
+
>But a few months ago I read somewhere that he had been seen in New York. He had become a great man.
|
146 |
+
>
|
147 |
+
>And I knew that he had not changed at all.
|
148 |
+
|
149 |
+
## Model Description
|
150 |
+
|
151 |
+
The architecture is a modification of a standard decoder-only transformer.
|
152 |
+
|
153 |
+
The model has been modified from a standard transformer in the following ways:
|
154 |
+
* It uses [FlashAttention](https://arxiv.org/pdf/2205.14135.pdf)
|
155 |
+
* It uses [ALiBi (Attention with Linear Biases)](https://arxiv.org/abs/2108.12409) and does not use positional embeddings
|
156 |
+
* It does not use biases
|
157 |
+
|
158 |
+
|
159 |
+
| Hyperparameter | Value |
|
160 |
+
|----------------|-------|
|
161 |
+
|n_parameters | 6.7B |
|
162 |
+
|n_layers | 32 |
|
163 |
+
| n_heads | 32 |
|
164 |
+
| d_model | 4096 |
|
165 |
+
| vocab size | 50432 |
|
166 |
+
| sequence length | **65536** |
|
167 |
+
|
168 |
+
## PreTraining Data
|
169 |
+
|
170 |
+
For more details on the pretraining process, see [MPT-7B](https://huggingface.co/mosaicml/mpt-7b).
|
171 |
+
|
172 |
+
The data was tokenized using the [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) tokenizer.
|
173 |
+
|
174 |
+
### Training Configuration
|
175 |
+
|
176 |
+
This model was trained on 8 A100-80GBs for about 2 days using the [MosaicML Platform](https://www.mosaicml.com/platform).
|
177 |
+
The model was trained with sharded data parallelism using [FSDP](https://pytorch.org/docs/stable/fsdp.html) and used the [LION](https://arxiv.org/abs/2302.06675) optimizer.
|
178 |
+
|
179 |
+
## Limitations and Biases
|
180 |
+
|
181 |
+
_The following language is modified from [EleutherAI's GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b)_
|
182 |
+
|
183 |
+
MPT-7B-StoryWriter can produce factually incorrect output, and should not be relied on to produce factually accurate information.
|
184 |
+
MPT-7B-StoryWriter was trained on various public datasets.
|
185 |
+
While great efforts have been taken to clean the pretraining data, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
|
186 |
+
|
187 |
+
|
188 |
+
## Acknowledgements
|
189 |
+
|
190 |
+
This model was finetuned by Alex Trott and the MosaicML NLP team
|
191 |
+
|
192 |
+
## MosaicML Platform
|
193 |
+
|
194 |
+
If you're interested in [training](https://www.mosaicml.com/training) and [deploying](https://www.mosaicml.com/inference) your own MPT or LLMs on the MosaicML Platform, [sign up here](https://forms.mosaicml.com/demo?utm_source=huggingface&utm_medium=referral&utm_campaign=mpt-7b).
|
195 |
+
|
196 |
+
## Disclaimer
|
197 |
+
|
198 |
+
The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please cosult an attorney before using this model for commercial purposes.
|
199 |
+
|
200 |
+
|
201 |
+
## Citation
|
202 |
+
|
203 |
+
Please cite this model using the following format:
|
204 |
+
|
205 |
+
```
|
206 |
+
@online{MosaicML2023Introducing,
|
207 |
+
author = {MosaicML NLP Team},
|
208 |
+
title = {Introducing MPT-7B: A New Standard for Open-Source, Commercially Usable LLMs},
|
209 |
+
year = {2023},
|
210 |
+
url = {www.mosaicml.com/blog/mpt-7b},
|
211 |
+
note = {Accessed: 2023-03-28}, % change this date
|
212 |
+
urldate = {2023-03-28} % change this date
|
213 |
+
}
|
214 |
+
```
|
adapt_tokenizer.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Union
|
2 |
+
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
|
3 |
+
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
|
4 |
+
NUM_SENTINEL_TOKENS: int = 100
|
5 |
+
|
6 |
+
def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
|
7 |
+
"""Adds sentinel tokens and padding token (if missing).
|
8 |
+
|
9 |
+
Expands the tokenizer vocabulary to include sentinel tokens
|
10 |
+
used in mixture-of-denoiser tasks as well as a padding token.
|
11 |
+
|
12 |
+
All added tokens are added as special tokens. No tokens are
|
13 |
+
added if sentinel tokens and padding token already exist.
|
14 |
+
"""
|
15 |
+
sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
|
16 |
+
tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
|
17 |
+
if tokenizer.pad_token is None:
|
18 |
+
tokenizer.add_tokens('<pad>', special_tokens=True)
|
19 |
+
tokenizer.pad_token = '<pad>'
|
20 |
+
assert tokenizer.pad_token_id is not None
|
21 |
+
sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)])
|
22 |
+
_sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
|
23 |
+
tokenizer.sentinel_token_ids = _sentinel_token_ids
|
24 |
+
|
25 |
+
class AutoTokenizerForMOD(AutoTokenizer):
|
26 |
+
"""AutoTokenizer + Adaptation for MOD.
|
27 |
+
|
28 |
+
A simple wrapper around AutoTokenizer to make instantiating
|
29 |
+
an MOD-adapted tokenizer a bit easier.
|
30 |
+
|
31 |
+
MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
|
32 |
+
a padding token, and a property to get the token ids of the
|
33 |
+
sentinel tokens.
|
34 |
+
"""
|
35 |
+
|
36 |
+
@classmethod
|
37 |
+
def from_pretrained(cls, *args, **kwargs):
|
38 |
+
"""See `AutoTokenizer.from_pretrained` docstring."""
|
39 |
+
tokenizer = super().from_pretrained(*args, **kwargs)
|
40 |
+
adapt_tokenizer_for_denoising(tokenizer)
|
41 |
+
return tokenizer
|
attention.py
ADDED
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Attention layers."""
|
2 |
+
import math
|
3 |
+
import warnings
|
4 |
+
from typing import Optional
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
from einops import rearrange
|
8 |
+
from packaging import version
|
9 |
+
from torch import nn
|
10 |
+
from .norm import LPLayerNorm
|
11 |
+
|
12 |
+
def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
|
13 |
+
if original_is_causal and num_query_tokens != num_key_tokens:
|
14 |
+
if num_query_tokens != 1:
|
15 |
+
raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
|
16 |
+
else:
|
17 |
+
return False
|
18 |
+
return original_is_causal
|
19 |
+
|
20 |
+
def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_value=None, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
|
21 |
+
q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
|
22 |
+
kv_n_heads = 1 if multiquery else n_heads
|
23 |
+
k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
|
24 |
+
v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
|
25 |
+
if past_key_value is not None:
|
26 |
+
if len(past_key_value) != 0:
|
27 |
+
k = torch.cat([past_key_value[0], k], dim=3)
|
28 |
+
v = torch.cat([past_key_value[1], v], dim=2)
|
29 |
+
past_key_value = (k, v)
|
30 |
+
(b, _, s_q, d) = q.shape
|
31 |
+
s_k = k.size(-1)
|
32 |
+
if softmax_scale is None:
|
33 |
+
softmax_scale = 1 / math.sqrt(d)
|
34 |
+
attn_weight = q.matmul(k) * softmax_scale
|
35 |
+
if attn_bias is not None:
|
36 |
+
_s_q = max(0, attn_bias.size(2) - s_q)
|
37 |
+
_s_k = max(0, attn_bias.size(3) - s_k)
|
38 |
+
attn_bias = attn_bias[:, :, _s_q:, _s_k:]
|
39 |
+
if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
|
40 |
+
raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
|
41 |
+
attn_weight = attn_weight + attn_bias
|
42 |
+
min_val = torch.finfo(q.dtype).min
|
43 |
+
if key_padding_mask is not None:
|
44 |
+
if attn_bias is not None:
|
45 |
+
warnings.warn('Propogating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
|
46 |
+
attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
|
47 |
+
if is_causal and (not q.size(2) == 1):
|
48 |
+
s = max(s_q, s_k)
|
49 |
+
causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
|
50 |
+
causal_mask = causal_mask.tril()
|
51 |
+
causal_mask = causal_mask.to(torch.bool)
|
52 |
+
causal_mask = ~causal_mask
|
53 |
+
causal_mask = causal_mask[-s_q:, -s_k:]
|
54 |
+
attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
|
55 |
+
attn_weight = torch.softmax(attn_weight, dim=-1)
|
56 |
+
if dropout_p:
|
57 |
+
attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
|
58 |
+
out = attn_weight.to(v.dtype).matmul(v)
|
59 |
+
out = rearrange(out, 'b h s d -> b s (h d)')
|
60 |
+
if needs_weights:
|
61 |
+
return (out, attn_weight, past_key_value)
|
62 |
+
return (out, None, past_key_value)
|
63 |
+
|
64 |
+
def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
|
65 |
+
for tensor in tensors:
|
66 |
+
if tensor.dtype not in valid_dtypes:
|
67 |
+
raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.')
|
68 |
+
if not tensor.is_cuda:
|
69 |
+
raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')
|
70 |
+
|
71 |
+
def flash_attn_fn(query, key, value, n_heads, past_key_value=None, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
|
72 |
+
try:
|
73 |
+
from flash_attn import bert_padding, flash_attn_interface
|
74 |
+
except:
|
75 |
+
raise RuntimeError('Please install flash-attn==1.0.3.post0')
|
76 |
+
check_valid_inputs(query, key, value)
|
77 |
+
if past_key_value is not None:
|
78 |
+
if len(past_key_value) != 0:
|
79 |
+
key = torch.cat([past_key_value[0], key], dim=1)
|
80 |
+
value = torch.cat([past_key_value[1], value], dim=1)
|
81 |
+
past_key_value = (key, value)
|
82 |
+
if attn_bias is not None:
|
83 |
+
_s_q = max(0, attn_bias.size(2) - query.size(1))
|
84 |
+
_s_k = max(0, attn_bias.size(3) - key.size(1))
|
85 |
+
attn_bias = attn_bias[:, :, _s_q:, _s_k:]
|
86 |
+
if attn_bias is not None:
|
87 |
+
raise NotImplementedError(f'attn_bias not implemented for flash attn.')
|
88 |
+
(batch_size, seqlen) = query.shape[:2]
|
89 |
+
if key_padding_mask is None:
|
90 |
+
key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
|
91 |
+
query_padding_mask = key_padding_mask[:, -query.size(1):]
|
92 |
+
(query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask)
|
93 |
+
query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
|
94 |
+
(key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask)
|
95 |
+
key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
|
96 |
+
(value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
|
97 |
+
value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
|
98 |
+
if multiquery:
|
99 |
+
key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
|
100 |
+
value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
|
101 |
+
dropout_p = dropout_p if training else 0.0
|
102 |
+
reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
|
103 |
+
output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
|
104 |
+
output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
|
105 |
+
return (output, None, past_key_value)
|
106 |
+
|
107 |
+
def triton_flash_attn_fn(query, key, value, n_heads, past_key_value=None, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
|
108 |
+
try:
|
109 |
+
from .flash_attn_triton import flash_attn_func
|
110 |
+
except:
|
111 |
+
_installed = False
|
112 |
+
if version.parse(torch.__version__) < version.parse('2.0.0'):
|
113 |
+
_installed = True
|
114 |
+
try:
|
115 |
+
from flash_attn.flash_attn_triton import flash_attn_func
|
116 |
+
except:
|
117 |
+
_installed = False
|
118 |
+
if not _installed:
|
119 |
+
raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed.')
|
120 |
+
check_valid_inputs(query, key, value)
|
121 |
+
if past_key_value is not None:
|
122 |
+
if len(past_key_value) != 0:
|
123 |
+
key = torch.cat([past_key_value[0], key], dim=1)
|
124 |
+
value = torch.cat([past_key_value[1], value], dim=1)
|
125 |
+
past_key_value = (key, value)
|
126 |
+
if attn_bias is not None:
|
127 |
+
_s_q = max(0, attn_bias.size(2) - query.size(1))
|
128 |
+
_s_k = max(0, attn_bias.size(3) - key.size(1))
|
129 |
+
attn_bias = attn_bias[:, :, _s_q:, _s_k:]
|
130 |
+
if dropout_p:
|
131 |
+
raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
|
132 |
+
if needs_weights:
|
133 |
+
raise NotImplementedError(f'attn_impl: triton cannot return attn weights.')
|
134 |
+
if key_padding_mask is not None:
|
135 |
+
warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
|
136 |
+
(b_size, s_k) = key_padding_mask.shape[:2]
|
137 |
+
if attn_bias is None:
|
138 |
+
attn_bias = query.new_zeros(b_size, 1, 1, s_k)
|
139 |
+
attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
|
140 |
+
query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
|
141 |
+
key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
|
142 |
+
value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
|
143 |
+
if multiquery:
|
144 |
+
key = key.expand(*key.shape[:2], n_heads, key.size(-1))
|
145 |
+
value = value.expand(*value.shape[:2], n_heads, value.size(-1))
|
146 |
+
reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
|
147 |
+
attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
|
148 |
+
output = attn_output.view(*attn_output.shape[:2], -1)
|
149 |
+
return (output, None, past_key_value)
|
150 |
+
|
151 |
+
class MultiheadAttention(nn.Module):
|
152 |
+
"""Multi-head self attention.
|
153 |
+
|
154 |
+
Using torch or triton attention implemetation enables user to also use
|
155 |
+
additive bias.
|
156 |
+
"""
|
157 |
+
|
158 |
+
def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
|
159 |
+
super().__init__()
|
160 |
+
self.attn_impl = attn_impl
|
161 |
+
self.clip_qkv = clip_qkv
|
162 |
+
self.qk_ln = qk_ln
|
163 |
+
self.d_model = d_model
|
164 |
+
self.n_heads = n_heads
|
165 |
+
self.softmax_scale = softmax_scale
|
166 |
+
if self.softmax_scale is None:
|
167 |
+
self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
|
168 |
+
self.attn_dropout_p = attn_pdrop
|
169 |
+
self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
|
170 |
+
fuse_splits = (d_model, 2 * d_model)
|
171 |
+
self.Wqkv._fused = (0, fuse_splits)
|
172 |
+
if self.qk_ln:
|
173 |
+
layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
|
174 |
+
self.q_ln = layernorm_class(self.d_model, device=device)
|
175 |
+
self.k_ln = layernorm_class(self.d_model, device=device)
|
176 |
+
if self.attn_impl == 'flash':
|
177 |
+
self.attn_fn = flash_attn_fn
|
178 |
+
elif self.attn_impl == 'triton':
|
179 |
+
self.attn_fn = triton_flash_attn_fn
|
180 |
+
if verbose:
|
181 |
+
warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
|
182 |
+
elif self.attn_impl == 'torch':
|
183 |
+
self.attn_fn = scaled_multihead_dot_product_attention
|
184 |
+
if torch.cuda.is_available() and verbose:
|
185 |
+
warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
|
186 |
+
else:
|
187 |
+
raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
|
188 |
+
self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
|
189 |
+
self.out_proj._is_residual = True
|
190 |
+
|
191 |
+
def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
|
192 |
+
qkv = self.Wqkv(x)
|
193 |
+
if self.clip_qkv:
|
194 |
+
qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
|
195 |
+
(query, key, value) = qkv.chunk(3, dim=2)
|
196 |
+
key_padding_mask = attention_mask
|
197 |
+
if self.qk_ln:
|
198 |
+
dtype = query.dtype
|
199 |
+
query = self.q_ln(query).to(dtype)
|
200 |
+
key = self.k_ln(key).to(dtype)
|
201 |
+
(context, attn_weights, past_key_value) = self.attn_fn(query, key, value, self.n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights)
|
202 |
+
return (self.out_proj(context), attn_weights, past_key_value)
|
203 |
+
|
204 |
+
class MultiQueryAttention(nn.Module):
|
205 |
+
"""Multi-Query self attention.
|
206 |
+
|
207 |
+
Using torch or triton attention implemetation enables user to also use
|
208 |
+
additive bias.
|
209 |
+
"""
|
210 |
+
|
211 |
+
def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
|
212 |
+
super().__init__()
|
213 |
+
self.attn_impl = attn_impl
|
214 |
+
self.clip_qkv = clip_qkv
|
215 |
+
self.qk_ln = qk_ln
|
216 |
+
self.d_model = d_model
|
217 |
+
self.n_heads = n_heads
|
218 |
+
self.head_dim = d_model // n_heads
|
219 |
+
self.softmax_scale = softmax_scale
|
220 |
+
if self.softmax_scale is None:
|
221 |
+
self.softmax_scale = 1 / math.sqrt(self.head_dim)
|
222 |
+
self.attn_dropout_p = attn_pdrop
|
223 |
+
self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
|
224 |
+
fuse_splits = (d_model, d_model + self.head_dim)
|
225 |
+
self.Wqkv._fused = (0, fuse_splits)
|
226 |
+
if self.qk_ln:
|
227 |
+
layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
|
228 |
+
self.q_ln = layernorm_class(d_model, device=device)
|
229 |
+
self.k_ln = layernorm_class(self.head_dim, device=device)
|
230 |
+
if self.attn_impl == 'flash':
|
231 |
+
self.attn_fn = flash_attn_fn
|
232 |
+
elif self.attn_impl == 'triton':
|
233 |
+
self.attn_fn = triton_flash_attn_fn
|
234 |
+
if verbose:
|
235 |
+
warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
|
236 |
+
elif self.attn_impl == 'torch':
|
237 |
+
self.attn_fn = scaled_multihead_dot_product_attention
|
238 |
+
if torch.cuda.is_available() and verbose:
|
239 |
+
warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
|
240 |
+
else:
|
241 |
+
raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
|
242 |
+
self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
|
243 |
+
self.out_proj._is_residual = True
|
244 |
+
|
245 |
+
def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
|
246 |
+
qkv = self.Wqkv(x)
|
247 |
+
if self.clip_qkv:
|
248 |
+
qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
|
249 |
+
(query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
|
250 |
+
key_padding_mask = attention_mask
|
251 |
+
if self.qk_ln:
|
252 |
+
dtype = query.dtype
|
253 |
+
query = self.q_ln(query).to(dtype)
|
254 |
+
key = self.k_ln(key).to(dtype)
|
255 |
+
(context, attn_weights, past_key_value) = self.attn_fn(query, key, value, self.n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True)
|
256 |
+
return (self.out_proj(context), attn_weights, past_key_value)
|
257 |
+
|
258 |
+
def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
|
259 |
+
if attn_impl == 'flash':
|
260 |
+
return None
|
261 |
+
elif attn_impl in ['torch', 'triton']:
|
262 |
+
if alibi:
|
263 |
+
if (prefix_lm or not causal) or use_sequence_id:
|
264 |
+
return (1, n_heads, seq_len, seq_len)
|
265 |
+
return (1, n_heads, 1, seq_len)
|
266 |
+
elif prefix_lm or use_sequence_id:
|
267 |
+
return (1, 1, seq_len, seq_len)
|
268 |
+
return None
|
269 |
+
else:
|
270 |
+
raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
|
271 |
+
|
272 |
+
def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8):
|
273 |
+
if attn_impl == 'flash':
|
274 |
+
return None
|
275 |
+
elif attn_impl in ['torch', 'triton']:
|
276 |
+
if alibi:
|
277 |
+
(device, dtype) = (attn_bias.device, attn_bias.dtype)
|
278 |
+
attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
|
279 |
+
return attn_bias
|
280 |
+
else:
|
281 |
+
raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
|
282 |
+
|
283 |
+
def gen_slopes(n_heads, alibi_bias_max=8, device=None):
|
284 |
+
_n_heads = 2 ** math.ceil(math.log2(n_heads))
|
285 |
+
m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
|
286 |
+
m = m.mul(alibi_bias_max / _n_heads)
|
287 |
+
slopes = 1.0 / torch.pow(2, m)
|
288 |
+
if _n_heads != n_heads:
|
289 |
+
slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
|
290 |
+
return slopes.view(1, n_heads, 1, 1)
|
291 |
+
|
292 |
+
def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None):
|
293 |
+
alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len)
|
294 |
+
if full:
|
295 |
+
alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1)
|
296 |
+
alibi_bias = alibi_bias.abs().mul(-1)
|
297 |
+
slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
|
298 |
+
alibi_bias = alibi_bias * slopes
|
299 |
+
return alibi_bias.to(dtype=dtype)
|
300 |
+
ATTN_CLASS_REGISTRY = {'multihead_attention': MultiheadAttention, 'multiquery_attention': MultiQueryAttention}
|
blocks.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""GPT Blocks used for the GPT Model."""
|
2 |
+
from typing import Dict, Optional, Tuple
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
from .attention import ATTN_CLASS_REGISTRY
|
6 |
+
from .norm import NORM_CLASS_REGISTRY
|
7 |
+
|
8 |
+
class MPTMLP(nn.Module):
|
9 |
+
|
10 |
+
def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
|
11 |
+
super().__init__()
|
12 |
+
self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
|
13 |
+
self.act = nn.GELU(approximate='none')
|
14 |
+
self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
|
15 |
+
self.down_proj._is_residual = True
|
16 |
+
|
17 |
+
def forward(self, x):
|
18 |
+
return self.down_proj(self.act(self.up_proj(x)))
|
19 |
+
|
20 |
+
class MPTBlock(nn.Module):
|
21 |
+
|
22 |
+
def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs):
|
23 |
+
del kwargs
|
24 |
+
super().__init__()
|
25 |
+
norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
|
26 |
+
attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
|
27 |
+
self.norm_1 = norm_class(d_model, device=device)
|
28 |
+
self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, verbose=verbose, device=device)
|
29 |
+
self.norm_2 = norm_class(d_model, device=device)
|
30 |
+
self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
|
31 |
+
self.resid_attn_dropout = nn.Dropout(resid_pdrop)
|
32 |
+
self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
|
33 |
+
|
34 |
+
def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
|
35 |
+
a = self.norm_1(x)
|
36 |
+
(b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
|
37 |
+
x = x + self.resid_attn_dropout(b)
|
38 |
+
m = self.norm_2(x)
|
39 |
+
n = self.ffn(m)
|
40 |
+
x = x + self.resid_ffn_dropout(n)
|
41 |
+
return (x, attn_weights, past_key_value)
|
config.json
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"MPTForCausalLM"
|
4 |
+
],
|
5 |
+
"attn_config": {
|
6 |
+
"alibi": true,
|
7 |
+
"alibi_bias_max": 16,
|
8 |
+
"attn_impl": "torch",
|
9 |
+
"attn_pdrop": 0,
|
10 |
+
"attn_type": "multihead_attention",
|
11 |
+
"attn_uses_sequence_id": false,
|
12 |
+
"clip_qkv": 6,
|
13 |
+
"prefix_lm": false,
|
14 |
+
"qk_ln": false,
|
15 |
+
"softmax_scale": null
|
16 |
+
},
|
17 |
+
"auto_map": {
|
18 |
+
"AutoConfig": "configuration_mpt.MPTConfig",
|
19 |
+
"AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
|
20 |
+
},
|
21 |
+
"d_model": 4096,
|
22 |
+
"emb_pdrop": 0,
|
23 |
+
"embedding_fraction": 1.0,
|
24 |
+
"expansion_ratio": 4,
|
25 |
+
"init_config": {
|
26 |
+
"emb_init_std": null,
|
27 |
+
"emb_init_uniform_lim": null,
|
28 |
+
"fan_mode": "fan_in",
|
29 |
+
"init_div_is_residual": true,
|
30 |
+
"init_gain": 0,
|
31 |
+
"init_nonlinearity": "relu",
|
32 |
+
"init_std": 0.02,
|
33 |
+
"name": "kaiming_normal_",
|
34 |
+
"verbose": 0
|
35 |
+
},
|
36 |
+
"init_device": "cpu",
|
37 |
+
"learned_pos_emb": true,
|
38 |
+
"logit_scale": null,
|
39 |
+
"max_seq_len": 65536,
|
40 |
+
"model_type": "mpt",
|
41 |
+
"n_heads": 32,
|
42 |
+
"n_layers": 32,
|
43 |
+
"no_bias": true,
|
44 |
+
"norm_type": "low_precision_layernorm",
|
45 |
+
"resid_pdrop": 0,
|
46 |
+
"tokenizer_name": "EleutherAI/gpt-neox-20b",
|
47 |
+
"torch_dtype": "bfloat16",
|
48 |
+
"transformers_version": "4.28.1",
|
49 |
+
"use_cache": false,
|
50 |
+
"verbose": 0,
|
51 |
+
"vocab_size": 50432
|
52 |
+
}
|
configuration_mpt.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""A HuggingFace-style model configuration."""
|
2 |
+
from typing import Dict, Optional, Union
|
3 |
+
from transformers import PretrainedConfig
|
4 |
+
attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
|
5 |
+
init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
|
6 |
+
|
7 |
+
class MPTConfig(PretrainedConfig):
|
8 |
+
model_type = 'mpt'
|
9 |
+
|
10 |
+
def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
|
11 |
+
"""The MPT configuration class.
|
12 |
+
|
13 |
+
Args:
|
14 |
+
d_model (int): The size of the embedding dimension of the model.
|
15 |
+
n_heads (int): The number of attention heads.
|
16 |
+
n_layers (int): The number of layers in the model.
|
17 |
+
expansion_ratio (int): The ratio of the up/down scale in the MLP.
|
18 |
+
max_seq_len (int): The maximum sequence length of the model.
|
19 |
+
vocab_size (int): The size of the vocabulary.
|
20 |
+
resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
|
21 |
+
emb_pdrop (float): The dropout probability for the embedding layer.
|
22 |
+
learned_pos_emb (bool): Whether to use learned positional embeddings
|
23 |
+
attn_config (Dict): A dictionary used to configure the model's attention module:
|
24 |
+
attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
|
25 |
+
attn_pdrop (float): The dropout probability for the attention layers.
|
26 |
+
attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
|
27 |
+
qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
|
28 |
+
clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
|
29 |
+
this value.
|
30 |
+
softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
|
31 |
+
use the default scale of ``1/sqrt(d_keys)``.
|
32 |
+
prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
|
33 |
+
extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
|
34 |
+
can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
|
35 |
+
attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
|
36 |
+
When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
|
37 |
+
which sub-sequence each token belongs to.
|
38 |
+
Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
|
39 |
+
alibi (bool): Whether to use the alibi bias instead of position embeddings.
|
40 |
+
alibi_bias_max (int): The maximum value of the alibi bias.
|
41 |
+
init_device (str): The device to use for parameter initialization.
|
42 |
+
logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
|
43 |
+
no_bias (bool): Whether to use bias in all layers.
|
44 |
+
verbose (int): The verbosity level. 0 is silent.
|
45 |
+
embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
|
46 |
+
norm_type (str): choose type of norm to use
|
47 |
+
multiquery_attention (bool): Whether to use multiquery attention implementation.
|
48 |
+
use_cache (bool): Whether or not the model should return the last key/values attentions
|
49 |
+
init_config (Dict): A dictionary used to configure the model initialization:
|
50 |
+
init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
|
51 |
+
'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
|
52 |
+
'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
|
53 |
+
init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
|
54 |
+
emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
|
55 |
+
emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
|
56 |
+
used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
|
57 |
+
init_std (float): The standard deviation of the normal distribution used to initialize the model,
|
58 |
+
if using the baseline_ parameter initialization scheme.
|
59 |
+
init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
|
60 |
+
fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
|
61 |
+
init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
|
62 |
+
---
|
63 |
+
See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
|
64 |
+
"""
|
65 |
+
self.d_model = d_model
|
66 |
+
self.n_heads = n_heads
|
67 |
+
self.n_layers = n_layers
|
68 |
+
self.expansion_ratio = expansion_ratio
|
69 |
+
self.max_seq_len = max_seq_len
|
70 |
+
self.vocab_size = vocab_size
|
71 |
+
self.resid_pdrop = resid_pdrop
|
72 |
+
self.emb_pdrop = emb_pdrop
|
73 |
+
self.learned_pos_emb = learned_pos_emb
|
74 |
+
self.attn_config = attn_config
|
75 |
+
self.init_device = init_device
|
76 |
+
self.logit_scale = logit_scale
|
77 |
+
self.no_bias = no_bias
|
78 |
+
self.verbose = verbose
|
79 |
+
self.embedding_fraction = embedding_fraction
|
80 |
+
self.norm_type = norm_type
|
81 |
+
self.use_cache = use_cache
|
82 |
+
self.init_config = init_config
|
83 |
+
if 'name' in kwargs:
|
84 |
+
del kwargs['name']
|
85 |
+
if 'loss_fn' in kwargs:
|
86 |
+
del kwargs['loss_fn']
|
87 |
+
super().__init__(**kwargs)
|
88 |
+
self._validate_config()
|
89 |
+
|
90 |
+
def _set_config_defaults(self, config, config_defaults):
|
91 |
+
for (k, v) in config_defaults.items():
|
92 |
+
if k not in config:
|
93 |
+
config[k] = v
|
94 |
+
return config
|
95 |
+
|
96 |
+
def _validate_config(self):
|
97 |
+
self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
|
98 |
+
self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
|
99 |
+
if self.d_model % self.n_heads != 0:
|
100 |
+
raise ValueError('d_model must be divisible by n_heads')
|
101 |
+
if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
|
102 |
+
raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
|
103 |
+
if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
|
104 |
+
raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
|
105 |
+
if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
|
106 |
+
raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
|
107 |
+
if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
|
108 |
+
raise NotImplementedError('alibi only implemented with torch and triton attention.')
|
109 |
+
if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
|
110 |
+
raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
|
111 |
+
if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
|
112 |
+
raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
|
113 |
+
if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
|
114 |
+
raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
|
115 |
+
if self.init_config.get('name', None) is None:
|
116 |
+
raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
|
117 |
+
if not self.learned_pos_emb and (not self.attn_config['alibi']):
|
118 |
+
raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')
|
ct_output_models/config.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<|endoftext|>",
|
3 |
+
"eos_token": "<|endoftext|>",
|
4 |
+
"layer_norm_epsilon": null,
|
5 |
+
"unk_token": "<|endoftext|>"
|
6 |
+
}
|
ct_output_models/model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:166239970113aa2349a2f3b6b4f2f3d60d56e603c53eaf31cc8276c64a6339e6
|
3 |
+
size 6655046576
|
ct_output_models/vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
custom_embedding.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from torch import Tensor
|
5 |
+
|
6 |
+
class SharedEmbedding(nn.Embedding):
|
7 |
+
|
8 |
+
def forward(self, input: Tensor, unembed: bool=False) -> Tensor:
|
9 |
+
if unembed:
|
10 |
+
return F.linear(input, self.weight)
|
11 |
+
return super().forward(input)
|
flash_attn_triton.py
ADDED
@@ -0,0 +1,484 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Copied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py
|
3 |
+
update imports to use 'triton_pre_mlir'
|
4 |
+
|
5 |
+
*Experimental* implementation of FlashAttention in Triton.
|
6 |
+
Tested with triton==2.0.0.dev20221202.
|
7 |
+
Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions
|
8 |
+
other than 64:
|
9 |
+
https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
|
10 |
+
We'll update this implementation with the new Triton backend once this is fixed.
|
11 |
+
|
12 |
+
We use the FlashAttention implementation from Phil Tillet a starting point.
|
13 |
+
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
|
14 |
+
|
15 |
+
Changes:
|
16 |
+
- Implement both causal and non-causal attention.
|
17 |
+
- Implement both self-attention and cross-attention.
|
18 |
+
- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
|
19 |
+
- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
|
20 |
+
- Support attention bias.
|
21 |
+
- Speed up the forward pass a bit, and only store the LSE instead of m and l.
|
22 |
+
- Make the backward for d=128 much faster by reducing register spilling.
|
23 |
+
- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
|
24 |
+
small batch size * nheads.
|
25 |
+
|
26 |
+
Caution:
|
27 |
+
- This is an *experimental* implementation. The forward pass should be quite robust but
|
28 |
+
I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
|
29 |
+
- This implementation has only been tested on A100.
|
30 |
+
- If you plan to use headdim other than 64 and 128, you should test for race conditions
|
31 |
+
(due to the Triton compiler), as done in tests/test_flash_attn.py
|
32 |
+
"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
|
33 |
+
for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
|
34 |
+
that there are none left for other head dimensions.
|
35 |
+
|
36 |
+
Differences between this Triton version and the CUDA version:
|
37 |
+
- Triton version doesn't support dropout.
|
38 |
+
- Triton forward is generally faster than CUDA forward, while Triton backward is
|
39 |
+
generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
|
40 |
+
than CUDA forward + backward.
|
41 |
+
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
|
42 |
+
- Triton version supports attention bias, while CUDA version doesn't.
|
43 |
+
"""
|
44 |
+
import math
|
45 |
+
import torch
|
46 |
+
import triton_pre_mlir as triton
|
47 |
+
import triton_pre_mlir.language as tl
|
48 |
+
|
49 |
+
@triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
|
50 |
+
@triton.jit
|
51 |
+
def _fwd_kernel(Q, K, V, Bias, Out, Lse, TMP, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_ob, stride_oh, stride_om, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
|
52 |
+
start_m = tl.program_id(0)
|
53 |
+
off_hb = tl.program_id(1)
|
54 |
+
off_b = off_hb // nheads
|
55 |
+
off_h = off_hb % nheads
|
56 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
57 |
+
offs_n = tl.arange(0, BLOCK_N)
|
58 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
59 |
+
q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
|
60 |
+
k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
|
61 |
+
v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
|
62 |
+
if BIAS_TYPE == 'vector':
|
63 |
+
b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
|
64 |
+
elif BIAS_TYPE == 'matrix':
|
65 |
+
b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :])
|
66 |
+
t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
|
67 |
+
lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
|
68 |
+
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
|
69 |
+
acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
|
70 |
+
if EVEN_M & EVEN_N:
|
71 |
+
if EVEN_HEADDIM:
|
72 |
+
q = tl.load(q_ptrs)
|
73 |
+
else:
|
74 |
+
q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
75 |
+
elif EVEN_HEADDIM:
|
76 |
+
q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
|
77 |
+
else:
|
78 |
+
q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
|
79 |
+
end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
|
80 |
+
for start_n in range(0, end_n, BLOCK_N):
|
81 |
+
start_n = tl.multiple_of(start_n, BLOCK_N)
|
82 |
+
if EVEN_N & EVEN_M:
|
83 |
+
if EVEN_HEADDIM:
|
84 |
+
k = tl.load(k_ptrs + start_n * stride_kn)
|
85 |
+
else:
|
86 |
+
k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
|
87 |
+
elif EVEN_HEADDIM:
|
88 |
+
k = tl.load(k_ptrs + start_n * stride_kn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
|
89 |
+
else:
|
90 |
+
k = tl.load(k_ptrs + start_n * stride_kn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
|
91 |
+
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
|
92 |
+
qk += tl.dot(q, k, trans_b=True)
|
93 |
+
if not EVEN_N:
|
94 |
+
qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float('-inf'))
|
95 |
+
if IS_CAUSAL:
|
96 |
+
qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float('-inf'))
|
97 |
+
if BIAS_TYPE != 'none':
|
98 |
+
if BIAS_TYPE == 'vector':
|
99 |
+
if EVEN_N:
|
100 |
+
bias = tl.load(b_ptrs + start_n).to(tl.float32)
|
101 |
+
else:
|
102 |
+
bias = tl.load(b_ptrs + start_n, mask=start_n + offs_n < seqlen_k, other=0.0).to(tl.float32)
|
103 |
+
bias = bias[None, :]
|
104 |
+
elif BIAS_TYPE == 'matrix':
|
105 |
+
if EVEN_M & EVEN_N:
|
106 |
+
bias = tl.load(b_ptrs + start_n).to(tl.float32)
|
107 |
+
else:
|
108 |
+
bias = tl.load(b_ptrs + start_n, mask=(offs_m[:, None] < seqlen_q) & ((start_n + offs_n)[None, :] < seqlen_k), other=0.0).to(tl.float32)
|
109 |
+
qk = qk * softmax_scale + bias
|
110 |
+
m_ij = tl.maximum(tl.max(qk, 1), lse_i)
|
111 |
+
p = tl.exp(qk - m_ij[:, None])
|
112 |
+
else:
|
113 |
+
m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
|
114 |
+
p = tl.exp(qk * softmax_scale - m_ij[:, None])
|
115 |
+
l_ij = tl.sum(p, 1)
|
116 |
+
acc_o_scale = tl.exp(m_i - m_ij)
|
117 |
+
tl.store(t_ptrs, acc_o_scale)
|
118 |
+
acc_o_scale = tl.load(t_ptrs)
|
119 |
+
acc_o = acc_o * acc_o_scale[:, None]
|
120 |
+
if EVEN_N & EVEN_M:
|
121 |
+
if EVEN_HEADDIM:
|
122 |
+
v = tl.load(v_ptrs + start_n * stride_vn)
|
123 |
+
else:
|
124 |
+
v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
|
125 |
+
elif EVEN_HEADDIM:
|
126 |
+
v = tl.load(v_ptrs + start_n * stride_vn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
|
127 |
+
else:
|
128 |
+
v = tl.load(v_ptrs + start_n * stride_vn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
|
129 |
+
p = p.to(v.dtype)
|
130 |
+
acc_o += tl.dot(p, v)
|
131 |
+
m_i = m_ij
|
132 |
+
l_i_new = tl.exp(lse_i - m_ij) + l_ij
|
133 |
+
lse_i = m_ij + tl.log(l_i_new)
|
134 |
+
o_scale = tl.exp(m_i - lse_i)
|
135 |
+
tl.store(t_ptrs, o_scale)
|
136 |
+
o_scale = tl.load(t_ptrs)
|
137 |
+
acc_o = acc_o * o_scale[:, None]
|
138 |
+
start_m = tl.program_id(0)
|
139 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
140 |
+
lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
|
141 |
+
tl.store(lse_ptrs, lse_i)
|
142 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
143 |
+
out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (offs_m[:, None] * stride_om + offs_d[None, :])
|
144 |
+
if EVEN_M:
|
145 |
+
if EVEN_HEADDIM:
|
146 |
+
tl.store(out_ptrs, acc_o)
|
147 |
+
else:
|
148 |
+
tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
|
149 |
+
elif EVEN_HEADDIM:
|
150 |
+
tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
|
151 |
+
else:
|
152 |
+
tl.store(out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
|
153 |
+
|
154 |
+
@triton.jit
|
155 |
+
def _bwd_preprocess_do_o_dot(Out, DO, Delta, stride_ob, stride_oh, stride_om, stride_dob, stride_doh, stride_dom, nheads, seqlen_q, seqlen_q_rounded, headdim, BLOCK_M: tl.constexpr, BLOCK_HEADDIM: tl.constexpr):
|
156 |
+
start_m = tl.program_id(0)
|
157 |
+
off_hb = tl.program_id(1)
|
158 |
+
off_b = off_hb // nheads
|
159 |
+
off_h = off_hb % nheads
|
160 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
161 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
162 |
+
o = tl.load(Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
|
163 |
+
do = tl.load(DO + off_b * stride_dob + off_h * stride_doh + offs_m[:, None] * stride_dom + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
|
164 |
+
delta = tl.sum(o * do, axis=1)
|
165 |
+
tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
|
166 |
+
|
167 |
+
@triton.jit
|
168 |
+
def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr):
|
169 |
+
if EVEN_N & EVEN_M:
|
170 |
+
if EVEN_HEADDIM:
|
171 |
+
tl.store(dv_ptrs, dv)
|
172 |
+
tl.store(dk_ptrs, dk)
|
173 |
+
else:
|
174 |
+
tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
|
175 |
+
tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
|
176 |
+
elif EVEN_HEADDIM:
|
177 |
+
tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
|
178 |
+
tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
|
179 |
+
else:
|
180 |
+
tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
|
181 |
+
tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
|
182 |
+
|
183 |
+
@triton.jit
|
184 |
+
def _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD: tl.constexpr, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
|
185 |
+
begin_m = 0 if not IS_CAUSAL else start_n * BLOCK_N // BLOCK_M * BLOCK_M
|
186 |
+
offs_qm = begin_m + tl.arange(0, BLOCK_M)
|
187 |
+
offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
|
188 |
+
offs_m = tl.arange(0, BLOCK_M)
|
189 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
190 |
+
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
|
191 |
+
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
|
192 |
+
v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
|
193 |
+
do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
|
194 |
+
dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
|
195 |
+
if BIAS_TYPE == 'vector':
|
196 |
+
b_ptrs = Bias + offs_n
|
197 |
+
elif BIAS_TYPE == 'matrix':
|
198 |
+
b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
|
199 |
+
dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
|
200 |
+
dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
|
201 |
+
if begin_m >= seqlen_q:
|
202 |
+
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
|
203 |
+
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
|
204 |
+
_bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
|
205 |
+
return
|
206 |
+
if EVEN_N & EVEN_M:
|
207 |
+
if EVEN_HEADDIM:
|
208 |
+
k = tl.load(k_ptrs)
|
209 |
+
v = tl.load(v_ptrs)
|
210 |
+
else:
|
211 |
+
k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
212 |
+
v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
213 |
+
elif EVEN_HEADDIM:
|
214 |
+
k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
|
215 |
+
v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
|
216 |
+
else:
|
217 |
+
k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
|
218 |
+
v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
|
219 |
+
num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
|
220 |
+
for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
|
221 |
+
start_m = tl.multiple_of(start_m, BLOCK_M)
|
222 |
+
offs_m_curr = start_m + offs_m
|
223 |
+
if EVEN_M & EVEN_HEADDIM:
|
224 |
+
q = tl.load(q_ptrs)
|
225 |
+
elif EVEN_HEADDIM:
|
226 |
+
q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
|
227 |
+
else:
|
228 |
+
q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
|
229 |
+
qk = tl.dot(q, k, trans_b=True)
|
230 |
+
if not EVEN_N:
|
231 |
+
qk = tl.where(offs_n[None, :] < seqlen_k, qk, float('-inf'))
|
232 |
+
if IS_CAUSAL:
|
233 |
+
qk = tl.where(offs_m_curr[:, None] >= offs_n[None, :], qk, float('-inf'))
|
234 |
+
if BIAS_TYPE != 'none':
|
235 |
+
tl.debug_barrier()
|
236 |
+
if BIAS_TYPE == 'vector':
|
237 |
+
if EVEN_N:
|
238 |
+
bias = tl.load(b_ptrs).to(tl.float32)
|
239 |
+
else:
|
240 |
+
bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
|
241 |
+
bias = bias[None, :]
|
242 |
+
elif BIAS_TYPE == 'matrix':
|
243 |
+
if EVEN_M & EVEN_N:
|
244 |
+
bias = tl.load(b_ptrs).to(tl.float32)
|
245 |
+
else:
|
246 |
+
bias = tl.load(b_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), other=0.0).to(tl.float32)
|
247 |
+
qk = qk * softmax_scale + bias
|
248 |
+
if not EVEN_M & EVEN_HEADDIM:
|
249 |
+
tl.debug_barrier()
|
250 |
+
lse_i = tl.load(LSE + offs_m_curr)
|
251 |
+
if BIAS_TYPE == 'none':
|
252 |
+
p = tl.exp(qk * softmax_scale - lse_i[:, None])
|
253 |
+
else:
|
254 |
+
p = tl.exp(qk - lse_i[:, None])
|
255 |
+
if EVEN_M & EVEN_HEADDIM:
|
256 |
+
do = tl.load(do_ptrs)
|
257 |
+
else:
|
258 |
+
do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
|
259 |
+
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
|
260 |
+
if not EVEN_M & EVEN_HEADDIM:
|
261 |
+
tl.debug_barrier()
|
262 |
+
dp = tl.dot(do, v, trans_b=True)
|
263 |
+
if not EVEN_HEADDIM:
|
264 |
+
tl.debug_barrier()
|
265 |
+
Di = tl.load(D + offs_m_curr)
|
266 |
+
ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
|
267 |
+
dk += tl.dot(ds, q, trans_a=True)
|
268 |
+
if not EVEN_M & EVEN_HEADDIM:
|
269 |
+
tl.debug_barrier()
|
270 |
+
if not ATOMIC_ADD:
|
271 |
+
if EVEN_M & EVEN_HEADDIM:
|
272 |
+
dq = tl.load(dq_ptrs, eviction_policy='evict_last')
|
273 |
+
dq += tl.dot(ds, k)
|
274 |
+
tl.store(dq_ptrs, dq, eviction_policy='evict_last')
|
275 |
+
elif EVEN_HEADDIM:
|
276 |
+
dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0, eviction_policy='evict_last')
|
277 |
+
dq += tl.dot(ds, k)
|
278 |
+
tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q, eviction_policy='evict_last')
|
279 |
+
else:
|
280 |
+
dq = tl.load(dq_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0, eviction_policy='evict_last')
|
281 |
+
dq += tl.dot(ds, k)
|
282 |
+
tl.store(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), eviction_policy='evict_last')
|
283 |
+
else:
|
284 |
+
dq = tl.dot(ds, k)
|
285 |
+
if EVEN_M & EVEN_HEADDIM:
|
286 |
+
tl.atomic_add(dq_ptrs, dq)
|
287 |
+
elif EVEN_HEADDIM:
|
288 |
+
tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
|
289 |
+
else:
|
290 |
+
tl.atomic_add(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
|
291 |
+
dq_ptrs += BLOCK_M * stride_dqm
|
292 |
+
q_ptrs += BLOCK_M * stride_qm
|
293 |
+
do_ptrs += BLOCK_M * stride_dom
|
294 |
+
if BIAS_TYPE == 'matrix':
|
295 |
+
b_ptrs += BLOCK_M * stride_bm
|
296 |
+
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
|
297 |
+
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
|
298 |
+
_bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
|
299 |
+
|
300 |
+
def init_to_zero(name):
|
301 |
+
return lambda nargs: nargs[name].zero_()
|
302 |
+
|
303 |
+
@triton.autotune(configs=[triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ'))], key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'])
|
304 |
+
@triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
|
305 |
+
@triton.jit
|
306 |
+
def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_dob, stride_doh, stride_dom, stride_dqb, stride_dqh, stride_dqm, stride_dkb, stride_dkh, stride_dkn, stride_dvb, stride_dvh, stride_dvn, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, SEQUENCE_PARALLEL: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
|
307 |
+
off_hb = tl.program_id(1)
|
308 |
+
off_b = off_hb // nheads
|
309 |
+
off_h = off_hb % nheads
|
310 |
+
Q += off_b * stride_qb + off_h * stride_qh
|
311 |
+
K += off_b * stride_kb + off_h * stride_kh
|
312 |
+
V += off_b * stride_vb + off_h * stride_vh
|
313 |
+
DO += off_b * stride_dob + off_h * stride_doh
|
314 |
+
DQ += off_b * stride_dqb + off_h * stride_dqh
|
315 |
+
DK += off_b * stride_dkb + off_h * stride_dkh
|
316 |
+
DV += off_b * stride_dvb + off_h * stride_dvh
|
317 |
+
if BIAS_TYPE != 'none':
|
318 |
+
Bias += off_b * stride_bb + off_h * stride_bh
|
319 |
+
D += off_hb * seqlen_q_rounded
|
320 |
+
LSE += off_hb * seqlen_q_rounded
|
321 |
+
if not SEQUENCE_PARALLEL:
|
322 |
+
num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
|
323 |
+
for start_n in range(0, num_block_n):
|
324 |
+
_bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=False, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
|
325 |
+
else:
|
326 |
+
start_n = tl.program_id(0)
|
327 |
+
_bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=True, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
|
328 |
+
|
329 |
+
def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
|
330 |
+
(batch, seqlen_q, nheads, d) = q.shape
|
331 |
+
(_, seqlen_k, _, _) = k.shape
|
332 |
+
assert k.shape == (batch, seqlen_k, nheads, d)
|
333 |
+
assert v.shape == (batch, seqlen_k, nheads, d)
|
334 |
+
assert d <= 128, 'FlashAttention only support head dimensions up to 128'
|
335 |
+
assert q.dtype == k.dtype == v.dtype, 'All tensors must have the same type'
|
336 |
+
assert q.dtype in [torch.float16, torch.bfloat16], 'Only support fp16 and bf16'
|
337 |
+
assert q.is_cuda and k.is_cuda and v.is_cuda
|
338 |
+
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
|
339 |
+
has_bias = bias is not None
|
340 |
+
bias_type = 'none'
|
341 |
+
if has_bias:
|
342 |
+
assert bias.dtype in [q.dtype, torch.float]
|
343 |
+
assert bias.is_cuda
|
344 |
+
assert bias.dim() == 4
|
345 |
+
if bias.stride(-1) != 1:
|
346 |
+
bias = bias.contiguous()
|
347 |
+
if bias.shape[2:] == (1, seqlen_k):
|
348 |
+
bias_type = 'vector'
|
349 |
+
elif bias.shape[2:] == (seqlen_q, seqlen_k):
|
350 |
+
bias_type = 'matrix'
|
351 |
+
else:
|
352 |
+
raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
|
353 |
+
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
|
354 |
+
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
|
355 |
+
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
|
356 |
+
lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
|
357 |
+
tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
|
358 |
+
o = torch.empty_like(q)
|
359 |
+
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
|
360 |
+
BLOCK = 128
|
361 |
+
num_warps = 4 if d <= 64 else 8
|
362 |
+
grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
|
363 |
+
_fwd_kernel[grid](q, k, v, bias, o, lse, tmp, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, o.stride(0), o.stride(2), o.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM, BLOCK_M=BLOCK, BLOCK_N=BLOCK, num_warps=num_warps, num_stages=1)
|
364 |
+
return (o, lse, softmax_scale)
|
365 |
+
|
366 |
+
def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None):
|
367 |
+
if do.stride(-1) != 1:
|
368 |
+
do = do.contiguous()
|
369 |
+
(batch, seqlen_q, nheads, d) = q.shape
|
370 |
+
(_, seqlen_k, _, _) = k.shape
|
371 |
+
assert d <= 128
|
372 |
+
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
|
373 |
+
assert lse.shape == (batch, nheads, seqlen_q_rounded)
|
374 |
+
assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
|
375 |
+
assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
|
376 |
+
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
|
377 |
+
dq_accum = torch.empty_like(q, dtype=torch.float32)
|
378 |
+
delta = torch.empty_like(lse)
|
379 |
+
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
|
380 |
+
grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
|
381 |
+
_bwd_preprocess_do_o_dot[grid](o, do, delta, o.stride(0), o.stride(2), o.stride(1), do.stride(0), do.stride(2), do.stride(1), nheads, seqlen_q, seqlen_q_rounded, d, BLOCK_M=128, BLOCK_HEADDIM=BLOCK_HEADDIM)
|
382 |
+
has_bias = bias is not None
|
383 |
+
bias_type = 'none'
|
384 |
+
if has_bias:
|
385 |
+
assert bias.dtype in [q.dtype, torch.float]
|
386 |
+
assert bias.is_cuda
|
387 |
+
assert bias.dim() == 4
|
388 |
+
assert bias.stride(-1) == 1
|
389 |
+
if bias.shape[2:] == (1, seqlen_k):
|
390 |
+
bias_type = 'vector'
|
391 |
+
elif bias.shape[2:] == (seqlen_q, seqlen_k):
|
392 |
+
bias_type = 'matrix'
|
393 |
+
else:
|
394 |
+
raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
|
395 |
+
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
|
396 |
+
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
|
397 |
+
grid = lambda META: (triton.cdiv(seqlen_k, META['BLOCK_N']) if META['SEQUENCE_PARALLEL'] else 1, batch * nheads)
|
398 |
+
_bwd_kernel[grid](q, k, v, bias, do, dq_accum, dk, dv, lse, delta, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, do.stride(0), do.stride(2), do.stride(1), dq_accum.stride(0), dq_accum.stride(2), dq_accum.stride(1), dk.stride(0), dk.stride(2), dk.stride(1), dv.stride(0), dv.stride(2), dv.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM)
|
399 |
+
dq.copy_(dq_accum)
|
400 |
+
|
401 |
+
class FlashAttnQKVPackedFunc(torch.autograd.Function):
|
402 |
+
|
403 |
+
@staticmethod
|
404 |
+
def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
|
405 |
+
"""
|
406 |
+
qkv: (batch, seqlen, 3, nheads, headdim)
|
407 |
+
bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen).
|
408 |
+
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
|
409 |
+
ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
|
410 |
+
"""
|
411 |
+
if qkv.stride(-1) != 1:
|
412 |
+
qkv = qkv.contiguous()
|
413 |
+
(o, lse, ctx.softmax_scale) = _flash_attn_forward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, softmax_scale=softmax_scale)
|
414 |
+
ctx.save_for_backward(qkv, o, lse, bias)
|
415 |
+
ctx.causal = causal
|
416 |
+
return o
|
417 |
+
|
418 |
+
@staticmethod
|
419 |
+
def backward(ctx, do):
|
420 |
+
(qkv, o, lse, bias) = ctx.saved_tensors
|
421 |
+
assert not ctx.needs_input_grad[1], 'FlashAttention does not support bias gradient yet'
|
422 |
+
with torch.inference_mode():
|
423 |
+
dqkv = torch.empty_like(qkv)
|
424 |
+
_flash_attn_backward(do, qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], o, lse, dqkv[:, :, 0], dqkv[:, :, 1], dqkv[:, :, 2], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
|
425 |
+
return (dqkv, None, None, None)
|
426 |
+
flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
|
427 |
+
|
428 |
+
class FlashAttnKVPackedFunc(torch.autograd.Function):
|
429 |
+
|
430 |
+
@staticmethod
|
431 |
+
def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
|
432 |
+
"""
|
433 |
+
q: (batch, seqlen_q, nheads, headdim)
|
434 |
+
kv: (batch, seqlen_k, 2, nheads, headdim)
|
435 |
+
bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
|
436 |
+
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
|
437 |
+
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
|
438 |
+
"""
|
439 |
+
(q, kv) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
|
440 |
+
(o, lse, ctx.softmax_scale) = _flash_attn_forward(q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale)
|
441 |
+
ctx.save_for_backward(q, kv, o, lse, bias)
|
442 |
+
ctx.causal = causal
|
443 |
+
return o
|
444 |
+
|
445 |
+
@staticmethod
|
446 |
+
def backward(ctx, do):
|
447 |
+
(q, kv, o, lse, bias) = ctx.saved_tensors
|
448 |
+
if len(ctx.needs_input_grad) >= 3:
|
449 |
+
assert not ctx.needs_input_grad[2], 'FlashAttention does not support bias gradient yet'
|
450 |
+
with torch.inference_mode():
|
451 |
+
dq = torch.empty_like(q)
|
452 |
+
dkv = torch.empty_like(kv)
|
453 |
+
_flash_attn_backward(do, q, kv[:, :, 0], kv[:, :, 1], o, lse, dq, dkv[:, :, 0], dkv[:, :, 1], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
|
454 |
+
return (dq, dkv, None, None, None)
|
455 |
+
flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
|
456 |
+
|
457 |
+
class FlashAttnFunc(torch.autograd.Function):
|
458 |
+
|
459 |
+
@staticmethod
|
460 |
+
def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
|
461 |
+
"""
|
462 |
+
q: (batch_size, seqlen_q, nheads, headdim)
|
463 |
+
k, v: (batch_size, seqlen_k, nheads, headdim)
|
464 |
+
bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
|
465 |
+
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
|
466 |
+
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
|
467 |
+
"""
|
468 |
+
(q, k, v) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
|
469 |
+
(o, lse, ctx.softmax_scale) = _flash_attn_forward(q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
|
470 |
+
ctx.save_for_backward(q, k, v, o, lse, bias)
|
471 |
+
ctx.causal = causal
|
472 |
+
return o
|
473 |
+
|
474 |
+
@staticmethod
|
475 |
+
def backward(ctx, do):
|
476 |
+
(q, k, v, o, lse, bias) = ctx.saved_tensors
|
477 |
+
assert not ctx.needs_input_grad[3], 'FlashAttention does not support bias gradient yet'
|
478 |
+
with torch.inference_mode():
|
479 |
+
dq = torch.empty_like(q)
|
480 |
+
dk = torch.empty_like(k)
|
481 |
+
dv = torch.empty_like(v)
|
482 |
+
_flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
|
483 |
+
return (dq, dk, dv, None, None, None)
|
484 |
+
flash_attn_func = FlashAttnFunc.apply
|
generation_config.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"transformers_version": "4.28.1",
|
4 |
+
"use_cache": false
|
5 |
+
}
|
hf_prefixlm_converter.py
ADDED
@@ -0,0 +1,415 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
& |