jasonfang3900 committed

Commit 3037cf1 · Parent(s): 8a01860

modeling_flm.py: copyright modification and minor code modification

Files changed:
- configuration_flm.py (+14, -6)
- modeling_flm.py (+22, -6)
configuration_flm.py

@@ -1,6 +1,18 @@
 # coding=utf-8
-# Copyright
-
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """ Cofe-AI FLM configuration"""
 
 from transformers.configuration_utils import PretrainedConfig
@@ -11,10 +23,6 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 
 FLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    # "freelm": "xxxx/config.json",
-    # "freelm-medium": "xxxx/config.json",
-    # "freelm-large": "xxxx/config.json",
-    # "freelm-xl": "xxxx/config.json",
 }
 
 
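The configuration change is housekeeping: the stub "# Copyright" line becomes a full Apache-2.0 header, and the commented-out placeholder entries are dropped from FLM_PRETRAINED_CONFIG_ARCHIVE_MAP. With the archive map empty, the config resolves like any custom config shipped alongside model code. A minimal usage sketch, assuming a placeholder repo id ("cofe-ai/flm-example" is illustrative, not a real checkpoint):

    from transformers import AutoConfig

    # Placeholder repo id; any repo that ships configuration_flm.py as custom
    # code next to its config.json resolves the same way. trust_remote_code
    # tells transformers to import the repo's own FLMConfig class.
    config = AutoConfig.from_pretrained("cofe-ai/flm-example", trust_remote_code=True)
    print(type(config).__name__)  # FLMConfig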
modeling_flm.py

@@ -1,8 +1,9 @@
 # coding=utf-8
-# Copyright
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
 # This code is based on OpenAI's GPT-2 library. It has been modified from its
-# original forms to accommodate
+# original forms to accommodate architectural differences compared to GPT-2.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,12 +16,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""PyTorch FLM model."""
+
 from typing import Optional, Tuple, Union
 
 import math
 import torch
-
-from einops import rearrange, repeat
+from einops import rearrange
 from torch import einsum, nn
 from torch.cuda.amp import autocast
 from transformers.activations import ACT2FN
@@ -31,8 +33,9 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import find_pruneable_heads_and_indices, prune_conv1d_layer
-from transformers.utils import
+from transformers.utils import logging
 from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
+
 from .configuration_flm import FLMConfig
@@ -102,6 +105,18 @@ class RotaryEmbedding(nn.Module):
         rotated_k = apply_rotary_emb(freqs_k, k, scale=scale_k ** -1)
         return rotated_q, rotated_k
 
+    def rotate_queries_or_keys(self, t, seq_dim=-2, offset=0):
+        """
+        use this only when xpos is NOT activated.
+        """
+        # t's shape e.g. -> (batchsize, headnum, seqlen, dimofhead)
+        assert not self.use_xpos, 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
+        device, seq_len = t.device, t.shape[seq_dim]
+        pos_seq_t = torch.arange(offset, offset + seq_len, device=device, dtype=torch.float32)
+        freqs = self.forward(pos_seq_t, cache_key=f"{offset}:{offset+seq_len}")
+        # freqs seqlen x dim
+        return apply_rotary_emb(freqs, t)
+
     def get_scale(self, t, cache_key=None, offset=0, ):
         assert self.use_xpos, 'This function is only useful for xpos.'
         if exists(cache_key) and cache_key in self.cache_scale:
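The substantive addition above is RotaryEmbedding.rotate_queries_or_keys, which rotates a single tensor on the non-xpos path using positions that start at `offset`. For intuition, here is a self-contained sketch of standard rotary position embeddings with an offset; rope_freqs and apply_rope are illustrative stand-ins, and the repo's forward/apply_rotary_emb may use a different feature-pairing layout:

    import torch

    def rope_freqs(seq_len, dim, offset=0, base=10000.0, device=None):
        # One rotation angle per feature pair: inv_freq[i] = base**(-2i/dim).
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device).float() / dim))
        pos = torch.arange(offset, offset + seq_len, device=device, dtype=torch.float32)
        freqs = torch.einsum("s,d->sd", pos, inv_freq)  # (seq_len, dim / 2)
        return torch.cat((freqs, freqs), dim=-1)        # (seq_len, dim)

    def apply_rope(t, freqs):
        # t: (..., seq_len, dim). Rotate each (x1, x2) feature pair by its angle.
        d = t.shape[-1]
        t1, t2 = t[..., : d // 2], t[..., d // 2:]
        rotated = torch.cat((-t2, t1), dim=-1)
        return t * freqs.cos() + rotated * freqs.sin()

    q = torch.randn(2 * 8, 5, 64)                       # (batch * heads, seq, head_dim)
    q_rot = apply_rope(q, rope_freqs(5, 64, offset=3))  # tokens occupy positions 3..7

Because the rotation is a pure function of absolute position, queries and keys rotated with consistent positions preserve relative-position attention scores, which is why the offset below matters.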
@@ -372,6 +387,7 @@ class FLMAttention(nn.Module):
 
         batch_size, head_num, k_seq_len, head_features = key.shape
         _, _, q_seq_len, _ = query.shape
+        query_offset = k_seq_len - q_seq_len
         if rotary_embedding is not None:
             query = query.contiguous().view(batch_size * head_num, q_seq_len, head_features)
             key = key.contiguous().view(batch_size * head_num, k_seq_len, head_features)
@@ -381,7 +397,7 @@
                 # query: [batch_size * head_num, seqlen, hn]
                 query, key = rotary_embedding.rotate_queries_and_keys(query, key)
             else:
-                query = rotary_embedding.rotate_queries_or_keys(query)
+                query = rotary_embedding.rotate_queries_or_keys(query, offset=query_offset)
                 key = rotary_embedding.rotate_queries_or_keys(key)
             # batch_size * head_num, k_seq_len(q_seq_len), head_features
             query = query.view(batch_size, head_num, q_seq_len, head_features)
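The `query_offset = k_seq_len - q_seq_len` line exists for incremental decoding with a KV cache: the key tensor spans every cached position while the query covers only the newest tokens, so rotating the query without an offset would stamp it with position 0 and misalign it against the keys. A small sketch of the bookkeeping, reusing the illustrative helpers from the sketch above:

    # 9 cached tokens plus 1 new token: the new query sits at absolute position 9.
    k_seq_len, q_seq_len, dim = 10, 1, 64
    query = torch.randn(16, q_seq_len, dim)  # (batch * heads, q_len, head_dim)
    key = torch.randn(16, k_seq_len, dim)

    query_offset = k_seq_len - q_seq_len     # = 9, as in the committed change
    q_rot = apply_rope(query, rope_freqs(q_seq_len, dim, offset=query_offset))
    k_rot = apply_rope(key, rope_freqs(k_seq_len, dim))  # keys span positions 0..9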