jasonfang3900 committed
Commit 3037cf1
1 Parent(s): 8a01860

modeling_flm.py: copyright modification and minor code modification

Files changed (2):
  1. configuration_flm.py  +14 -6
  2. modeling_flm.py  +22 -6
configuration_flm.py CHANGED
@@ -1,6 +1,18 @@
  # coding=utf-8
- # Copyright
-
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
  """ Cofe-AI FLM configuration"""

  from transformers.configuration_utils import PretrainedConfig
@@ -11,10 +23,6 @@ from transformers.utils import logging
  logger = logging.get_logger(__name__)

  FLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-     # "freelm": "xxxx/config.json",
-     # "freelm-medium": "xxxx/config.json",
-     # "freelm-large": "xxxx/config.json",
-     # "freelm-xl": "xxxx/config.json",
  }

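Since configuration_flm.py ships as remote code alongside the checkpoint, the FLMConfig it defines is normally loaded through AutoConfig with trust_remote_code enabled. A minimal usage sketch, assuming a hypothetical repository id:

from transformers import AutoConfig

# "CofeAI/FLM" is an illustrative repo id, not necessarily the published checkpoint name.
config = AutoConfig.from_pretrained("CofeAI/FLM", trust_remote_code=True)
print(config.model_type)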
modeling_flm.py CHANGED
@@ -1,8 +1,9 @@
  # coding=utf-8
- # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  #
  # This code is based on OpenAI's GPT-2 library. It has been modified from its
- # original forms to accommodate minor architectural differences compared to GPT-2.
+ # original forms to accommodate architectural differences compared to GPT-2.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -15,12 +16,13 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ """PyTorch FLM model."""
+
  from typing import Optional, Tuple, Union

  import math
  import torch
- import torch.nn.functional as f
- from einops import rearrange, repeat
+ from einops import rearrange
  from torch import einsum, nn
  from torch.cuda.amp import autocast
  from transformers.activations import ACT2FN
@@ -31,8 +33,9 @@ from transformers.modeling_outputs import (
  )
  from transformers.modeling_utils import PreTrainedModel
  from transformers.pytorch_utils import find_pruneable_heads_and_indices, prune_conv1d_layer
- from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+ from transformers.utils import logging
  from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
+
  from .configuration_flm import FLMConfig

@@ -102,6 +105,18 @@ class RotaryEmbedding(nn.Module):
      rotated_k = apply_rotary_emb(freqs_k, k, scale=scale_k ** -1)
      return rotated_q, rotated_k

+ def rotate_queries_or_keys(self, t, seq_dim=-2, offset=0):
+     """
+     use this only when xpos is NOT activated.
+     """
+     # t's shape e.g. -> (batchsize, headnum, seqlen, dimofhead)
+     assert not self.use_xpos, 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
+     device, seq_len = t.device, t.shape[seq_dim]
+     pos_seq_t = torch.arange(offset, offset + seq_len, device=device, dtype=torch.float32)
+     freqs = self.forward(pos_seq_t, cache_key=f"{offset}:{offset+seq_len}")
+     # freqs seqlen x dim
+     return apply_rotary_emb(freqs, t)
+
  def get_scale(self, t, cache_key=None, offset=0, ):
      assert self.use_xpos, 'This function is only useful for xpos.'
      if exists(cache_key) and cache_key in self.cache_scale:
@@ -372,6 +387,7 @@ class FLMAttention(nn.Module):

  batch_size, head_num, k_seq_len, head_features = key.shape
  _, _, q_seq_len, _ = query.shape
+ query_offset = k_seq_len - q_seq_len
  if rotary_embedding is not None:
      query = query.contiguous().view(batch_size * head_num, q_seq_len, head_features)
      key = key.contiguous().view(batch_size * head_num, k_seq_len, head_features)
@@ -381,7 +397,7 @@
      # query: [batch_size * head_num, seqlen, hn]
      query, key = rotary_embedding.rotate_queries_and_keys(query, key)
  else:
-     query = rotary_embedding.rotate_queries_or_keys(query)
+     query = rotary_embedding.rotate_queries_or_keys(query, offset=query_offset)
      key = rotary_embedding.rotate_queries_or_keys(key)
      # batch_size * head_num, k_seq_len(q_seq_len), head_features
  query = query.view(batch_size, head_num, q_seq_len, head_features)
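The query_offset addition matters for incremental decoding with a key/value cache: the query then covers only the newest q_seq_len positions while the key covers all k_seq_len cached positions, so rotating the query from position 0 would give it the wrong rotary phase relative to the keys. The sketch below is a minimal, self-contained illustration of that offset; it is not the repository's einops-based RotaryEmbedding (it uses the common "rotate-half" form of RoPE), and all names in it are illustrative.

import torch

def rotate_half(x):
    # Split the last dimension in half and rotate the halves (rotate-half RoPE form).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(t, positions, dim):
    # Build cos/sin terms from absolute positions and apply them to t.
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    freqs = torch.einsum("i,j->ij", positions.float(), inv_freq)  # (seq_len, dim/2)
    emb = torch.cat((freqs, freqs), dim=-1)                       # (seq_len, dim)
    return t * emb.cos() + rotate_half(t) * emb.sin()

head_dim, k_seq_len, q_seq_len = 64, 10, 1        # cached decoding: 10 cached keys, 1 new query
query = torch.randn(q_seq_len, head_dim)
key = torch.randn(k_seq_len, head_dim)

query_offset = k_seq_len - q_seq_len              # 9: absolute position of the new token
q_pos = torch.arange(query_offset, query_offset + q_seq_len)
k_pos = torch.arange(0, k_seq_len)                # keys cover every cached position
rotated_q = apply_rope(query, q_pos, head_dim)
rotated_k = apply_rope(key, k_pos, head_dim)

The keys are still rotated from position 0 because they span the full cached sequence; only the query needs the offset.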