Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

configuration_internlm.py +3 -5
modeling_internlm.py +32 -10
tokenization_internlm.py +4 -9

configuration_internlm.py CHANGED Viewed

@@ -1,10 +1,7 @@
 # coding=utf-8
-# Copyright (c) InternLM. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,6 +24,7 @@ logger = logging.get_logger(__name__)
 INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 class InternLMConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate

 # coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on transformers/src/transformers/models/llama/configuration_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+# Modified from transformers.models.llama.configuration_llama.LlamaConfig
 class InternLMConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate

modeling_internlm.py CHANGED Viewed

@@ -74,7 +74,7 @@ def _get_unpad_data(attention_mask):
     )
-# Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
@@ -92,7 +92,7 @@ def _make_causal_mask(
     return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-# Copied from transformers.models.bart.modeling_bart._expand_mask
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     """
     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
@@ -106,6 +106,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 class InternLMRMSNorm(nn.Module):
     """RMSNorm implemention."""
@@ -128,6 +130,7 @@ class InternLMRMSNorm(nn.Module):
         return self.weight * hidden_states
 class InternLMRotaryEmbedding(torch.nn.Module):
     """Implement InternLM's rotary embedding.
@@ -169,6 +172,7 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         )
 class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
     """Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
@@ -229,12 +233,15 @@ class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
         )
 def rotate_half(x):
     """Split the last dimension at its midpoint and return `(-second_half, first_half)` joined.

     Args:
         x: Tensor whose last dimension is split at index `x.shape[-1] // 2`.

     Returns:
         The negated trailing part followed by the leading part, concatenated
         along the last dimension.
     """
     midpoint = x.shape[-1] // 2
     leading, trailing = x[..., :midpoint], x[..., midpoint:]
     return torch.cat((-trailing, leading), dim=-1)
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     if position_ids.size(1) == 1:
         q_cos = cos[position_ids].unsqueeze(1).expand(q.shape)
@@ -255,6 +262,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     return q_embed, k_embed
 class InternLMMLP(nn.Module):
     def __init__(
         self,
@@ -272,6 +280,7 @@ class InternLMMLP(nn.Module):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 class InternLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -377,10 +386,11 @@ class InternLMAttention(nn.Module):
             attn_weights = None
         return attn_output, attn_weights, past_key_value
 class InternLMFlashAttention2(InternLMAttention):
     """
-    InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -395,7 +405,7 @@ class InternLMFlashAttention2(InternLMAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # InternLM2FlashAttention2 attention does not support output_attentions
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -526,6 +536,7 @@ INTERNLM_ATTENTION_CLASSES = {
     "flash_attention_2": InternLMFlashAttention2,
 }
 class InternLMDecoderLayer(nn.Module):
     def __init__(self, config: InternLMConfig):
         super().__init__()
@@ -611,6 +622,7 @@ INTERNLM_START_DOCSTRING = r"""
 """
 @add_start_docstrings(
     "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
     INTERNLM_START_DOCSTRING,
@@ -692,6 +704,7 @@ INTERNLM_INPUTS_DOCSTRING = r"""
 """
 @add_start_docstrings(
     "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
     INTERNLM_START_DOCSTRING,
@@ -884,6 +897,7 @@ class InternLMModel(InternLMPreTrainedModel):
         )
 class InternLMForCausalLM(InternLMPreTrainedModel):
     _auto_class = "AutoModelForCausalLM"
@@ -1037,11 +1051,12 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         return reordered_past
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
-        prompt = ""
-        if meta_instruction:
-            prompt += f"""<s><|System|>:{meta_instruction}\n"""
         else:
-            prompt += "<s>"
         for record in history:
             prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}<eoa>\n"""
         prompt += f"""<|User|>:{query}\n<|Bot|>:"""
@@ -1114,6 +1129,7 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                 self.query = query
                 self.history = history
                 self.response = ""
                 self.received_inputs = False
                 self.queue.put((self.response, history + [(self.query, self.response)]))
@@ -1128,11 +1144,17 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                     self.received_inputs = True
                     return
-                token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
                 if token.strip() != "<eoa>":
                     self.response = self.response + token
                     history = self.history + [(self.query, self.response)]
                     self.queue.put((self.response, history))
             def end(self):
                 self.queue.put(None)

     )
+# Copied from transformers.models.llama.modeling_llama._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
     return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+# Copied from transformers.models.llama.modeling_llama._expand_mask
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     """
     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM
 class InternLMRMSNorm(nn.Module):
     """RMSNorm implemention."""
         return self.weight * hidden_states
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM
 class InternLMRotaryEmbedding(torch.nn.Module):
     """Implement InternLM's rotary embedding.
         )
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM
 class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
     """Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
         )
+# Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Split the last dimension at its midpoint and return `(-second_half, first_half)` joined.

     Args:
         x: Tensor whose last dimension is split at index `x.shape[-1] // 2`.

     Returns:
         The negated trailing part followed by the leading part, concatenated
         along the last dimension.
     """
     midpoint = x.shape[-1] // 2
     leading, trailing = x[..., :midpoint], x[..., midpoint:]
     return torch.cat((-trailing, leading), dim=-1)
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     if position_ids.size(1) == 1:
         q_cos = cos[position_ids].unsqueeze(1).expand(q.shape)
     return q_embed, k_embed
+# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->InternLM
 class InternLMMLP(nn.Module):
     def __init__(
         self,
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->InternLM
 class InternLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
             attn_weights = None
         return attn_output, attn_weights, past_key_value
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->InternLM
 class InternLMFlashAttention2(InternLMAttention):
     """
+    InternLM flash attention module. This module inherits from `InternLMAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # InternLMFlashAttention2 attention does not support output_attentions
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
     "flash_attention_2": InternLMFlashAttention2,
 }
+# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->InternLM
 class InternLMDecoderLayer(nn.Module):
     def __init__(self, config: InternLMConfig):
         super().__init__()
 """
+# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM
 @add_start_docstrings(
     "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
     INTERNLM_START_DOCSTRING,
 """
+# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->InternLM
 @add_start_docstrings(
     "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
     INTERNLM_START_DOCSTRING,
         )
+# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->InternLM
 class InternLMForCausalLM(InternLMPreTrainedModel):
     _auto_class = "AutoModelForCausalLM"
         return reordered_past
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
+        if tokenizer.add_bos_token:
+            prompt = ""
         else:
+            prompt = tokenizer.bos_token
+        if meta_instruction:
+            prompt += f"""<|System|>:{meta_instruction}\n"""
         for record in history:
             prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}<eoa>\n"""
         prompt += f"""<|User|>:{query}\n<|Bot|>:"""
                 self.query = query
                 self.history = history
                 self.response = ""
+                self.cache = []
                 self.received_inputs = False
                 self.queue.put((self.response, history + [(self.query, self.response)]))
                     self.received_inputs = True
                     return
+                self.cache.extend(value.tolist())
+                token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
+                if "�" in token and len(token) <= 5:
+                    return
                 if token.strip() != "<eoa>":
                     self.response = self.response + token
                     history = self.history + [(self.query, self.response)]
                     self.queue.put((self.response, history))
+                    self.cache = []
+                else:
+                    self.end()
             def end(self):
                 self.queue.put(None)

tokenization_internlm.py CHANGED Viewed

@@ -1,10 +1,7 @@
 # coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for IntermLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
@@ -35,7 +32,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
 PRETRAINED_VOCAB_FILES_MAP = {}
 class InternLMTokenizer(PreTrainedTokenizer):
     """
     Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
@@ -81,8 +78,6 @@ class InternLMTokenizer(PreTrainedTokenizer):
             **kwargs,
         )
-        """ Initialization"""
     @property
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:

 # coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Tokenization classes for InternLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 PRETRAINED_VOCAB_FILES_MAP = {}
+# Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer -> InternLMTokenizer
 class InternLMTokenizer(PreTrainedTokenizer):
     """
     Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
             **kwargs,
         )
     @property
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None: