myownskyW7 committed on
Commit aa00681
1 Parent(s): b06eb0c

Speed up chat

Files changed (2):
  1. modeling_InternLM_XComposer.py +34 -19
  2. modeling_utils.py +36 -25
modeling_InternLM_XComposer.py CHANGED
@@ -26,6 +26,13 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
     config_class = InternLMXComposerConfig
     _auto_class = "AutoModelForCausalLM"
 
+    meta_instruction = """meta instruction
+You are an AI assistant whose name is 浦语.
+- 浦语 is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
+- 浦语 can understand and communicate fluently in the language chosen by the user such as English and 中文.
+conversation
+"""
+
     gen_config = dict(
         num_beams=5,
         do_sample=False,
@@ -33,7 +40,7 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         repetition_penalty=1.5,
         length_penalty=1.0,
         temperature=1.0,
-        max_new_tokens=200,
+        max_new_tokens=500,
     )
 
     def __init__(self, config):
@@ -74,13 +81,14 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         # speed up init llm
         with torch.device('meta'):
             self.internlm_model = InternLMForCausalLM._from_config(config)
-        self.internlm_model.to_empty(device=config.device).to(torch.float16)
+        self.internlm_model.to_empty(device=config.device).to(
+            torch.float16)
         for n, m in self.internlm_model.named_modules():
             if 'lora' in n:
                 m.float()
 
         self.internlm_proj = nn.Linear(self.Qformer.config.hidden_size,
-                                       self.internlm_model.config.hidden_size)
+                                        self.internlm_model.config.hidden_size)
         print('Done')
 
         self.vis_processor = transforms.Compose([
@@ -93,15 +101,15 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
 
         self.tokenizer = None
 
-    @property
-    def eoh(self):
-        return self.tokenizer.decode(torch.Tensor([103027]),
-                                     skip_special_tokens=True)
-
-    @property
-    def eoa(self):
-        return self.tokenizer.decode(torch.Tensor([103028]),
-                                     skip_special_tokens=True)
+        self.eoh = '<TOKENS_UNUSED_0>'  # end of human
+        self.eoa = '<TOKENS_UNUSED_1>'  # end of assistant
+        stop_words_ids = [
+            torch.tensor([103027]).to(config.device),
+            torch.tensor([103028]).to(config.device),
+        ]
+        stopping_criteria = StoppingCriteriaList(
+            [StoppingCriteriaSub(stops=stop_words_ids)])
+        self.gen_config['stopping_criteria'] = stopping_criteria
 
     def maybe_autocast(self, dtype=torch.float16):
         # if on cpu, don't use autocast
@@ -154,13 +162,14 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
             encoder_attention_mask=image_atts,
             return_dict=True,
         )
-        inputs_internlm = self.internlm_proj(query_output.last_hidden_state)
+        inputs_internlm = self.internlm_proj(
+            query_output.last_hidden_state)
         inputs_internlm = torch.cat([
             self.flag_image_start.expand(inputs_internlm.shape[0], -1, -1),
             inputs_internlm,
             self.flag_image_end.expand(inputs_internlm.shape[0], -1, -1)
         ],
-                                    dim=1)
+                                   dim=1)
         return inputs_internlm
 
     def encode_text(self, text, add_special_tokens=False):
@@ -195,8 +204,8 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         text_embeds = self.encode_text(text)
         img_embeds = self.encode_img(image)
         prompt_embeds = self.wrap_prompt(text_embeds, img_embeds)
-        out_embeds = self.internlm_model.generate(inputs_embeds=prompt_embeds,
-                                                  **self.get_gen_args(**kwargs))
+        out_embeds = self.internlm_model.generate(
+            inputs_embeds=prompt_embeds, **self.get_gen_args(**kwargs))
         out_text = self.decode_text(out_embeds)
         return out_text
 
@@ -206,8 +215,8 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         prompt_embeds = self.wrap_prompt(text_embeds,
                                          img_embeds,
                                          history=history)
-        out_embeds = self.internlm_model.generate(inputs_embeds=prompt_embeds,
-                                                  **self.get_gen_args(**kwargs))
+        out_embeds = self.internlm_model.generate(
+            inputs_embeds=prompt_embeds, **self.get_gen_args(**kwargs))
         out_text = self.decode_text(out_embeds)
 
         # trunc at eoh and eoa
@@ -231,7 +240,13 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
                     history=None,
                     add_special=True):
         if add_special:
-            prompt_segs = [' <|User|>:', f'\n{self.eoh} <|Bot|>:']
+            if history is None:
+                prompt_segs = [
+                    self.meta_instruction + ' <|User|>:',
+                    f'\n{self.eoh} <|Bot|>:'
+                ]
+            else:
+                prompt_segs = [' <|User|>:', f'\n{self.eoh} <|Bot|>:']
         else:
             prompt_segs = [' <|User|>:', ' <|Bot|>:']  # used in wrap history
         prompt_seg_embeds = []
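
For reference, a minimal sketch (not part of the commit) of the two prompt_segs layouts the new wrap_prompt branch selects between: the meta_instruction is prepended only on the first turn, i.e. when history is None. How the user text and image embeddings are spliced between the two segments happens elsewhere in wrap_prompt and is elided here.

# Illustrative only: eoh and meta_instruction are the values added in this
# commit (meta_instruction truncated); the embedding splicing is not shown.
eoh = '<TOKENS_UNUSED_0>'  # end-of-human marker
meta_instruction = 'meta instruction\n...\nconversation\n'  # truncated

first_turn_segs = [meta_instruction + ' <|User|>:', f'\n{eoh} <|Bot|>:']
later_turn_segs = [' <|User|>:', f'\n{eoh} <|Bot|>:']  # when history is given
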
modeling_utils.py CHANGED
@@ -2,6 +2,7 @@ import logging
 import math
 import os
 from contextlib import contextmanager
+from transformers import StoppingCriteria, StoppingCriteriaList
 
 import timm.models.hub as timm_hub
 import torch
@@ -32,6 +33,7 @@ def download_cached_file(url, check_hash=True, progress=False):
     Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
     If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
     """
+
     def get_cached_file_path():
         # a hack to sync the file path across processes
         parts = torch.hub.urlparse(url)
@@ -74,49 +76,58 @@ def all_logging_disabled(highest_level=logging.CRITICAL):
 
 
 class LoRALinear(nn.Linear):
-    def __init__(self,
-                 in_features: int,
-                 out_features: int,
-                 bias: bool = True,
-                 device=None,
-                 dtype=None,
-                 lora_r=8,
-                 lora_alpha=16,
-                 lora_dropout=0.05,
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        lora_r=8,
+        lora_alpha=16,
+        lora_dropout=0.05,
+        **kwargs
+    ) -> None:
         super().__init__(in_features, out_features, bias, device, dtype)
         self.lora_r = lora_r
         self.lora_alpha = lora_alpha
-        if lora_dropout > 0.:
+        if lora_dropout > 0.0:
             self.lora_dropout = nn.Dropout(p=lora_dropout)
         else:
             self.lora_dropout = lambda x: x
         self.lora_scaling = self.lora_alpha / self.lora_r
 
-        self.lora_A = nn.Linear(in_features,
-                                self.lora_r,
-                                bias=False,
-                                device=device,
-                                dtype=dtype)
-        self.lora_B = nn.Linear(self.lora_r,
-                                out_features,
-                                bias=False,
-                                device=device,
-                                dtype=dtype)
+        self.lora_A = nn.Linear(
+            in_features, self.lora_r, bias=False, device=device, dtype=dtype
+        )
+        self.lora_B = nn.Linear(
+            self.lora_r, out_features, bias=False, device=device, dtype=dtype
+        )
 
         self.reset_parameters()
 
     def reset_parameters(self):
-        if hasattr(self, 'lora_A'):
+        if hasattr(self, "lora_A"):
             # initialize A the same way as the default for nn.Linear and B to zero
             nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
             nn.init.zeros_(self.lora_B.weight)
-        #print ("lora weight init {} {}".format(torch.mean(self.lora_A.weight), torch.mean(self.lora_B.weight)))
 
     def forward(self, x):
         orig_type = x.dtype
         res = super().forward(x)
         x = x.float()
-        res += self.lora_B(self.lora_A(
-            self.lora_dropout(x))) * self.lora_scaling
+        res += self.lora_B(self.lora_A(self.lora_dropout(x))) * self.lora_scaling
         return res.to(orig_type)
+
+
+class StoppingCriteriaSub(StoppingCriteria):
+    def __init__(self, stops=[], encounters=1):
+        super().__init__()
+        self.stops = stops
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+        for stop in self.stops:
+            if torch.all((stop == input_ids[:, -len(stop) :])).item():
+                return True
+
+        return False
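
Taken together, StoppingCriteriaSub and the stop_words_ids registered in modeling_InternLM_XComposer.py are what deliver the speed-up: decoding can halt as soon as an end-of-turn token (103027 or 103028) is emitted, rather than running to the raised max_new_tokens cap. Below is a minimal, self-contained sketch of the same mechanism; gpt2 and its eos token are stand-ins for InternLM-XComposer and its stop ids, not the repo's actual code.

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          StoppingCriteria, StoppingCriteriaList)


class StopOnTokens(StoppingCriteria):
    # mirrors StoppingCriteriaSub above: stop once the generated sequence
    # ends with any of the given stop token sequences
    def __init__(self, stops):
        super().__init__()
        self.stops = stops  # list of 1-D LongTensors of token ids

    def __call__(self, input_ids, scores, **kwargs):
        return any(
            torch.all(stop == input_ids[0, -len(stop):]).item()
            for stop in self.stops)


tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

stops = [torch.tensor([tokenizer.eos_token_id])]  # stand-in for 103027/103028
inputs = tokenizer('Hello, how are', return_tensors='pt')
out = model.generate(
    **inputs,
    max_new_tokens=500,  # large cap is safe: the criterion ends decoding early
    stopping_criteria=StoppingCriteriaList([StopOnTokens(stops)]))
print(tokenizer.decode(out[0]))
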