shwu committed
Commit: d6e13c5
Parent(s): 51e5ad2
feat: new generation code
Files changed:
- modeling_blip2chatglm.py +90 -82
- modeling_chatglm.py +2 -4
modeling_blip2chatglm.py
CHANGED
@@ -189,9 +189,10 @@ class Blip2ChatGLMForConditionalGeneration(Blip2ForConditionalGeneration):
     def prepare_inputs_for_chat(
         self,
         tokenizer: PreTrainedTokenizer,
-        histories: List[List[Tuple[Union[str, Tuple[str, torch.Tensor]], str]]],
+        batch_messages: List[List[Tuple[str, str, List[Tuple[torch.Tensor, int]]]]],
         max_length: int,
+        user_role: str = "问",
+        bot_role: str = "答",
     ):
         device = self.device
         nvtokens = self.config.num_query_tokens
@@ -199,80 +200,76 @@ class Blip2ChatGLMForConditionalGeneration(Blip2ForConditionalGeneration):
         all_images = []
         all_image_slots = []
         all_input_ids = []
-            input_ids = []
-            if isinstance(query, tuple):
-                qtext, qimg = query
-                # image slot, embedding will be replaced by image embeddings
-                input_ids.extend([tokenizer.unk_token_id] * nvtokens)
-            else:
-                qtext = query
-                qimg = None
-            input_ids += tokenizer(qtext + f"\n答:").input_ids
-            if qimg is not None:
-                all_images.append(qimg)
-                image_slots.append(
-                    len(input_ids) - slot_offset
-                ) # count from backward
-            for ri, (q, r) in enumerate(reversed(history)):
-                if len(input_ids) >= max_length:
-                    break
-                i = len(history) - ri - 1
-                cur_input_ids: List[int] = tokenizer(
-                    f"[Round {i}]\n问:", add_special_tokens=False
-                ).input_ids
-                qtext = q
-                qimg = None
-                cur_input_ids += tokenizer(
-                    qtext + f"\n答:{r}\n", add_special_tokens=False
-                ).input_ids
+        for messages in batch_messages:
+            images = []
             image_slots = []
+            input_ids = []
+
+            round_roles = [set()]
+            for role, qtext, qimgs in messages:
+                if role in round_roles[-1]:
+                    # a new round (not the first round)
+                    input_ids += tokenizer(
+                        f"\n[Round {len(round_roles)}]\n{role}:",
+                        add_special_tokens=False,
                     ).input_ids
+                    round_roles.append({role})
+                else:
+                    round_roles[-1].add(role)
+                    input_ids += tokenizer(
+                        # For first role, no new line
+                        f"\n{role}:" if len(input_ids) != 0 else f"{role}:", add_special_tokens=False
                    ).input_ids
+                cur_index = 0
+                for qimg, img_idx in qimgs:
+                    if img_idx > cur_index:
+                        input_ids += tokenizer(
+                            qtext[cur_index:img_idx], add_special_tokens=False
+                        ).input_ids
+                        cur_index = img_idx
                     # image slot, embedding will be replaced by image embeddings
+                    image_slots.append(len(input_ids))
+                    input_ids += [tokenizer.unk_token_id] * nvtokens
+                    images.append(qimg)
+                input_ids += tokenizer(
+                    qtext[cur_index:], add_special_tokens=False
+                ).input_ids
+            if len(round_roles) == 1:
+                # only 1 round
+                if len(round_roles[0]) == 1 and user_role in round_roles[0]:
+                    # only user role
+                    input_ids += tokenizer("").input_ids
                else:
+                    input_ids += tokenizer(f"\n{bot_role}:").input_ids
+            else:
+                # add tag for round 0
+                input_ids = (
+                    tokenizer(f"[Round 0]\n", add_special_tokens=False).input_ids
+                    + input_ids
+                )
+                input_ids += tokenizer(f"\n{bot_role}:").input_ids

            if len(input_ids) >= max_length:
+                image_slots_after_truncate = []
+                images_after_truncate = []
+                truncate_index = len(input_ids) - max_length
+                for image_slot, image in zip(image_slots, images):
+                    # truncate from left
+                    if len(input_ids) - image_slot < max_length:
+                        image_slots_after_truncate.append(image_slot)
+                        images_after_truncate.append(image)
+                    elif len(input_ids) - (image_slot + nvtokens) < max_length:
+                        # in-contact image slot is not allowed
+                        truncate_index = max(truncate_index, image_slot + nvtokens)
+                for i, image_slot in enumerate(image_slots_after_truncate):
+                    image_slots_after_truncate[i] = image_slot - truncate_index
+                input_ids = input_ids[truncate_index:]
+                image_slots = image_slots_after_truncate
+                images = images_after_truncate
+
+            # print(tokenizer.convert_ids_to_tokens(input_ids))
+
+            all_images.extend(images)
            all_image_slots.append(image_slots)
            all_input_ids.append(input_ids)

@@ -316,9 +313,12 @@ class Blip2ChatGLMForConditionalGeneration(Blip2ForConditionalGeneration):
             input_ids[i][-len(ids) :] = torch.as_tensor(ids, dtype=torch.long)
         input_ids = input_ids.to(device)
         inputs_embeds = self.language_model.transformer.word_embeddings(input_ids)
+        if all_vtokens is not None:
+            for i, (image_slots, vtokens) in enumerate(
+                zip(all_image_slots, all_vtokens)
+            ):
+                for slot, vimg in zip(image_slots, vtokens):
+                    inputs_embeds[i][slot : slot + nvtokens, :] = vimg

         return input_ids, inputs_embeds

@@ -326,22 +326,25 @@ class Blip2ChatGLMForConditionalGeneration(Blip2ForConditionalGeneration):
     def batch_chat(
         self,
         tokenizer: PreTrainedTokenizer,
-        histories: List[List[Tuple[Union[str, Tuple[str, torch.Tensor]], str]]],
+        batch_messages: List[List[Tuple[str, str, List[Tuple[torch.Tensor, int]]]]],
         max_length: int = 2048,
         num_beams=1,
         do_sample=True,
         top_p=0.7,
         temperature=0.95,
+        user_role: str = "问",
+        bot_role: str = "答",
         **kwargs,
     ):
         input_ids, inputs_embeds = self.prepare_inputs_for_chat(
-            tokenizer,
+            tokenizer=tokenizer,
+            batch_messages=batch_messages,
+            max_length=max_length,
+            user_role=user_role,
+            bot_role=bot_role,
         )

-        logits_processor = LogitsProcessorList()
+        logits_processor = LogitsProcessorList()
         logits_processor.append(InvalidScoreLogitsProcessor())
         gen_kwargs = {
             "max_length": max_length,

@@ -367,17 +370,22 @@ class Blip2ChatGLMForConditionalGeneration(Blip2ForConditionalGeneration):
     def stream_chat(
         self,
         tokenizer: PreTrainedTokenizer,
-        history: List[Tuple[Union[str, Tuple[str, torch.Tensor]], str]],
+        messages: List[Tuple[str, str, List[Tuple[torch.Tensor, int]]]],
         num_beams=5,
-        max_length=
+        max_length=512,
         top_p=0.9,
         do_sample=True,
         temperature=1,
+        user_role: str = "问",
+        bot_role: str = "答",
         **kwargs,
     ):
         input_ids, inputs_embeds = self.prepare_inputs_for_chat(
-            tokenizer,
+            tokenizer=tokenizer,
+            batch_messages=[messages],
+            max_length=max_length,
+            user_role=user_role,
+            bot_role=bot_role,
         )

         logits_processor = LogitsProcessorList()
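For orientation, a minimal usage sketch of the new chat interface follows. Only the batch_messages structure and the stream_chat/batch_chat signatures come from the hunks above; the model and tokenizer loading, the image preprocessing, the example texts, and the output variable names are assumptions, and these hunks do not show what the two methods return.

# Hedged sketch, not part of the commit. Assumes `model` is a loaded
# Blip2ChatGLMForConditionalGeneration, `tokenizer` its ChatGLM tokenizer,
# and `pixel_values` a preprocessed image tensor.
# Each message is (role, text, [(image_tensor, char_index), ...]); char_index
# is the position in `text` where the image's query-token slot is spliced in.
messages = [
    ("问", "What is in this picture?", [(pixel_values, 0)]),  # user turn, image before the text
    ("答", "A cat sitting on a sofa.", []),                   # earlier bot reply, no image
    ("问", "What is it doing?", []),                          # follow-up user question
]

# One conversation through the streaming entry point, using its new defaults.
stream = model.stream_chat(tokenizer, messages, max_length=512, num_beams=5, top_p=0.9)

# Several conversations at once through batch_chat.
outputs = model.batch_chat(tokenizer, [messages], max_length=2048, top_p=0.7, temperature=0.95)
# How `stream` and `outputs` are consumed is not shown in these hunks.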
modeling_chatglm.py
CHANGED
@@ -970,6 +970,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):

         if attention_mask is None:
             attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
+        else:
+            attention_mask = attention_mask.to(hidden_states.device)

         for i, layer in enumerate(self.layers):

@@ -1095,10 +1097,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                 [position_ids, new_position_id], dim=-1
             )

-        # set to None as prepare_inputs_for_generation use past for input embeds
-        if "inputs_embeds" in model_kwargs:
-            model_kwargs["inputs_embeds"] = None
-
         return model_kwargs

     def prepare_inputs_for_generation(
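A brief hedged illustration of the first change above: when the caller passes an attention_mask that lives on a different device than the transformer's hidden_states (for example a CPU-built mask while the layers sit on a GPU), the new else branch moves it over before the layer loop. The snippet below is a standalone sketch with assumed shapes, not code from the repo. The second change, dropping the inputs_embeds reset, leaves precomputed embeddings in model_kwargs between generation steps, presumably so the embedding-based chat path added in modeling_blip2chatglm.py keeps working together with past.

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
hidden_states = torch.randn(4, 1, 32, device=device)  # lives on the model's device
attention_mask = torch.zeros(1, 1, 4, 4).bool()        # built on the CPU by the caller

# Mirrors the new branch: align the mask with the hidden states before use.
if attention_mask is not None:
    attention_mask = attention_mask.to(hidden_states.device)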