deprecate argument stream in model.chat()
modeling_qwen.py  +23 -36
@@ -60,6 +60,12 @@ If you are directly using the model downloaded from Huggingface, please make sur
 如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。
 """
 
+_SENTINEL = object()
+_ERROR_STREAM_IN_CHAT = """\
+Passing the argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True).
+向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。
+"""
+
 apply_rotary_emb_func = None
 rms_norm = None
 flash_attn_unpadded_func = None
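The point of `_SENTINEL = object()` is that an `object()` instance is identical only to itself, so an identity check against the default can distinguish "argument omitted" from every real value a caller might pass, including `stream=False` and `stream=None`, which an ordinary `False`/`None` default could not. A minimal standalone sketch of the same pattern (the names `_MISSING` and `greet` are illustrative, not part of modeling_qwen.py):

_MISSING = object()  # unique marker; no caller-supplied value is ever identical to it

def greet(name, shout=_MISSING):
    # `is` compares identity, so this triggers for shout=True, shout=False,
    # and even shout=None -- any explicit use of the retired argument.
    if shout is not _MISSING:
        raise TypeError("`shout` is deprecated; use greet_loud() instead")
    return f"hello, {name}"

greet("world")              # fine: argument omitted
greet("world", shout=True)  # TypeError with the deprecation message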
@@ -977,10 +983,11 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         history: Optional[HistoryType],
         system: str = "You are a helpful assistant.",
         append_history: bool = True,
-        stream: Optional[bool] = False,
+        stream: Optional[bool] = _SENTINEL,
         stop_words_ids: Optional[List[List[int]]] = None,
         **kwargs,
     ) -> Tuple[str, HistoryType]:
+        assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT
         assert self.generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
         if history is None:
             history = []
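After this change, any call that still passes `stream` fails at the very top of chat(), before tokenization or generation, with the bilingual message defined above. A hedged usage sketch, assuming the usual trust_remote_code loading pattern for this repository and the "Qwen/Qwen-7B-Chat" checkpoint:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True).eval()

# Still supported: blocking call that returns the full response and updated history.
response, history = model.chat(tokenizer, "Hello!", history=None)

# Rejected from now on: trips `assert stream is _SENTINEL` immediately.
# model.chat(tokenizer, "Hello!", history=None, stream=True)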
@@ -1000,41 +1007,21 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             self.generation_config.chat_format, tokenizer
         ))
         input_ids = torch.tensor([context_tokens]).to(self.device)
-        if stream:
-            ...
-                    break
-                yield tokenizer.decode(outputs, skip_special_tokens=True)
-
-            return stream_generator()
-        else:
-            outputs = self.generate(
-                input_ids,
-                stop_words_ids = stop_words_ids,
-                return_dict_in_generate = False,
-                **kwargs,
-            )
-
-            response = decode_tokens(
-                outputs[0],
-                tokenizer,
-                raw_text_len=len(raw_text),
-                context_length=len(context_tokens),
-                chat_format=self.generation_config.chat_format,
-                verbose=False,
-            )
+        outputs = self.generate(
+            input_ids,
+            stop_words_ids = stop_words_ids,
+            return_dict_in_generate = False,
+            **kwargs,
+        )
+
+        response = decode_tokens(
+            outputs[0],
+            tokenizer,
+            raw_text_len=len(raw_text),
+            context_length=len(context_tokens),
+            chat_format=self.generation_config.chat_format,
+            verbose=False,
+        )
 
         if append_history:
             history.append((query, response))
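With the streaming branch removed, chat() is a single non-streaming path; streaming callers are expected to migrate to model.chat_stream(), which, judging from the removed `yield tokenizer.decode(...)` line and the error text, yields the decoded response accumulated so far, one token at a time. A migration sketch under that assumption (the exact chat_stream signature is not shown in this diff):

# Before (deprecated, removed above):
#     for part in model.chat(tokenizer, query, history=None, stream=True): ...
# After:
history = []
for partial_response in model.chat_stream(tokenizer, "Tell me a joke.", history=history):
    print(partial_response)  # decoded response up to the latest generated token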