fix kwargs in generate method and update readme
- README.md (+30, -9)
- modeling_qwen.py (+10, -6)
README.md
@@ -42,9 +42,21 @@ The features of Qwen-7B include:
 
 For more details about the open-source model of Qwen-7B, please refer to the [Github](https://github.com/QwenLM/Qwen-7B) code repository.
 
+## 要求(Requirements)
+
+* python 3.8及以上版本
+* pytorch 1.12及以上版本,推荐2.0及以上版本
+* 建议使用CUDA 11.4及以上(GPU用户、flash-attention用户等需考虑此选项)
+
+
+
+* python 3.8 and above
+* pytorch 1.12 and above, 2.0 and above are recommended
+* CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.)
+
 ## 依赖项 (Dependency)
 
-运行Qwen-7B,请确保pytorch版本不低于1.12,再执行以下pip命令安装依赖库
+运行Qwen-7B,请确保满足上述要求,再执行以下pip命令安装依赖库
 
 To run Qwen-7B, please make sure that pytorch version is not lower than 1.12, and then execute the following pip commands to install the dependent libraries.
 
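The new Requirements section lists minimum Python, PyTorch, and CUDA versions. As a quick way to confirm a local environment meets them, a minimal check (illustrative only, not part of the README):

```python
# Minimal environment check against the requirements above (illustrative, not from the README).
import sys
import torch

assert sys.version_info >= (3, 8), "Python 3.8+ is required"
major, minor = (int(x) for x in torch.__version__.split(".")[:2])
assert (major, minor) >= (1, 12), "PyTorch 1.12+ is required (2.0+ recommended)"
# CUDA 11.4+ is recommended for GPU / flash-attention users.
print("torch:", torch.__version__, "| cuda:", torch.version.cuda, "| gpu available:", torch.cuda.is_available())
```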
@@ -75,18 +87,18 @@ from transformers.generation import GenerationConfig
 
 # Note: The default behavior now has injection attack prevention off.
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
-
-# import torch
-# torch.cuda.is_bf16_supported()
+
 # use bf16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, bf16=True).eval()
 # use fp16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, fp16=True).eval()
 # use cpu only
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="cpu", trust_remote_code=True).eval()
-# use
+# use auto mode, automatically select precision based on the device.
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True).eval()
-
+
+# Specify hyperparameters for generation
+model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
 
 inputs = tokenizer('蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是', return_tensors='pt')
 inputs = inputs.to('cuda:0')
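The removed hint (`torch.cuda.is_bf16_supported()`) and the commented bf16/fp16/cpu variants above suggest a simple selection pattern. A sketch of how one might choose among them, assuming the `bf16`/`fp16` flags accepted by Qwen's remote code as shown in the README's own snippets:

```python
# Illustrative precision selection, mirroring the commented-out options above.
# bf16=True / fp16=True are the flags shown in the README snippets; this is not the README's code.
import torch
from transformers import AutoModelForCausalLM

if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    # Ampere or newer GPUs: prefer bf16
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, bf16=True
    ).eval()
elif torch.cuda.is_available():
    # Older GPUs: fall back to fp16
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, fp16=True
    ).eval()
else:
    # CPU only
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen-7B", device_map="cpu", trust_remote_code=True
    ).eval()
```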
@@ -309,9 +321,17 @@ We introduce NTK-aware interpolation, LogN attention scaling, Window attention,
 
 ## 量化(Quantization)
 
-如希望使用更低精度的量化模型,如4比特和8比特的模型,我们提供了简单的示例来说明如何快速使用量化模型。在开始前,确保你已经安装了`bitsandbytes`。
+如希望使用更低精度的量化模型,如4比特和8比特的模型,我们提供了简单的示例来说明如何快速使用量化模型。在开始前,确保你已经安装了`bitsandbytes`。请注意:`bitsandbytes`的安装要求是:
 
-We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have implemented `bitsandbytes`.
+We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have installed `bitsandbytes`. Note that the requirements for `bitsandbytes` are:
+
+```
+**Requirements** Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + CUDA > 10.0.
+```
+
+Windows用户需安装特定版本的`bitsandbytes`,可选项包括[bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。
+
+Windows users should instead install a dedicated Windows build, such as [bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels).
 
 ```bash
 pip install bitsandbytes
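The hunk above only adds the `bitsandbytes` installation requirements; the NF4/Int8 loading examples themselves live elsewhere in the README. For orientation, a hedged sketch of how such loading typically looks with transformers' `BitsAndBytesConfig` (argument support by Qwen's remote code may differ):

```python
# Illustrative only: NF4 / Int8 loading via transformers' BitsAndBytesConfig.
# This is not the README's own example; the exact arguments Qwen-7B supports may differ.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_nf4 = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, quantization_config=nf4_config
).eval()

# 8-bit Int8 quantization
int8_config = BitsAndBytesConfig(load_in_8bit=True)
model_int8 = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, quantization_config=int8_config
).eval()
```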
@@ -369,4 +389,5 @@ Our code and checkpoints are open to research purpose, and they are allowed for
 
 如果你想给我们的研发团队和产品团队留言,请通过邮件(qianwen_opensource@alibabacloud.com)联系我们。
 
-If you are interested to leave a message to either our research team or product team, feel free to send an email to qianwen_opensource@alibabacloud.com.
+If you are interested to leave a message to either our research team or product team, feel free to send an email to qianwen_opensource@alibabacloud.com.
+
modeling_qwen.py
@@ -958,12 +958,14 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         history: Optional[HistoryType],
         system: str = "You are a helpful assistant.",
         append_history: bool = True,
-        stream: Optional[bool] = False
+        stream: Optional[bool] = False,
+        stop_words_ids: Optional[List[List[int]]] = None,
+        **kwargs,
     ) -> Tuple[str, HistoryType]:
-
-
         if history is None:
             history = []
+        if stop_words_ids is None:
+            stop_words_ids = []
 
         raw_text, context_tokens = make_context(
             tokenizer,
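With `stop_words_ids` and `**kwargs` added to the signature, callers can pass extra stop words and generation parameters directly through `chat()`. A hypothetical call (the prompt and sampling values are placeholders; `top_p` and `max_new_tokens` are standard `generate()` kwargs forwarded via `**kwargs`):

```python
# Hypothetical usage of the widened chat() signature; values are placeholders.
response, history = model.chat(
    tokenizer,
    "你好",
    history=None,
    stop_words_ids=[[tokenizer.im_end_id]],  # extra stop words, merged with the chat-format defaults
    top_p=0.8,                               # forwarded to self.generate() via **kwargs
    max_new_tokens=256,
)
print(response)
```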
@@ -974,9 +976,9 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             chat_format=self.generation_config.chat_format,
         )
 
-        stop_words_ids = get_stop_words_ids(
+        stop_words_ids.extend(get_stop_words_ids(
             self.generation_config.chat_format, tokenizer
-        )
+        ))
         input_ids = torch.tensor([context_tokens]).to(self.device)
         if stream:
             assert self.generation_config.chat_format == 'chatml'
@@ -986,7 +988,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             stream_config = StreamGenerationConfig(**self.generation_config.to_dict(), do_stream=True)
             def stream_generator():
                 outputs = []
-                for token in self.generate(input_ids, return_dict_in_generate=False, generation_config=stream_config):
+                for token in self.generate(
+                        input_ids, return_dict_in_generate=False, generation_config=stream_config, **kwargs):
                     outputs.append(token.item())
                     if outputs[-1] in (tokenizer.im_end_id, tokenizer.im_start_id):
                         break
@@ -998,6 +1001,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
                 input_ids,
                 stop_words_ids = stop_words_ids,
                 return_dict_in_generate = False,
+                **kwargs,
            )
 
            response = decode_tokens(
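Because `**kwargs` now reaches `self.generate()` in both the streaming and non-streaming branches, per-call overrides should take precedence over `model.generation_config` for that call, as in this hypothetical comparison:

```python
# Hypothetical per-call overrides forwarded via **kwargs (prompt and values are placeholders).
response_greedy, _ = model.chat(tokenizer, "介绍一下大熊猫", history=None, do_sample=False)
response_sampled, _ = model.chat(tokenizer, "介绍一下大熊猫", history=None, do_sample=True, temperature=0.9)
```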