update readme

README.md

It is further fine-tuned on OASST1 and Dolly2 to enhance chatting ability.

# Quick Start

Please note that the model requires `transformers` version >= 4.25.1.

To prompt the chat model, use the following format:
```
<human>: [Instruction]
<bot>:
```

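The examples below use this format with a single instruction. As a minimal sketch, a multi-turn prompt can be assembled by repeating the same tags for each turn and leaving the final `<bot>:` tag open for the model to complete; the `build_prompt` helper below is illustrative and not part of this repository:

```python
# Illustrative helper (not part of this repository): assemble a prompt in the
# <human>/<bot> format described above, assuming turns are simply concatenated.
def build_prompt(history, instruction):
    prompt = ""
    for human_turn, bot_turn in history:
        prompt += f"<human>: {human_turn}\n<bot>: {bot_turn}\n"
    # leave the final <bot>: tag open so the model generates the response
    prompt += f"<human>: {instruction}\n<bot>:"
    return prompt

print(build_prompt([("Hello!", "Hi! How can I help you?")], "Who is Alan Turing?"))
```
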
## GPU Inference

This requires a GPU with 8GB memory.

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

MIN_TRANSFORMERS_VERSION = '4.25.1'

# check transformers version
assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'

# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", torch_dtype=torch.float16)
model = model.to('cuda:0')

# infer
prompt = "<human>: Who is Alan Turing?\n<bot>:"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)
"""
Alan Turing was a British mathematician, logician, cryptologist, and computer scientist. He is widely regarded as the father of computer science and artificial intelligence.
"""
```

## GPU Inference in Int8

This requires a GPU with 6GB memory.

To run inference with int8, please ensure you have installed accelerate and bitsandbytes. You can install them with the following commands:

```bash
pip install accelerate
pip install bitsandbytes
```

Then you can run inference with int8 as follows:

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

MIN_TRANSFORMERS_VERSION = '4.25.1'

# check transformers version
assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'

# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", device_map='auto', torch_dtype=torch.float16, load_in_8bit=True)

# infer
prompt = "<human>: Who is Alan Turing?\n<bot>:"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)
"""
Alan Turing was a British mathematician and computer scientist who made important contributions to computer science and mathematical logic. He is widely regarded as the father of computer science and artificial intelligence for his work on the Turing machine and Turing test.
"""
```

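If you want to confirm the memory savings from int8 loading, `transformers` models expose a `get_memory_footprint()` method. The snippet below is an optional, illustrative check rather than part of the original example, and assumes `model` was loaded with `load_in_8bit=True` as above:

```python
# Optional check (assumes `model` from the int8 example above):
# report the approximate memory footprint of the quantized model in GiB.
footprint_gib = model.get_memory_footprint() / 1024 ** 3
print(f"Model memory footprint: {footprint_gib:.2f} GiB")
```
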
## CPU Inference

```python
# init: load the tokenizer and model as in the GPU example above, but with
# torch_dtype=torch.bfloat16 for CPU (see the note below)
inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
print(output_str)
"""
Alan Turing was a British mathematician and computer scientist. He is widely regarded as the father of computer science and artificial intelligence. He was a pioneer in the field of computer science and artificial intelligence, and his work has had a significant impact on the development of computing technology.
"""
```

Please note that since `LayerNormKernelImpl` is not implemented in fp16 for CPU, we use `bfloat16` for CPU inference.

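For completeness, here is a minimal end-to-end CPU sketch that follows the same pattern as the GPU example above, with the model loaded in `bfloat16` per the note; this is a sketch, not the repository's original full CPU example:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# init: load in bfloat16 for CPU inference, per the note above
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", torch_dtype=torch.bfloat16
)

# infer
prompt = "<human>: Who is Alan Turing?\n<bot>:"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
print(tokenizer.decode(token))
```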

# Uses
