Text Generation
Transformers
PyTorch
English
gpt_neox
text-generation-inference
Inference Endpoints
xzyao committed on
Commit 0a7c459
1 Parent(s): b8c19da

update readme

Files changed (1)
  1. README.md +57 -7
README.md CHANGED
@@ -18,6 +18,8 @@ It is further fine-tuned on OASST1 and Dolly2 to enhance chatting ability.

  # Quick Start

+ Please note that the model requires `transformers` version >= 4.25.1.
+
  To prompt the chat model, use the following format:
  ```
  <human>: [Instruction]
@@ -27,33 +29,76 @@ To prompt the chat model, use the following format:
  ## GPU Inference

  This requires a GPU with 8GB memory.
+
  ```python
+ import torch
+ import transformers
  from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ MIN_TRANSFORMERS_VERSION = '4.25.1'
+
+ # check transformers version
+ assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'
+
  # init
  tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
  model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", torch_dtype=torch.float16)
  model = model.to('cuda:0')
  # infer
- inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
- outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
- output_str = tokenizer.decode(outputs[0])
+ prompt = "<human>: Who is Alan Turing?\n<bot>:"
+ inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
+ input_length = inputs.input_ids.shape[1]
+ outputs = model.generate(
+     **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
+ )
+ token = outputs.sequences[0, input_length:]
+ output_str = tokenizer.decode(token)
  print(output_str)
+ """
+ Alan Turing was a British mathematician, logician, cryptologist, and computer scientist. He is widely regarded as the father of computer science and artificial intelligence.
+ """
  ```

  ## GPU Inference in Int8

  This requires a GPU with 6GB memory.

+ To run inference with int8, please ensure you have installed accelerate and bitsandbytes. You can install them with the following commands:
+
+ ```bash
+ pip install accelerate
+ pip install bitsandbytes
+ ```
+
+ Then you can run inference with int8 as follows:
+
  ```python
+ import torch
+ import transformers
  from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ MIN_TRANSFORMERS_VERSION = '4.25.1'
+
+ # check transformers version
+ assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'
+
  # init
  tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
- model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", device_map="auto", load_in_8bit=True)
+ model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", device_map='auto', torch_dtype=torch.float16, load_in_8bit=True)
+
  # infer
- inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
- outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
- output_str = tokenizer.decode(outputs[0])
+ prompt = "<human>: Who is Alan Turing?\n<bot>:"
+ inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
+ input_length = inputs.input_ids.shape[1]
+ outputs = model.generate(
+     **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
+ )
+ token = outputs.sequences[0, input_length:]
+ output_str = tokenizer.decode(token)
  print(output_str)
+ """
+ Alan Turing was a British mathematician and computer scientist who made important contributions to computer science and mathematical logic. He is widely regarded as the father of computer science and artificial intelligence for his work on the Turing machine and Turing test.
+ """
  ```

  ## CPU Inference
@@ -68,8 +113,13 @@ inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
  outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
  output_str = tokenizer.decode(outputs[0])
  print(output_str)
+ """
+ Alan Turing was a British mathematician and computer scientist. He is widely regarded as the father of computer science and artificial intelligence. He was a pioneer in the field of computer science and artificial intelligence, and his work has had a significant impact on the development of computing technology.
+ """
  ```

+ Please note that since `LayerNormKernelImpl` is not implemented in fp16 for CPU, we use `bfloat16` for CPU inference.
+

  # Uses
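The last hunk notes that CPU inference uses `bfloat16` because `LayerNormKernelImpl` is not implemented in fp16 on CPU, but the CPU loading code itself falls outside these hunks. A minimal sketch of what that setup would look like, assuming the same model id and the generation settings visible in the unchanged CPU lines above:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# init: load on CPU in bfloat16, since fp16 LayerNorm has no CPU kernel (per the README note)
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", torch_dtype=torch.bfloat16
)

# infer: same <human>/<bot> prompt format as the GPU examples
inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
print(output_str)
```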