update readme

README.md

It is further fine-tuned on OASST1 and Dolly2 to enhance chatting ability.

# Quick Start

Please note that the model requires `transformers` version >= 4.25.1.

To prompt the chat model, use the following format:
```
<human>: [Instruction]
<bot>:
```

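The examples below use this format with a single instruction. As a minimal sketch, a multi-turn prompt can be assembled by repeating the same tags for each turn and leaving the final `<bot>:` tag open for the model to complete; the `build_prompt` helper below is illustrative and not part of this repository:

```python
# Illustrative helper (not part of this repository): assemble a prompt in the
# <human>/<bot> format described above, assuming turns are simply concatenated.
def build_prompt(history, instruction):
    prompt = ""
    for human_turn, bot_turn in history:
        prompt += f"<human>: {human_turn}\n<bot>: {bot_turn}\n"
    # leave the final <bot>: tag open so the model generates the response
    prompt += f"<human>: {instruction}\n<bot>:"
    return prompt

print(build_prompt([("Hello!", "Hi! How can I help you?")], "Who is Alan Turing?"))
```
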
## GPU Inference

This requires a GPU with 8GB memory.

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

MIN_TRANSFORMERS_VERSION = '4.25.1'

# check transformers version
assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'

# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", torch_dtype=torch.float16)
model = model.to('cuda:0')

# infer
prompt = "<human>: Who is Alan Turing?\n<bot>:"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)
"""
Alan Turing was a British mathematician, logician, cryptologist, and computer scientist. He is widely regarded as the father of computer science and artificial intelligence.
"""
```

## GPU Inference in Int8

This requires a GPU with 6GB memory.

To run inference with int8, please ensure you have installed accelerate and bitsandbytes. You can install them with the following commands:

```bash
pip install accelerate
pip install bitsandbytes
```

Then you can run inference with int8 as follows:

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

MIN_TRANSFORMERS_VERSION = '4.25.1'

# check transformers version
assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'

# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", device_map='auto', torch_dtype=torch.float16, load_in_8bit=True)

# infer
prompt = "<human>: Who is Alan Turing?\n<bot>:"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)
"""
Alan Turing was a British mathematician and computer scientist who made important contributions to computer science and mathematical logic. He is widely regarded as the father of computer science and artificial intelligence for his work on the Turing machine and Turing test.
"""
```

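If you want to confirm the memory savings from int8 loading, `transformers` models expose a `get_memory_footprint()` method. The snippet below is an optional, illustrative check rather than part of the original example, and assumes `model` was loaded with `load_in_8bit=True` as above:

```python
# Optional check (assumes `model` from the int8 example above):
# report the approximate memory footprint of the quantized model in GiB.
footprint_gib = model.get_memory_footprint() / 1024 ** 3
print(f"Model memory footprint: {footprint_gib:.2f} GiB")
```
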
## CPU Inference

```python
# init: load the tokenizer and model as in the GPU example above, but with
# torch_dtype=torch.bfloat16 for CPU (see the note below)
inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
print(output_str)
"""
Alan Turing was a British mathematician and computer scientist. He is widely regarded as the father of computer science and artificial intelligence. He was a pioneer in the field of computer science and artificial intelligence, and his work has had a significant impact on the development of computing technology.
"""
```

Please note that since `LayerNormKernelImpl` is not implemented in fp16 for CPU, we use `bfloat16` for CPU inference.

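For completeness, here is a minimal end-to-end CPU sketch that follows the same pattern as the GPU example above, with the model loaded in `bfloat16` per the note; this is a sketch, not the repository's original full CPU example:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# init: load in bfloat16 for CPU inference, per the note above
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-2.8B-v1")
model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/RedPajama-Chat-INCITE-2.8B-v1", torch_dtype=torch.bfloat16
)

# infer
prompt = "<human>: Who is Alan Turing?\n<bot>:"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
print(tokenizer.decode(token))
```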

# Uses
