Text Generation
Transformers
PyTorch
English
gpt_neox
text-generation-inference
Inference Endpoints
xzyao committed on
Commit c5c8ebc
1 Parent(s): b34468b

update readme

Files changed (1)
  1. README.md +59 -9
README.md CHANGED
@@ -18,6 +18,8 @@ It is further fine-tuned on OASST1 and Dolly2 to enhance chatting ability.

# Quick Start

+ Please note that the model requires `transformers` version >= 4.25.1.
+
To prompt the chat model, use the following format:
```
<human>: [Instruction]
@@ -26,34 +28,77 @@ To prompt the chat model, use the following format:

## GPU Inference

- This requires a GPU with 16GB memory.
+ This requires a GPU with 8GB memory.
+
```python
+ import torch
+ import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ MIN_TRANSFORMERS_VERSION = '4.25.1'
+
+ # check transformers version
+ assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'
+
# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1", torch_dtype=torch.float16)
model = model.to('cuda:0')
# infer
- inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
- outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
- output_str = tokenizer.decode(outputs[0])
+ prompt = "<human>: Who is Alan Turing?\n<bot>:"
+ inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
+ input_length = inputs.input_ids.shape[1]
+ outputs = model.generate(
+     **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
+ )
+ token = outputs.sequences[0, input_length:]
+ output_str = tokenizer.decode(token)
print(output_str)
+ """
+ Alan Mathison Turing (23 June 1912 – 7 June 1954) was an English computer scientist, mathematician, logician, cryptanalyst, philosopher, mathematician, and theoretical biologist.
+ """
```

## GPU Inference in Int8

- This requires a GPU with 12GB memory.
+ This requires a GPU with 6GB memory.
+
+ To run inference with int8, please ensure you have installed accelerate and bitsandbytes. You can install them with the following command:
+
+ ```bash
+ pip install accelerate
+ pip install bitsandbytes
+ ```
+
+ Then you can run inference with int8 as follows:

```python
+ import torch
+ import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ MIN_TRANSFORMERS_VERSION = '4.25.1'
+
+ # check transformers version
+ assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'
+
# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1")
- model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1", device_map="auto", load_in_8bit=True)
+ model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1", device_map='auto', torch_dtype=torch.float16, load_in_8bit=True)
+
# infer
- inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
- outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
- output_str = tokenizer.decode(outputs[0])
+ prompt = "<human>: Who is Alan Turing?\n<bot>:"
+ inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
+ input_length = inputs.input_ids.shape[1]
+ outputs = model.generate(
+     **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
+ )
+ token = outputs.sequences[0, input_length:]
+ output_str = tokenizer.decode(token)
print(output_str)
+ """
+ Alan Mathison Turing (23 June 1912 – 7 June 1954) was an English computer scientist, mathematician, logician, cryptanalyst, philosopher, and theoretical biologist.
+ """
```

## CPU Inference
@@ -68,8 +113,13 @@ inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
print(output_str)
+ """
+ Alan Turing was a British mathematician and computer scientist. He was one of the key figures in the development of computer science and artificial intelligence. He is widely regarded as the father of computer science and artificial intelligence.
+ """
```

+ Please note that since `LayerNormKernelImpl` is not implemented in fp16 for CPU, we use `bfloat16` for CPU inference.
+

# Uses

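The new `LayerNormKernelImpl` note says the CPU path should run in `bfloat16`, but this commit only touches the tail of the CPU example, so the model-loading step is not visible in the diff. As a minimal sketch of what that step presumably looks like under that assumption (only the `torch_dtype=torch.bfloat16` argument differs from the GPU example; the prompt and generation settings are copied from the unchanged CPU lines in the last hunk):

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# init (assumption: the CPU example loads weights in bfloat16, since fp16
# LayerNorm (LayerNormKernelImpl) is not implemented for CPU in PyTorch)
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1")
model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/RedPajama-Chat-INCITE-6.9B-v1", torch_dtype=torch.bfloat16
)

# infer (same prompt format and sampling settings as the CPU lines shown in the diff)
inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
print(output_str)
```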
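Because the prompt format alternates `<human>:` and `<bot>:` turns, a sampled continuation can run past the bot's reply and begin a new `<human>:` turn. A small, hypothetical post-processing helper (not part of this commit) that keeps only the bot's reply by cutting at the next turn marker:

```python
def extract_bot_reply(generated: str) -> str:
    """Keep only the bot's reply by truncating at the next '<human>:' turn, if any."""
    return generated.split("<human>:")[0].strip()

# example usage with a decoded continuation
print(extract_bot_reply("Hello! How can I help you today?\n<human>: What is 2+2?"))
# -> Hello! How can I help you today?
```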