Update README.md
README.md CHANGED
@@ -19,4 +19,44 @@ Welcome to the official repository of JetMoE-8B-chat, a language model that comb
| Llama-2-13b-chat | 6.650 |
| Vicuna-13b-v1.3 | 6.413 |
| Wizardlm-13b | 6.353 |
-| Llama-2-7b-chat | 6.269 |
+| Llama-2-7b-chat | 6.269 |
+
+### Usage
+
+Here's a quick example to get you started with JetMoE-8B-chat:
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# Initialize the tokenizer and model
+model_name = "jetmoe/jetmoe-8b-chat"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", trust_remote_code=True)
+
+# Check whether a GPU is available and move the model to it if so
+if torch.cuda.is_available():
+    model = model.cuda()
+    print("Using GPU:", torch.cuda.get_device_name(torch.cuda.current_device()))
+else:
+    print("GPU is not available, using CPU instead.")
+
+# Encode the input conversation with the model's chat template
+messages = [
+    {"role": "system", "content": "You are a friendly chatbot"},
+    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+]
+tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+print(tokenized_chat)
+
+# Move the input IDs to the GPU if one is available; otherwise keep them on the CPU
+input_ids = tokenized_chat.cuda() if torch.cuda.is_available() else tokenized_chat
+
+# Generate text
+output = model.generate(input_ids, max_length=500, num_return_sequences=1, no_repeat_ngram_size=2)
+
+# If the output is on the GPU, move it back to the CPU for decoding
+if torch.cuda.is_available():
+    output = output.cpu()
+
+# Decode and print the generated text
+generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+print(generated_text)
+```
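For quicker experimentation, the same checkpoint can also be driven through the Transformers `text-generation` pipeline, which handles device placement for you. The sketch below is one possible variant of the example above, not code from the JetMoE README; it assumes a recent `transformers` release (plus `accelerate` for `device_map="auto"`), and the generation settings (`max_new_tokens`, `do_sample`, `temperature`) are illustrative choices rather than documented defaults.

```python
import torch
from transformers import pipeline

# Load the chat model through the high-level text-generation pipeline.
# device_map="auto" places the weights on a GPU when one is available.
chat_pipe = pipeline(
    "text-generation",
    model="jetmoe/jetmoe-8b-chat",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    model_kwargs={"attn_implementation": "eager"},
)

messages = [
    {"role": "system", "content": "You are a friendly chatbot"},
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]

# Render the conversation with the model's chat template, then generate.
prompt = chat_pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
result = chat_pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
print(result[0]["generated_text"])
```

Both paths load the same model; the pipeline version trades explicit control over tensors and devices for less boilerplate.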