Update README.md
Browse files
README.md
CHANGED
@@ -168,13 +168,39 @@ You are a question answering assistant. Answer the question as truthful and help
|
|
168 |
|
169 |
## How to use
|
170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
### vLLM Engine for float16 model
|
172 |
|
173 |
1. Install vLLM (https://github.com/vllm-project/vllm)
|
174 |
2. Start the API server: python -m vllm.entrypoints.api_server --model /path/to/model --tensor-parallel-size num_gpus
|
175 |
3. Run inference (curl example)
|
176 |
|
177 |
-
```
|
178 |
curl --request POST \
|
179 |
--url http://localhost:8000/generate \
|
180 |
--header "Content-Type: application/json" \
|
|
|
168 |
|
169 |
## How to use
|
170 |
|
171 |
+
### Huggingface
|
172 |
+
```python
|
173 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
174 |
+
import torch
|
175 |
+
|
176 |
+
# Use CUDA when available; otherwise fall back to CPU
|
177 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
178 |
+
print(f"Using device: {device}")
|
179 |
+
|
180 |
+
# Init Model
|
181 |
+
model_path="openthaigpt/openthaigpt-1.0.0-7b-chat"
|
182 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
183 |
+
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16)
|
184 |
+
model.to(device)
|
185 |
+
|
186 |
+
# Prompt
|
187 |
+
prompt = "สวัสดีครับ OpenThaiGPT"
|
188 |
+
llama_prompt = f"<s>[INST] <<SYS>>\nYou are a question answering assistant. Answer the question as truthful and helpful as possible คุณคือผู้ช่วยตอบคำถาม จงตอบคำถามอย่างถูกต้องและมีประโยชน์ที่สุด<</SYS>>\n\n{prompt} [/INST]"
|
189 |
+
inputs = tokenizer.encode(llama_prompt, return_tensors="pt")
|
190 |
+
inputs = inputs.to(device)
|
191 |
+
|
192 |
+
# Generate
|
193 |
+
outputs = model.generate(inputs, max_length=512, num_return_sequences=1)
|
194 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
195 |
+
```
|
196 |
+
|
197 |
### vLLM Engine for float16 model
|
198 |
|
199 |
1. Install vLLM (https://github.com/vllm-project/vllm)
|
200 |
2. Start the API server: python -m vllm.entrypoints.api_server --model /path/to/model --tensor-parallel-size num_gpus
|
201 |
3. Run inference (curl example)
|
202 |
|
203 |
+
```bash
|
204 |
curl --request POST \
|
205 |
--url http://localhost:8000/generate \
|
206 |
--header "Content-Type: application/json" \
|