Update README.md
README.md CHANGED
@@ -140,6 +140,61 @@ print(tokenizer.decode(outputs[0]))

**Important Note:** Models based on Gemma 2, such as BgGPT-Gemma-2-27B-IT-v1.0, do not support flash attention; using it results in degraded performance.
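
When loading the model with plain `transformers`, the attention implementation can be pinned explicitly so flash attention is never selected. A minimal sketch, assuming bfloat16 weights and automatic device placement; only `attn_implementation="eager"` is the essential part:

```python
import torch
from transformers import AutoModelForCausalLM

# Pin the eager attention path instead of flash attention,
# per the note above on Gemma 2 based models.
model = AutoModelForCausalLM.from_pretrained(
    "INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0",
    torch_dtype=torch.bfloat16,   # assumed; matches the vLLM example below
    device_map="auto",            # assumed device placement (requires accelerate)
    attn_implementation="eager",
)
```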

# Use with vLLM

Example usage with vLLM:

```python
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0",
    use_default_system_prompt=False,
)

sampling_params = SamplingParams(
    max_tokens=2048,
    temperature=0.1,
    top_k=25,
    top_p=1,
    repetition_penalty=1.1,
    stop_token_ids=[1, 107],  # Gemma token ids: 1 = <eos>, 107 = <end_of_turn>
)

llm = LLM(
    model="INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0",
    dtype="bfloat16",
    enforce_eager=True  # run in eager mode, without CUDA graph capture
)

messages = [
    # "When was Sofia University founded?"
    {"role": "user", "content": "Кога е основан Софийският университет?"},
]

formatted_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

input_ids = tokenizer(
    formatted_prompt,
    add_special_tokens=False
).input_ids

prompt = TokensPrompt(prompt_token_ids=input_ids)

output = llm.generate(
    prompt,
    sampling_params
)

generated_text = output[0].outputs[0].text
print(generated_text)
```
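
A note on the settings above: `enforce_eager=True` disables vLLM's CUDA graph capture and keeps execution on the plain eager path, which is consistent with the flash attention caveat above, though the README does not state the reason explicitly.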

# Use with GGML / llama.cpp

The model and instructions for usage in GGUF format are available at [INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0-GGUF](https://huggingface.co/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0-GGUF).
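
For orientation, a GGUF file can be loaded locally with, for example, the `llama-cpp-python` bindings. The sketch below uses a hypothetical quantization filename and assumed parameters; the instructions in the linked repository take precedence:

```python
from llama_cpp import Llama

# Hypothetical filename; check the linked GGUF repository for the actual files.
llm = Llama(
    model_path="BgGPT-Gemma-2-27B-IT-v1.0.Q4_K_M.gguf",  # assumed quantization
    n_ctx=4096,  # assumed context window
)

out = llm.create_chat_completion(
    messages=[
        # "When was Sofia University founded?"
        {"role": "user", "content": "Кога е основан Софийският университет?"},
    ],
    max_tokens=256,
)
print(out["choices"][0]["message"]["content"])
```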