Update README.md
Browse files
README.md
CHANGED
|
@@ -77,3 +77,16 @@ outputs = model.generate(**inputs.to(model.device), max_new_tokens=1000, cache_i
|
|
| 77 |
if(backend == 'gemlite'):
|
| 78 |
gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json')
|
| 79 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
if(backend == 'gemlite'):
|
| 78 |
gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json')
|
| 79 |
```
|
| 80 |
+
|
| 81 |
+
Usage with <a href="https://github.com/vllm-project/vllm/">vLLM</a>:
|
| 82 |
+
```Python
|
| 83 |
+
from vllm import LLM
|
| 84 |
+
from vllm.sampling_params import SamplingParams
|
| 85 |
+
|
| 86 |
+
model_id = "mobiuslabsgmbh/Qwen2.5-14B-Instruct-1M_4bitgs64_hqq_hf"
|
| 87 |
+
|
| 88 |
+
llm = LLM(model=model_id, max_model_len=4096, enable_chunked_prefill=False)
|
| 89 |
+
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
|
| 90 |
+
outputs = llm.generate(["What is the capital of Germany?"], sampling_params)
|
| 91 |
+
print(outputs[0].outputs[0].text)
|
| 92 |
+
```
|