chienweichang committed
Commit ea22264
Parent(s): e00e655

set max-model-len for colab T4
README.md CHANGED
@@ -47,7 +47,11 @@ Documentation on installing and using vLLM [can be found here](https://vllm.read
 For example:
 
 ```shell
-python3 -m vllm.entrypoints.api_server
+python3 -m vllm.entrypoints.api_server \
+    --model chienweichang/Breeze-7B-Instruct-64k-v0_1-AWQ \
+    --quantization awq \
+    --max-model-len 2048 \
+    --dtype auto
 ```
 
 - When using vLLM from Python code, again set `quantization=awq`.
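Once the server from the first hunk is running, it can be queried over HTTP. The sketch below is not part of the README; it assumes vLLM's demo `/generate` endpoint on the default port 8000, and the example prompt is a placeholder.

```python
# Minimal sketch (assumption, not from the README): query the running
# vllm.entrypoints.api_server instance. The /generate endpoint and its
# JSON fields follow vLLM's demo API server and may differ between versions.
import requests

payload = {
    "prompt": "[INST] Tell me about AI [/INST]",  # placeholder prompt
    "max_tokens": 256,
    "temperature": 0.8,
    "top_p": 0.95,
}
response = requests.post("http://localhost:8000/generate", json=payload)
print(response.json())  # expected shape: {"text": [...]}
```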
@@ -65,7 +69,7 @@ prompt_template='''[INST] {prompt} [/INST]
 '''
 prompts = [prompt_template.format(prompt=prompt) for prompt in prompts]
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(model="chienweichang/Breeze-7B-Instruct-64k-v0_1-AWQ", quantization="awq", dtype="half", max_model_len=
+llm = LLM(model="chienweichang/Breeze-7B-Instruct-64k-v0_1-AWQ", quantization="awq", dtype="half", max_model_len=2048)
 outputs = llm.generate(prompts, sampling_params)
 # Print the outputs.
 for output in outputs:
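For reference, the Python snippet touched by the second hunk, written out as a self-contained script. The imports and the `LLM(...)` line match what the README shows; the example prompts and the body of the final loop are assumptions added here for completeness.

```python
from vllm import LLM, SamplingParams

# Placeholder prompts; the README's actual prompt list is not shown in this diff.
prompts = ["Tell me about AI", "Write a story about llamas"]

prompt_template = '''[INST] {prompt} [/INST]
'''

prompts = [prompt_template.format(prompt=prompt) for prompt in prompts]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# max_model_len=2048 is what this commit sets so the model fits a Colab T4 GPU.
llm = LLM(model="chienweichang/Breeze-7B-Instruct-64k-v0_1-AWQ",
          quantization="awq", dtype="half", max_model_len=2048)

outputs = llm.generate(prompts, sampling_params)

# Print the outputs (loop body assumed; the diff only shows the loop header).
for output in outputs:
    print(f"Prompt: {output.prompt!r}")
    print(f"Generated: {output.outputs[0].text!r}")
```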