Update README.md
README.md

```
python -m vllm.entrypoints.openai.api_server --model=$model_path \
    --gpu-memory-utilization 0.8 \
    --max-model-len 8192 --chat-template llama2-chat-template.jinja \
    --tensor-parallel-size 1 --served-model-name chatbot
```
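
Before sending chat requests, you can check that the server is up and see which model names it serves. A minimal sketch using the same OpenAI Python client as the example below; it assumes the server listens on port 7777 as that example expects (vLLM defaults to port 8000, so add `--port 7777` to the launch command if the port is not set elsewhere):

```
from openai import OpenAI

# Point the client at the local vLLM server; the key just needs to be non-empty.
client = OpenAI(api_key="EMPTY", base_url="http://localhost:7777/v1")

# Each served model id should match --served-model-name ("chatbot" above).
for model in client.models.list():
    print(model.id)
```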

```
from openai import OpenAI

# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:7777/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Sampling parameters; vLLM-specific options such as top_k and
# repetition_penalty are passed through extra_body below.
call_args = {
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 40,
    "max_tokens": 2048,  # maximum output length
    "presence_penalty": 1.0,
    "frequency_penalty": 0.0,
    "repetition_penalty": 1.0,
    "stop": ["</s>"],
}

# The model name must match --served-model-name ("chatbot" above).
chat_response = client.chat.completions.create(
    model="chatbot",
    messages=[
        {"role": "user", "content": "你好"},  # "Hello"
    ],
    extra_body=call_args,
)
print("Chat response:", chat_response)
```
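
The response object above contains the full completion payload. To print only the assistant's reply, or to stream tokens as they arrive, a sketch along these lines should work (it reuses the `client` and `call_args` defined above; `stream=True` is standard in both the OpenAI client and vLLM's server):

```
# Print only the assistant's reply text.
print(chat_response.choices[0].message.content)

# Streaming variant: print tokens as the server generates them.
stream = client.chat.completions.create(
    model="chatbot",
    messages=[{"role": "user", "content": "你好"}],  # "Hello"
    extra_body=call_args,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```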