Moses25 committed on
Commit
bac51cb
1 Parent(s): 820797a

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +32 -0
README.md CHANGED
@@ -64,4 +64,36 @@ python -m vllm.entrypoints.openai.api_server --model=$model_path \
64
  --gpu-memory-utilization 0.8 \
65
  --max-model-len 8192 --chat-template llama2-chat-template.jinja \
66
  --tensor-parallel-size 1 --served-model-name chatbot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  ```
 
64
  --gpu-memory-utilization 0.8 \
65
  --max-model-len 8192 --chat-template llama2-chat-template.jinja \
66
  --tensor-parallel-size 1 --served-model-name chatbot
67
+ ```
68
+ ```
69
+
70
+ from openai import OpenAI
71
+ # Set OpenAI's API key and API base to use vLLM's API server.
72
+ openai_api_key = "EMPTY"
73
+ openai_api_base = "http://localhost:7777/v1"
74
+
75
+ client = OpenAI(
76
+ api_key=openai_api_key,
77
+ base_url=openai_api_base,
78
+ )
79
+ call_args = {
80
+ 'temperature': 0.7,
81
+ 'top_p': 0.9,
82
+ 'top_k': 40,
83
+ 'max_tokens': 2048, # output-len
84
+ 'presence_penalty': 1.0,
85
+ 'frequency_penalty': 0.0,
86
+ "repetition_penalty":1.0,
87
+ "stop":["</s>"],
88
+ }
89
+ chat_response = client.chat.completions.create(
90
+ model="llama",
91
+ messages=[
92
+ {"role": "user", "content": "你好"},
93
+ ],
94
+ extra_body=call_args
95
+ )
96
+ print("Chat response:", chat_response)
97
+
98
+
99
  ```