moevis committed on
Commit
e9abbb0
·
1 Parent(s): 90dbb35

replace huggingface_hub with vllm

Browse files
Files changed (1) hide show
  1. app.py +28 -19
app.py CHANGED
@@ -1,5 +1,13 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
3
 
4
 
5
  def respond(
@@ -12,32 +20,33 @@ def respond(
12
  hf_token: gr.OAuthToken,
13
  ):
14
  """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
16
  """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
-
19
  messages = [{"role": "system", "content": system_message}]
20
-
21
  messages.extend(history)
22
-
23
  messages.append({"role": "user", "content": message})
24
 
25
- response = ""
26
-
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
  temperature=temperature,
32
  top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
 
 
 
 
 
 
 
38
 
39
- response += token
40
- yield response
 
41
 
42
 
43
  """
 
1
  import gradio as gr
2
+ from vllm import LLM, SamplingParams
3
+
4
+ llm = LLM(
5
+ model="stepfun-ai/Step-Audio-2-mini-Think", # 修改为你需要的模型
6
+ trust_remote_code=True,
7
+ tensor_parallel_size=2, # 如果有多张GPU,设置并行数量
8
+ # gpu_memory_utilization=0.9, # GPU显存利用率
9
+ max_model_len=8192,
10
+ )
11
 
12
 
13
  def respond(
 
20
  hf_token: gr.OAuthToken,
21
  ):
22
  """
23
+ 使用 vllm 在本地进行推理
24
  """
25
+ # 构建对话消息
 
26
  messages = [{"role": "system", "content": system_message}]
 
27
  messages.extend(history)
 
28
  messages.append({"role": "user", "content": message})
29
 
30
+ # 设置采样参数
31
+ sampling_params = SamplingParams(
 
 
 
 
32
  temperature=temperature,
33
  top_p=top_p,
34
+ max_tokens=max_tokens,
35
+ )
36
+
37
+ # 使用 vllm 的 chat 接口进行推理
38
+ outputs = llm.chat(
39
+ messages=messages,
40
+ sampling_params=sampling_params,
41
+ use_tqdm=False,
42
+ )
43
+
44
+ # 获取生成的文本
45
+ response = outputs[0].outputs[0].text
46
 
47
+ # 模拟流式输出效果(逐字符yield)
48
+ for i in range(1, len(response) + 1):
49
+ yield response[:i]
50
 
51
 
52
  """