yusufs committed
Commit 35decf8 · 1 Parent(s): 6b1968a

feat(one-model): one model at a time


vLLM does not support serving multiple models from one engine. We could instantiate an engine per model, but since I am on a single T4 with limited memory, it is better to keep only one model loaded.
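If the second engine is needed again later, one way to stay at a single engine per process without commenting blocks in and out is to pick the model at startup. Below is a minimal sketch, assuming hypothetical MODEL_NAME / MODEL_REVISION environment variables (they are not part of this Space) and reusing the T4-oriented settings from main.py; it defaults to the Sailor checkpoint that this commit disables.

```python
import os

from vllm import LLM

# Hypothetical environment variables (not defined in this repo): which single
# model the Space should serve. Defaults point at the Sailor checkpoint that
# this commit comments out.
MODEL_NAME = os.environ.get("MODEL_NAME", "sail/Sailor-4B-Chat")
MODEL_REVISION = os.environ.get("MODEL_REVISION", "89a866a7041e6ec023dd462adeca8e28dd53c83e")

# Exactly one engine per process; settings mirror the T4-oriented values in main.py.
engine: LLM = LLM(
    model=MODEL_NAME,
    revision=MODEL_REVISION,
    # max_num_batched_tokens must not be smaller than max_model_len, otherwise
    # vLLM rejects the config with the ValueError quoted in the diff below.
    max_num_batched_tokens=32768,
    max_model_len=32768,
    max_num_seqs=16,                # Reduced for T4
    gpu_memory_utilization=0.85,
    tensor_parallel_size=1,         # Single GPU
    enforce_eager=True,             # Disable CUDA graphs to save memory
    dtype='half',                   # fp16 on T4
)
```

Switching models would then mean restarting the Space with a different MODEL_NAME rather than editing main.py, while only one LLM engine ever occupies the T4.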

Files changed (1)
  1. main.py +34 -39
main.py CHANGED
```diff
@@ -66,20 +66,20 @@ engine_llama_3_2: LLM = LLM(
     dtype='half', # Use 'half' for T4
 )
 
-# ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
-# This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.
-# Please increase max_num_batched_tokens or decrease max_model_len.
-engine_sailor_chat: LLM = LLM(
-    model='sail/Sailor-4B-Chat',
-    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
-    max_num_batched_tokens=32768, # Reduced for T4
-    max_num_seqs=16, # Reduced for T4
-    gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-    tensor_parallel_size=1,
-    max_model_len=32768,
-    enforce_eager=True, # Disable CUDA graph
-    dtype='half', # Use 'half' for T4
-)
+# # ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
+# # This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.
+# # Please increase max_num_batched_tokens or decrease max_model_len.
+# engine_sailor_chat: LLM = LLM(
+#     model='sail/Sailor-4B-Chat',
+#     revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
+#     max_num_batched_tokens=32768, # Reduced for T4
+#     max_num_seqs=16, # Reduced for T4
+#     gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
+#     tensor_parallel_size=1,
+#     max_model_len=32768,
+#     enforce_eager=True, # Disable CUDA graph
+#     dtype='half', # Use 'half' for T4
+# )
 
 
 @app.get("/")
@@ -104,11 +104,6 @@ def greet_json():
                 "revision": "0cb88a4f764b7a12671c53f0838cd831a0843b95",
                 "max_model_len": engine_llama_3_2.llm_engine.model_config.max_model_len,
             },
-            {
-                "name": "sail/Sailor-4B-Chat",
-                "revision": "89a866a7041e6ec023dd462adeca8e28dd53c83e",
-                "max_model_len": engine_sailor_chat.llm_engine.model_config.max_model_len,
-            },
         ]
     }
 
@@ -146,23 +141,23 @@ def generate_text(request: GenerationRequest) -> list[RequestOutput] | dict[str, str]:
         }
 
 
-@app.post("/generate-sailor-chat")
-def generate_text(request: GenerationRequest) -> list[RequestOutput] | dict[str, str]:
-    try:
-        sampling_params: SamplingParams = SamplingParams(
-            temperature=request.temperature,
-            max_tokens=request.max_tokens,
-            logit_bias=request.logit_bias,
-        )
-
-        # Generate text
-        return engine_sailor_chat.generate(
-            prompts=request.prompt,
-            sampling_params=sampling_params
-        )
-
-    except Exception as e:
-        return {
-            "error": str(e)
-        }
-
+# @app.post("/generate-sailor-chat")
+# def generate_text(request: GenerationRequest) -> list[RequestOutput] | dict[str, str]:
+#     try:
+#         sampling_params: SamplingParams = SamplingParams(
+#             temperature=request.temperature,
+#             max_tokens=request.max_tokens,
+#             logit_bias=request.logit_bias,
+#         )
+#
+#         # Generate text
+#         return engine_sailor_chat.generate(
+#             prompts=request.prompt,
+#             sampling_params=sampling_params
+#         )
+#
+#     except Exception as e:
+#         return {
+#             "error": str(e)
+#         }
+#
```