dh-mc committed
Commit d2aae87 · 1 Parent(s): 4aab576

creating openai compatible server

TurtleSoupBaseline/openai_api_server.py CHANGED
@@ -17,7 +17,9 @@ from transformers import AutoTokenizer, LogitsProcessor
 from sse_starlette.sse import EventSourceResponse
 
 EventSourceResponse.DEFAULT_PING_INTERVAL = 1000
-MODEL_PATH = 'THUDM/glm-4-9b-chat'
+MODEL_PATH = (
+    "../llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full/checkpoint-528"
+)
 MAX_MODEL_LENGTH = 8192
 
 
@@ -125,14 +127,16 @@ class ChatCompletionResponse(BaseModel):
     model: str
     id: str
     object: Literal["chat.completion", "chat.completion.chunk"]
-    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+    choices: List[
+        Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]
+    ]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
     usage: Optional[UsageInfo] = None
 
 
 class InvalidScoreLogitsProcessor(LogitsProcessor):
     def __call__(
-            self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
     ) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
             scores.zero_()
@@ -154,13 +158,10 @@ def process_response(output: str, use_tool: bool = False) -> Union[str, dict]:
                 parameters = eval(content.strip())
                 content = {
                     "name": metadata.strip(),
-                    "arguments": json.dumps(parameters, ensure_ascii=False)
+                    "arguments": json.dumps(parameters, ensure_ascii=False),
                 }
             else:
-                content = {
-                    "name": metadata.strip(),
-                    "content": content
-                }
+                content = {"name": metadata.strip(), "content": content}
     return content
 
 
@@ -174,7 +175,9 @@ async def generate_stream_glm4(params):
     top_p = float(params.get("top_p", 1.0))
     max_new_tokens = int(params.get("max_tokens", 8192))
     messages = process_messages(messages, tools=tools, tool_choice=tool_choice)
-    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+    inputs = tokenizer.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
     params_dict = {
         "n": 1,
         "best_of": 1,
@@ -195,7 +198,9 @@ async def generate_stream_glm4(params):
         "skip_special_tokens": True,
     }
     sampling_params = SamplingParams(**params_dict)
-    async for output in engine.generate(inputs=inputs, sampling_params=sampling_params, request_id="glm-4-9b"):
+    async for output in engine.generate(
+        inputs=inputs, sampling_params=sampling_params, request_id="glm-4-9b"
+    ):
         output_len = len(output.outputs[0].token_ids)
         input_len = len(output.prompt_token_ids)
         ret = {
@@ -203,7 +208,7 @@ async def generate_stream_glm4(params):
             "usage": {
                 "prompt_tokens": input_len,
                 "completion_tokens": output_len,
-                "total_tokens": output_len + input_len
+                "total_tokens": output_len + input_len,
             },
             "finish_reason": output.outputs[0].finish_reason,
         }
@@ -218,12 +223,13 @@ def process_messages(messages, tools=None, tool_choice="none"):
     msg_has_sys = False
 
     def filter_tools(tool_choice, tools):
-        function_name = tool_choice.get('function', {}).get('name', None)
+        function_name = tool_choice.get("function", {}).get("name", None)
         if not function_name:
            return []
        filtered_tools = [
-            tool for tool in tools
-            if tool.get('function', {}).get('name') == function_name
+            tool
+            for tool in tools
+            if tool.get("function", {}).get("name") == function_name
        ]
        return filtered_tools
 
@@ -231,13 +237,7 @@ def process_messages(messages, tools=None, tool_choice="none"):
         if isinstance(tool_choice, dict):
             tools = filter_tools(tool_choice, tools)
         if tools:
-            messages.append(
-                {
-                    "role": "system",
-                    "content": None,
-                    "tools": tools
-                }
-            )
+            messages.append({"role": "system", "content": None, "tools": tools})
             msg_has_sys = True
 
     # add to metadata
@@ -246,19 +246,14 @@ def process_messages(messages, tools=None, tool_choice="none"):
             {
                 "role": "assistant",
                 "metadata": tool_choice["function"]["name"],
-                "content": ""
+                "content": "",
             }
         )
 
     for m in _messages:
         role, content, func_call = m.role, m.content, m.function_call
         if role == "function":
-            messages.append(
-                {
-                    "role": "observation",
-                    "content": content
-                }
-            )
+            messages.append({"role": "observation", "content": content})
         elif role == "assistant" and func_call is not None:
             for response in content.split("<|assistant|>"):
                 if "\n" in response:
@@ -266,11 +261,7 @@ def process_messages(messages, tools=None, tool_choice="none"):
                 else:
                     metadata, sub_content = "", response
                 messages.append(
-                    {
-                        "role": role,
-                        "metadata": metadata,
-                        "content": sub_content.strip()
-                    }
+                    {"role": role, "metadata": metadata, "content": sub_content.strip()}
                 )
         else:
             if role == "system" and msg_has_sys:
@@ -315,7 +306,9 @@ async def create_chat_completion(request: ChatCompletionRequest):
         predict_stream_generator = predict_stream(request.model, gen_params)
         output = await anext(predict_stream_generator)
         if output:
-            return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
+            return EventSourceResponse(
+                predict_stream_generator, media_type="text/event-stream"
+            )
         logger.debug(f"First result output:\n{output}")
 
         function_call = None
@@ -332,7 +325,9 @@ async def create_chat_completion(request: ChatCompletionRequest):
             if not gen_params.get("messages"):
                 gen_params["messages"] = []
             gen_params["messages"].append(ChatMessage(role="assistant", content=output))
-            gen_params["messages"].append(ChatMessage(role="tool", name=function_call.name, content=tool_response))
+            gen_params["messages"].append(
+                ChatMessage(role="tool", name=function_call.name, content=tool_response)
+            )
             generate = predict(request.model, gen_params)
             return EventSourceResponse(generate, media_type="text/event-stream")
         else:
@@ -354,7 +349,8 @@ async def create_chat_completion(request: ChatCompletionRequest):
             function_call = process_response(response["text"], use_tool=True)
         except:
             logger.warning(
-                "Failed to parse tool call, maybe the response is not a function call(such as cogview drawing) or have been answered.")
+                "Failed to parse tool call, maybe the response is not a function call(such as cogview drawing) or have been answered."
+            )
 
     if isinstance(function_call, dict):
         finish_reason = "function_call"
@@ -363,7 +359,9 @@ async def create_chat_completion(request: ChatCompletionRequest):
     message = ChatMessage(
         role="assistant",
         content=response["text"],
-        function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
+        function_call=(
+            function_call if isinstance(function_call, FunctionCallResponse) else None
+        ),
     )
 
     logger.debug(f"==== message ====\n{message}")
@@ -382,23 +380,23 @@ async def create_chat_completion(request: ChatCompletionRequest):
         id="",  # for open_source model, id is empty
         choices=[choice_data],
         object="chat.completion",
-        usage=usage
+        usage=usage,
     )
 
 
 async def predict(model_id: str, params: dict):
     choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(role="assistant"),
-        finish_reason=None
+        index=0, delta=DeltaMessage(role="assistant"), finish_reason=None
+    )
+    chunk = ChatCompletionResponse(
+        model=model_id, id="", choices=[choice_data], object="chat.completion.chunk"
     )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
 
     previous_text = ""
     async for new_response in generate_stream_glm4(params):
         decoded_unicode = new_response["text"]
-        delta_text = decoded_unicode[len(previous_text):]
+        delta_text = decoded_unicode[len(previous_text) :]
         previous_text = decoded_unicode
 
         finish_reason = new_response["finish_reason"]
@@ -411,7 +409,8 @@ async def predict(model_id: str, params: dict):
                 function_call = process_response(decoded_unicode, use_tool=True)
             except:
                 logger.warning(
-                    "Failed to parse tool call, maybe the response is not a tool call or have been answered.")
+                    "Failed to parse tool call, maybe the response is not a tool call or have been answered."
+                )
 
         if isinstance(function_call, dict):
             function_call = FunctionCallResponse(**function_call)
@@ -419,48 +418,42 @@ async def predict(model_id: str, params: dict):
         delta = DeltaMessage(
             content=delta_text,
             role="assistant",
-            function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
+            function_call=(
+                function_call
+                if isinstance(function_call, FunctionCallResponse)
+                else None
+            ),
         )
 
         choice_data = ChatCompletionResponseStreamChoice(
-            index=0,
-            delta=delta,
-            finish_reason=finish_reason
+            index=0, delta=delta, finish_reason=finish_reason
        )
        chunk = ChatCompletionResponse(
-            model=model_id,
-            id="",
-            choices=[choice_data],
-            object="chat.completion.chunk"
+            model=model_id, id="", choices=[choice_data], object="chat.completion.chunk"
        )
        yield "{}".format(chunk.model_dump_json(exclude_unset=True))
 
     choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(),
-        finish_reason="stop"
+        index=0, delta=DeltaMessage(), finish_reason="stop"
    )
    chunk = ChatCompletionResponse(
-        model=model_id,
-        id="",
-        choices=[choice_data],
-        object="chat.completion.chunk"
+        model=model_id, id="", choices=[choice_data], object="chat.completion.chunk"
    )
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-    yield '[DONE]'
+    yield "[DONE]"
 
 
 async def predict_stream(model_id, gen_params):
     output = ""
     is_function_call = False
     has_send_first_chunk = False
-    async for new_response in generate_stream_glm4(gen_params):
+    async for new_response in generate_stream_glm4(gen_params):
         decoded_unicode = new_response["text"]
-        delta_text = decoded_unicode[len(output):]
+        delta_text = decoded_unicode[len(output) :]
        output = decoded_unicode
 
        if not is_function_call and len(output) > 7:
-            is_function_call = output and 'get_' in output
+            is_function_call = output and "get_" in output
            if is_function_call:
                continue
@@ -472,16 +465,14 @@ async def predict_stream(model_id, gen_params):
                 function_call=None,
             )
             choice_data = ChatCompletionResponseStreamChoice(
-                index=0,
-                delta=message,
-                finish_reason=finish_reason
+                index=0, delta=message, finish_reason=finish_reason
             )
             chunk = ChatCompletionResponse(
                 model=model_id,
                 id="",
                 choices=[choice_data],
                 created=int(time.time()),
-                object="chat.completion.chunk"
+                object="chat.completion.chunk",
             )
             yield "{}".format(chunk.model_dump_json(exclude_unset=True))
 
@@ -493,41 +484,39 @@ async def predict_stream(model_id, gen_params):
                 function_call=None,
             )
             choice_data = ChatCompletionResponseStreamChoice(
-                index=0,
-                delta=message,
-                finish_reason=finish_reason
+                index=0, delta=message, finish_reason=finish_reason
             )
             chunk = ChatCompletionResponse(
                 model=model_id,
                 id="",
                 choices=[choice_data],
                 created=int(time.time()),
-                object="chat.completion.chunk"
+                object="chat.completion.chunk",
             )
             yield "{}".format(chunk.model_dump_json(exclude_unset=True))
 
     if is_function_call:
         yield output
     else:
-        yield '[DONE]'
+        yield "[DONE]"
 
 
 async def parse_output_text(model_id: str, value: str):
     choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(role="assistant", content=value),
-        finish_reason=None
+        index=0, delta=DeltaMessage(role="assistant", content=value), finish_reason=None
+    )
+    chunk = ChatCompletionResponse(
+        model=model_id, id="", choices=[choice_data], object="chat.completion.chunk"
    )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(),
-        finish_reason="stop"
+        index=0, delta=DeltaMessage(), finish_reason="stop"
+    )
+    chunk = ChatCompletionResponse(
+        model=model_id, id="", choices=[choice_data], object="chat.completion.chunk"
    )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-    yield '[DONE]'
+    yield "[DONE]"
 
 
 if __name__ == "__main__":
@@ -546,4 +535,4 @@ if __name__ == "__main__":
         max_model_len=MAX_MODEL_LENGTH,
     )
     engine = AsyncLLMEngine.from_engine_args(engine_args)
-    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
+    uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
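
Note on the main change: MODEL_PATH now points at a local LLaMA-Factory checkpoint instead of THUDM/glm-4-9b-chat, while generate_stream_glm4 still builds the prompt with tokenizer.apply_chat_template and tags vLLM requests as "glm-4-9b". A minimal sketch for checking that the new checkpoint's tokenizer actually ships a chat template before launching the server; trust_remote_code and the sample message are assumptions, and the path is copied verbatim from the diff:

```python
# Sketch: verify the new MODEL_PATH provides a usable chat template,
# since generate_stream_glm4 relies on tokenizer.apply_chat_template.
# trust_remote_code=True is an assumption (common for InternLM2 tokenizers);
# the checkpoint directory must exist locally.
from transformers import AutoTokenizer

MODEL_PATH = "../llama-factory/saves/internlm2_5_7b/lora/sft_bf16_p2_full/checkpoint-528"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "hello"}],
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)  # prints the fully rendered prompt string the server will feed to vLLM
```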
requirements.txt CHANGED
@@ -14,3 +14,5 @@ langchain_openai==0.1.13
 wandb==0.17.4
 # triton
 # xformers
+uvicorn
+vllm
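
With uvicorn and vllm added, the server can be started with python TurtleSoupBaseline/openai_api_server.py and consumed like any OpenAI-compatible endpoint. A client sketch, assuming the app exposes the standard /v1/chat/completions route on port 8000, enforces no authentication, and treats the model name as a label only; the openai package (v1 SDK) is an extra dependency not listed above:

```python
# Sketch of a streaming request against the local server.
# Assumptions: /v1/chat/completions is the mounted route (standard for
# OpenAI-compatible servers), api_key is unused, and the model string is
# only echoed back in the response, so any placeholder works.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="glm-4-9b",  # placeholder name; generation uses the server-side MODEL_PATH
    messages=[{"role": "user", "content": "Give me a short turtle soup puzzle."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)
```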