Support streaming responses (stream=True)
main.py CHANGED

@@ -1,9 +1,10 @@
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
 from pydantic import BaseModel, Field
 from typing import List, Optional
 import os
 import warnings
+import json
 
 # Suppress Pydantic deprecation warnings (optional, keeps logs clean)
 warnings.filterwarnings("ignore", category=DeprecationWarning, module="pydantic")

@@ -34,22 +35,38 @@ class ChatRequest(BaseModel):
     model: str = Field(..., description="Model identifier (ignored, single model)")
     messages: List[Message] = Field(..., description="List of messages")
     max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
-    stream: Optional[bool] = Field(False, description="Stream response (
+    stream: Optional[bool] = Field(False, description="Stream response (SSE)")
 
 @app.post("/v1/chat/completions")
 async def chat_completion(req: ChatRequest):
     """
     OpenAI-compatible Chat Completions endpoint.
-
+    Supports stream=True (SSE) and stream=False (full JSON).
     """
     try:
         # Use model_dump() instead of the deprecated dict() to silence Pydantic warnings
         messages_list = [m.model_dump() for m in req.messages]
-
+
+        # Streaming response
+        if req.stream:
+            # llama.cpp generator (synchronous)
+            result_stream = llm.create_chat_completion(
+                messages=messages_list,
+                max_tokens=req.max_tokens,
+                stream=True,
+            )
+            async def sse_generator():
+                for chunk in result_stream:
+                    # Each chunk is already an OpenAI-format dict
+                    yield f"data: {json.dumps(chunk)}\n\n"
+                yield "data: [DONE]\n\n"
+            return StreamingResponse(sse_generator(), media_type="text/event-stream")
+
+        # Non-streaming response
         result = llm.create_chat_completion(
             messages=messages_list,
             max_tokens=req.max_tokens,
-            stream=
+            stream=False,
        )
         return JSONResponse(content=result)
     except Exception as e:

@@ -63,4 +80,4 @@ async def healthz():
 if __name__ == "__main__":
     import uvicorn
     port = int(os.getenv("PORT", 7860))
-    uvicorn.run(app, host="0.0.0.0", port=port)
+    uvicorn.run(app, host="0.0.0.0", port=port)
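
One caveat in the streaming branch above: sse_generator is declared async def, but it pulls chunks from a synchronous llama.cpp generator, so the event loop is blocked while each token is generated. A minimal non-blocking sketch, assuming Starlette's iterate_in_threadpool helper (this is not part of the commit; result_stream is the same generator as in the diff):

import json
from starlette.concurrency import iterate_in_threadpool

async def sse_generator(result_stream):
    # Each next() on the sync iterator runs on a threadpool worker,
    # so the event loop stays free between tokens.
    async for chunk in iterate_in_threadpool(result_stream):
        yield f"data: {json.dumps(chunk)}\n\n"
    yield "data: [DONE]\n\n"

# The endpoint's return statement would then read:
# return StreamingResponse(sse_generator(result_stream), media_type="text/event-stream")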
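For a quick end-to-end check of the new SSE path, the endpoint can be exercised with the official OpenAI Python client, assuming the server is running locally on the default PORT from the diff (7860); the model name is a placeholder, since the server ignores it:

from openai import OpenAI

# api_key is required by the client but unused by this server
client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")

stream = client.chat.completions.create(
    model="local",  # ignored (single model)
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=64,
    stream=True,  # exercises the new streaming branch
)
for chunk in stream:
    # The client parses each SSE "data:" payload and stops at "data: [DONE]"
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)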