matthoffner committed
Commit 94d3ebe
1 Parent(s): 210500b

Update main.py

Files changed (1):
  1. main.py +18 -15
main.py CHANGED
@@ -6,9 +6,10 @@ from fastapi import HTTPException
 from fastapi.responses import HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse
+from starlette.responses import StreamingResponse
 from ctransformers import AutoModelForCausalLM
 from pydantic import BaseModel
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Generator
 
 
 llm = AutoModelForCausalLM.from_pretrained("TheBloke/WizardCoder-15B-1.0-GGML",
@@ -60,7 +61,7 @@ async def completion(request: ChatCompletionRequest, response_mode=None):
     return response
 
 @app.post("/v1/chat/completions")
-async def chat(request: ChatCompletionRequestV2):
+async def chat(request: ChatCompletionRequest):
     tokens = llm.tokenize([message.content for message in request.messages])
 
     try:
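
Note on the signature change: the v1 handler now takes the same ChatCompletionRequest model as the other endpoints, retiring ChatCompletionRequestV2. The model definitions sit above these hunks and are not part of the diff; judging from the request.messages and message.content accesses visible in the handler, they presumably look something like this hypothetical sketch:

# Hypothetical reconstruction -- the actual models are defined
# elsewhere in main.py and are not shown in this diff.
from typing import List
from pydantic import BaseModel

class Message(BaseModel):
    role: str      # e.g. "user" or "system"
    content: str   # accessed as message.content when tokenizing

class ChatCompletionRequest(BaseModel):
    messages: List[Message]   # accessed as request.messages in the handler
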
@@ -68,21 +69,23 @@ async def chat(request: ChatCompletionRequestV2):
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
-    def format_response(chat_chunks) -> Dict[str, Any]:
-        response = {
-            'choices': []
-        }
+    async def format_response(chat_chunks: Generator) -> Any:
         for chat_chunk in chat_chunks:
-            response['choices'].append({
-                'message': {
-                    'role': 'system',
-                    'content': llm.detokenize(chat_chunk)
-                },
-                'finish_reason': 'stop' if llm.detokenize(chat_chunk) == "[DONE]" else 'unknown'
-            })
-        return response
+            response = {
+                'choices': [
+                    {
+                        'message': {
+                            'role': 'system',
+                            'content': llm.detokenize(chat_chunk)
+                        },
+                        'finish_reason': 'stop' if llm.detokenize(chat_chunk) == "[DONE]" else 'unknown'
+                    }
+                ]
+            }
+            yield f"data: {json.dumps(response)}\n\n"
+        yield "event: done\ndata: {}\n\n"
 
-    return format_response(chat_chunks)
+    return StreamingResponse(format_response(chat_chunks), media_type="text/event-stream")
 
 @app.post("/v0/chat/completions")
 async def chat(request: ChatCompletionRequest, response_mode=None):
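
Note on the streaming rewrite: format_response no longer accumulates every chunk into one response dict; it is now an async generator that wraps each detokenized chunk in an OpenAI-style choices payload, yields it as a server-sent event, and emits a final event: done frame, while the handler returns a StreamingResponse with media_type="text/event-stream". Each frame on the wire should look roughly like this (illustrative content, not real model output):

data: {"choices": [{"message": {"role": "system", "content": "def hello():"}, "finish_reason": "unknown"}]}

Two caveats: json.dumps requires import json, which is not visible in this diff and is presumably already present above line 6 of main.py; and the plain for loop inside the async generator blocks the event loop while each chunk is generated, a trade-off this commit does not address.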
 
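A minimal way to exercise the new endpoint, assuming the app is served locally on port 8000 (host and port are not shown in this diff) and reusing the request shape sketched above:

import json
import requests

payload = {"messages": [{"role": "user", "content": "Write hello world in Python"}]}

# Stream the response instead of buffering it, so chunks print as they arrive.
with requests.post(
    "http://localhost:8000/v1/chat/completions",
    json=payload,
    stream=True,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue  # blank lines separate SSE frames
        if line.startswith("event: done"):
            break  # final frame emitted after the generator is exhausted
        if line.startswith("data: "):
            chunk = json.loads(line[len("data: "):])
            print(chunk["choices"][0]["message"]["content"], end="", flush=True)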