wizardcoder-ggml / main.py
matthoffner's picture
Match openai completions api
0d49ac1
raw
history blame
3.21 kB
import fastapi
import json
import markdown
import uvicorn
from fastapi import HTTPException
from fastapi.responses import HTMLResponse, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
from ctransformers import AutoModelForCausalLM
from pydantic import BaseModel
from typing import List, Dict, Any
llm = AutoModelForCausalLM.from_pretrained("TheBloke/WizardCoder-15B-1.0-GGML",
model_file="WizardCoder-15B-1.0.ggmlv3.q4_0.bin",
model_type="starcoder")
app = fastapi.FastAPI(title="WizardCoder")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
async def index():
html_content = """
<html>
<head>
</head>
<body style="background-color:black">
<h2 style="font-family:system-ui"><a href="https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML">wizardcoder-ggml</a></h2>
<iframe
src="https://matthoffner-monacopilot.hf.space"
frameborder="0"
width="95%"
height="90%"
></iframe>
<h2 style="font-family:system-ui"><a href="https://matthoffner-wizardcoder-ggml.hf.space/docs">FastAPI Docs</a></h2>
</body>
</html>
"""
return HTMLResponse(content=html_content, status_code=200)
class ChatCompletionRequest(BaseModel):
prompt: str
class Message(BaseModel):
role: str
content: str
class ChatCompletionRequestV2(BaseModel):
messages: List[Message]
max_tokens: int = 100
@app.post("/v1/completions")
async def completion(request: ChatCompletionRequest, response_mode=None):
response = llm(request.prompt)
return response
@app.post("/v2/chat/completions")
async def chat(request: ChatCompletionRequestV2):
tokens = llm.tokenize([message.content for message in request.messages])
try:
chat_chunks = llm.generate(tokens, max_tokens=request.max_tokens)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
def format_response(chat_chunks) -> Dict[str, Any]:
response = {
'choices': []
}
for chat_chunk in chat_chunks:
response['choices'].append({
'message': {
'role': 'system',
'content': llm.detokenize(chat_chunk)
},
'finish_reason': 'stop' if llm.detokenize(chat_chunk) == "[DONE]" else 'unknown'
})
return response
return format_response(chat_chunks)
@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequest, response_mode=None):
tokens = llm.tokenize(request.prompt)
async def server_sent_events(chat_chunks, llm):
for chat_chunk in llm.generate(chat_chunks):
yield dict(data=json.dumps(llm.detokenize(chat_chunk)))
yield dict(data="[DONE]")
return EventSourceResponse(server_sent_events(tokens, llm))
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)