# falcon-mini / api.py
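"""FastAPI server that exposes TheBloke/falcon-40b-instruct-GGML (ggmlv3.q2_K)
through completion and chat endpoints, streaming tokens over server-sent
events. Run this file directly to serve on 0.0.0.0:8000 (see the __main__
guard at the bottom).
"""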
import json
from typing import Any, Generator, List

import fastapi
import uvicorn
from ctransformers import AutoModelForCausalLM
from fastapi import HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sse_starlette.sse import EventSourceResponse
from starlette.responses import StreamingResponse

# Load the quantized Falcon-40B-Instruct GGML model via ctransformers.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/falcon-40b-instruct-GGML",
    model_file="falcon40b-instruct.ggmlv3.q2_K.bin",
    model_type="falcon",
    threads=8,
)

app = fastapi.FastAPI(title="🦅Falcon 40B GGML (ggmlv3.q2_K)🦅")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class ChatCompletionRequestV0(BaseModel):
    prompt: str


class Message(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    messages: List[Message]
    max_tokens: int = 250  # accepted but not currently used by the handlers
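
# Request shapes implied by the models above (illustrative examples only):
#   POST /v1/completions and /v0/chat/completions take {"prompt": "..."}
#   POST /v1/chat/completions takes
#       {"messages": [{"role": "user", "content": "..."}], "max_tokens": 250}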
@app.post("/v1/completions")
async def completion(request: ChatCompletionRequestV0, response_mode=None):
response = llm(request.prompt)
return response
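
# A minimal client sketch for the endpoint above (kept commented out so that
# importing this module has no side effects), assuming the server is running
# on localhost:8000 and the third-party `requests` package is installed:
#
#     import requests
#
#     resp = requests.post(
#         "http://localhost:8000/v1/completions",
#         json={"prompt": "What is a falcon?"},
#     )
#     print(resp.json())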
@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequest):
combined_messages = ' '.join([message.content for message in request.messages])
tokens = llm.tokenize(combined_messages)
try:
chat_chunks = llm.generate(tokens)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
    async def format_response(chat_chunks: Generator) -> Any:
        for chat_chunk in chat_chunks:
            # Detokenize each chunk once and reuse the text in both fields.
            content = llm.detokenize(chat_chunk)
            response = {
                'choices': [
                    {
                        'message': {
                            'role': 'system',
                            'content': content
                        },
                        'finish_reason': 'stop' if content == "[DONE]" else 'unknown'
                    }
                ]
            }
            yield f"data: {json.dumps(response)}\n\n"
        yield "event: done\ndata: {}\n\n"

    return StreamingResponse(format_response(chat_chunks), media_type="text/event-stream")
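
# A minimal streaming-client sketch for the endpoint above (commented out, as
# before), assuming localhost:8000 and the `requests` package; stream=True
# lets the SSE "data: ..." lines print as they arrive:
#
#     import requests
#
#     with requests.post(
#         "http://localhost:8000/v1/chat/completions",
#         json={"messages": [{"role": "user", "content": "Hello"}]},
#         stream=True,
#     ) as resp:
#         for line in resp.iter_lines():
#             if line:
#                 print(line.decode())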
@app.post("/v0/chat/completions")
async def chat(request: ChatCompletionRequestV0, response_mode=None):
tokens = llm.tokenize(request.prompt)
async def server_sent_events(chat_chunks, llm):
for chat_chunk in llm.generate(chat_chunks):
yield dict(data=json.dumps(llm.detokenize(chat_chunk)))
yield dict(data="[DONE]")
return EventSourceResponse(server_sent_events(tokens, llm))
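
# Note: EventSourceResponse (sse-starlette) handles the SSE framing itself,
# turning each yielded dict into a "data: ..." event, whereas the /v1 chat
# endpoint above builds its "data: ..." lines by hand inside a plain
# StreamingResponse.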

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)