"""HTTP API that serves a Falcon 40B GGML model through FastAPI.

The model is loaded once at import time and shared by three POST endpoints:

* ``/v1/completions``      - one-shot completion of a raw prompt.
* ``/v1/chat/completions`` - streamed chat completion (``text/event-stream``).
* ``/v0/chat/completions`` - streamed completion of a raw prompt, sent as
  server-sent events via ``sse_starlette``.
"""

import itertools
import json
from typing import Any, AsyncIterator, Dict, Generator, Iterable, List

import fastapi
import uvicorn
from ctransformers import AutoModelForCausalLM
from fastapi import HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from sse_starlette.sse import EventSourceResponse
from starlette.responses import StreamingResponse

# Loaded at import time; every request handler below shares this one instance.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/falcon-40b-instruct-GGML",
    model_file="falcon40b-instruct.ggmlv3.q2_K.bin",
    model_type="falcon",
    threads=8,
)

app = fastapi.FastAPI(title="🦅Falcon 40B GGML (ggmlv3.q2_K)🦅")

# NOTE(review): wildcard origins combined with allow_credentials=True is a very
# permissive CORS policy - confirm this is intended for the deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ChatCompletionRequestV0(BaseModel):
    """Request body carrying a single raw prompt string."""

    prompt: str


class Message(BaseModel):
    """One chat message: the speaker's role and the message text."""

    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    """Request body for the v1 chat endpoint."""

    messages: List[Message]
    # Upper bound on the number of generated tokens streamed back.
    max_tokens: int = 250


@app.post("/v1/completions")
async def completion(request: ChatCompletionRequestV0, response_mode=None):
    """Run the model on ``request.prompt`` and return its output.

    ``response_mode`` is accepted for interface compatibility but is not used.
    """
    # NOTE(review): the model call is synchronous inside an async handler, so
    # it presumably blocks the event loop while generating - confirm whether
    # that implicit serialisation of requests is intentional.
    return llm(request.prompt)


@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequest):
    """Stream a chat completion as ``text/event-stream`` frames.

    The contents of all messages are joined with single spaces (roles are
    ignored) and used as the prompt. Each generated token is sent as one
    ``data:`` frame holding a JSON object; a final ``event: done`` frame marks
    the end of the stream. At most ``request.max_tokens`` tokens are streamed.

    Raises:
        HTTPException: status 500 if creating the token stream fails.
    """
    combined_messages = ' '.join(message.content for message in request.messages)
    tokens = llm.tokenize(combined_messages)

    # Negative limits are clamped to zero because islice rejects them.
    token_limit = max(request.max_tokens, 0)

    try:
        # NOTE(review): generate() appears to be lazy, so failures during
        # generation would surface while streaming rather than here - confirm.
        chat_chunks = itertools.islice(llm.generate(tokens), token_limit)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    async def format_response(chat_chunks: Iterable[int]) -> AsyncIterator[str]:
        """Wrap each generated token in a JSON payload framed as an SSE message."""
        for chat_chunk in chat_chunks:
            # Detokenize once per token; the text feeds both fields below.
            text = llm.detokenize(chat_chunk)
            response = {
                'choices': [
                    {
                        'message': {
                            'role': 'system',
                            'content': text,
                        },
                        'finish_reason': 'stop' if text == "[DONE]" else 'unknown',
                    }
                ]
            }
            yield f"data: {json.dumps(response)}\n\n"
        yield "event: done\ndata: {}\n\n"

    return StreamingResponse(format_response(chat_chunks), media_type="text/event-stream")


# NOTE(review): this definition reuses the name ``chat`` and therefore shadows
# the v1 handler at module level. Both routes stay registered because the
# decorator runs at definition time; the name is kept for compatibility.
@app.post("/v0/chat/completions")
async def chat(request: ChatCompletionRequestV0, response_mode=None):  # noqa: F811
    """Stream a completion of ``request.prompt`` as server-sent events.

    Each event's data is the JSON-encoded text of one generated token; a final
    event with data ``[DONE]`` marks the end of the stream.

    ``response_mode`` is accepted for interface compatibility but is not used.
    """
    tokens = llm.tokenize(request.prompt)

    async def server_sent_events(prompt_tokens, model):
        """Yield one event dict per generated token, then the end marker."""
        for chat_chunk in model.generate(prompt_tokens):
            yield {"data": json.dumps(model.detokenize(chat_chunk))}
        yield {"data": "[DONE]"}

    return EventSourceResponse(server_sent_events(tokens, llm))


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)