"""Small FastAPI app that streams chat-completion chunks from a Gradio Space.

The chunk payloads mimic the ``chat.completion.chunk`` objects of the OpenAI
streaming API and are delivered as server-sent events.
"""

import asyncio
import json
import time
from typing import Any, Dict, Optional

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from gradio_client import Client

app = FastAPI()
client = Client("AWeirdDev/mistral-7b-instruct-v0.2")

# Unique marker returned by ``next`` once the wrapped iterator is exhausted.
_EXHAUSTED = object()


async def stream(iter):
    """Expose a blocking iterator as an async generator.

    Each item is fetched in a worker thread via ``asyncio.to_thread`` so the
    event loop is not blocked while the underlying iterator waits.

    Args:
        iter: Any iterator (an object implementing ``__next__``).

    Yields:
        The items produced by ``iter``, in order.
    """
    while True:
        # A sentinel is used instead of catching StopIteration: StopIteration
        # cannot be propagated through an asyncio Future, so it would never
        # reach an ``except StopIteration`` clause around the ``await``.
        value = await asyncio.to_thread(next, iter, _EXHAUSTED)
        if value is _EXHAUSTED:
            break
        yield value


def make_chunk_obj(i: int, delta: str, fr: Optional[str]) -> Dict[str, Any]:
    """Build one ``chat.completion.chunk`` payload.

    Args:
        i: Value stored in the ``index`` field of the single choice.
        delta: Text placed in ``choices[0].delta.content``.
        fr: Value of ``finish_reason`` (``None`` while still generating).

    Returns:
        A JSON-serialisable dict describing the chunk. ``id`` is derived from
        ``time.time_ns()`` and ``created`` from the current Unix time, so both
        differ between calls.
    """
    return {
        "id": str(time.time_ns()),
        "object": "chat.completion.chunk",
        "created": round(time.time()),
        "model": "mistral-7b-instruct-v0.2",
        "system_fingerprint": "wtf",
        "choices": [
            {
                "index": i,
                "delta": {
                    "content": delta,
                },
                "finish_reason": fr,
            }
        ],
    }


def _sse_event(payload: str) -> str:
    """Frame ``payload`` as one server-sent event (terminated by a blank line)."""
    return f"data: {payload}\n\n"


@app.get('/')
async def index():
    """Return a static greeting payload."""
    return {"message": "hello"}


@app.post('/chat/completions')
async def c_cmp():
    """Stream a completion for a fixed prompt as server-sent events.

    The prompt and sampling parameters are hard-coded; the request body is not
    read. The response is a ``text/event-stream`` of chunk objects, followed by
    a chunk with ``finish_reason="stop"`` and a final ``[DONE]`` marker.
    """

    def streamer():
        text = ""
        result = client.submit(
            "Hello!!",
            0.9,   # float (numeric value between 0.0 and 1.0) in 'Temperature' Slider component
            # NOTE(review): 4096 is outside the 0-1048 range stated for this
            # slider below -- confirm against the Space before changing it.
            4096,  # float (numeric value between 0 and 1048) in 'Max new tokens' Slider component
            .9,    # float (numeric value between 0.0 and 1) in 'Top-p (nucleus sampling)' Slider component
            1,     # float (numeric value between 1.0 and 2.0) in 'Repetition penalty' Slider component
            api_name="/chat",
        )

        # Each item is assumed to be the full text generated so far, so the
        # new part is whatever follows the previously seen prefix.
        for item in result:
            delta = item[len(text):]
            # ``index`` is the position of the choice, not of the chunk; there
            # is only ever one choice, hence the constant 0.
            yield _sse_event(json.dumps(make_chunk_obj(0, delta, None)))
            text = item

        # The closing chunk carries no text: re-sending the last delta would
        # duplicate it for clients that concatenate deltas, and referring to
        # loop variables here would fail when the job produced no output.
        yield _sse_event(json.dumps(make_chunk_obj(0, "", "stop")))
        yield _sse_event("[DONE]")

    return StreamingResponse(streamer(), media_type="text/event-stream")