# Hugging Face Space: daniellefranca96/chat_cpu_inf
# Space status: Sleeping
# File size: 1,942 Bytes

import time
import gradio as gr
import json
import requests
import os


# Labels offered in the "LLM" dropdown. The selected label is sent verbatim
# to the inference service as the ``llm`` field of the request body.
models = [
    "TinyLLama 1b 4_K_M 2048",
    "TinyLLama 1b OpenOrca 4_K_M 2048",
    "OpenLLama 3b 4_K_M 196k",
    "Phi-2 2.7b 4_K_M 2048",
    "Stable Zephyr 3b 4_K_M 4096",
]
 

def make_request_to_llm(llm, prompt, max_new_tokens, nctx, *, timeout=None):
    """Send one generation request to the remote CPU inference endpoint.

    Args:
        llm: Model label, forwarded in the ``llm`` field of the JSON body.
        prompt: Prompt text, forwarded in the ``prompt`` field.
        max_new_tokens: Forwarded in the ``max_new_tokens`` field.
        nctx: Forwarded in the ``nctx`` field.
        timeout: Optional timeout in seconds handed to ``requests.post``.
            The default ``None`` keeps the previous behaviour of waiting
            for the response without a time limit.

    Returns:
        The response body as text. The HTTP status is not inspected, so an
        error body returned by the service is passed through unchanged.
    """
    headers = {"Content-Type": "application/json"}

    # Only authenticate when a token is actually configured; formatting a
    # missing token would send the literal header value "Bearer None".
    token = os.getenv('HF_TOKEN')
    if token:
        headers["Authorization"] = f"Bearer {token}"

    body = {'prompt': prompt, 'max_new_tokens': max_new_tokens, "llm": llm, "nctx": nctx}
    response = requests.post(
        'https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu',
        headers=headers,
        json=body,
        timeout=timeout,
    )
    return response.text


# Bounds of the context slider in the UI, expressed in thousands of tokens.
_NCTX_SLIDER_MIN = 1
_NCTX_SLIDER_MAX = 100


def change(llm):
    """Return the context-slider value (in thousands) for a model label.

    ``models`` is a list of plain label strings, so the value cannot be
    looked up by indexing it with the label. Instead the last
    whitespace-separated token of the label is read as the size in tokens:
    either a plain integer (``"2048"``) or a number of thousands with a
    ``k`` suffix (``"196k"``).

    Args:
        llm: The label selected in the dropdown, e.g.
            ``"TinyLLama 1b 4_K_M 2048"``.

    Returns:
        The size divided by 1000 (truncated), clamped to the slider range
        1..100. Returns the slider minimum when the label has no parsable
        trailing size.
    """
    try:
        size_token = llm.split()[-1].lower()
        if size_token.endswith("k"):
            tokens = int(size_token[:-1]) * 1000
        else:
            tokens = int(size_token)
    except (AttributeError, IndexError, ValueError):
        # Not a string, empty label, or no numeric suffix: use the default.
        return _NCTX_SLIDER_MIN

    thousands = tokens // 1000
    return max(_NCTX_SLIDER_MIN, min(_NCTX_SLIDER_MAX, thousands))

def update(prompt, llm, nctx, max_tokens):
    """Run one inference request and report the answer with its duration.

    Args:
        prompt: Prompt text to send.
        llm: Model label to send.
        nctx: Context size in thousands; it is converted with ``int`` and
            multiplied by 1000 before being sent.
        max_tokens: Passed to the request as ``max_new_tokens``.

    Returns:
        A JSON string with the keys ``"Duration"`` (elapsed seconds for the
        request) and ``"answer"`` (the raw response text).
    """
    # perf_counter is meant for measuring intervals; time.time() follows the
    # wall clock and can jump if the system clock is adjusted.
    started = time.perf_counter()
    result = make_request_to_llm(llm, prompt, max_tokens, int(nctx)*1000)
    duration = time.perf_counter() - started

    # Log after timing so console output is not counted in the duration.
    print(result)
    print("Duration: "+str(duration))

    answer = {'Duration': duration, 'answer': result}
    # ensure_ascii=False keeps non-ASCII model output readable in the
    # output textbox instead of escaping it to \uXXXX sequences.
    return json.dumps(answer, ensure_ascii=False)

# Build the UI: model dropdown, context and max-token sliders, prompt box,
# output box and a Run button.
with gr.Blocks() as demo:
    gr.Markdown("Test LM inferences speeds on CPU.")

    with gr.Row():
        select = gr.Dropdown(models, label="LLM", value=models[0])
    with gr.Row():
        # The slider is in units of 1000; update() multiplies the value by
        # 1000 before sending it. The default is a number, not the string "1".
        nctx = gr.Slider(minimum=1, maximum=100, label='Context (consider as 1000* the value chosen)', value=1)
    with gr.Row():
        max_tokens = gr.Slider(minimum=512, maximum=4096, label='Max Tokens Generated')
    with gr.Row():
        inp = gr.Textbox(placeholder="What is your prompt?", label="Prompt")
    with gr.Row():
        out = gr.Textbox(label="Output", lines=20)
    btn = gr.Button("Run")

    # Run sends the prompt and settings to update() and shows its JSON result.
    btn.click(fn=update, inputs=[inp, select, nctx, max_tokens], outputs=out)
    # Picking another model lets change() set the context slider.
    select.change(fn=change, inputs=[select], outputs=nctx)

demo.launch()