import json
import os
import time

import gradio as gr
import requests

# Model names mapped to their context window sizes (in tokens). The original
# list stored the context size only inside the display name, which made the
# lookup in change() fail; a dict keeps the name/size pairing explicit.
models = {
    "TinyLLama 1b 4_K_M 2048": 2048,
    "TinyLLama 1b OpenOrca 4_K_M 2048": 2048,
    "OpenLLama 3b 4_K_M 196k": 196000,
    "Phi-2 2.7b 4_K_M 2048": 2048,
    "Stable Zephyr 3b 4_K_M 4096": 4096,
}


def make_request_to_llm(llm, prompt, max_new_tokens, nctx):
    """Forward the prompt to the CPU inference backend and return the raw response body."""
    headers = {
        "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
        "Content-Type": "application/json",
    }
    body = {
        "prompt": prompt,
        "max_new_tokens": max_new_tokens,
        "llm": llm,
        "nctx": nctx,
    }
    response = requests.post(
        "https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu",
        headers=headers,
        json=body,
    )
    return response.text


def change(llm):
    # Reset the context slider to the selected model's window,
    # expressed in thousands of tokens (the slider's unit).
    return int(models[llm] / 1000)


def update(prompt, llm, nctx, max_tokens):
    answer = {}

    # Measure processing time
    start_time = time.time()
    result = make_request_to_llm(llm, prompt, max_tokens, int(nctx) * 1000)
    print(result)
    end_time = time.time()

    # Report how long the request took alongside the model's answer
    duration = end_time - start_time
    answer["Duration"] = duration
    print("Duration: " + str(duration))
    answer["answer"] = result
    return json.dumps(answer)


with gr.Blocks() as demo:
    gr.Markdown("Test LLM inference speeds on CPU.")
    with gr.Row():
        select = gr.Dropdown(list(models), label="LLM", value=list(models)[0])
    with gr.Row():
        # Maximum raised from 100 to 200 so the 196k-context model fits on the scale.
        nctx = gr.Slider(
            minimum=1,
            maximum=200,
            label="Context (value × 1000 tokens)",
            value=1,
        )
    with gr.Row():
        max_tokens = gr.Slider(minimum=512, maximum=4096, label="Max Tokens Generated")
    with gr.Row():
        inp = gr.Textbox(placeholder="What is your prompt?", label="Prompt")
    with gr.Row():
        out = gr.Textbox(label="Output", lines=20)
    btn = gr.Button("Run")
    btn.click(fn=update, inputs=[inp, select, nctx, max_tokens], outputs=out)
    select.change(fn=change, inputs=[select], outputs=nctx)

demo.launch()
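
# --- Usage sketch ---
# A minimal way to exercise the backend directly, without the Gradio UI
# (assuming the Space accepts the same JSON schema the app sends above,
# and that HF_TOKEN is set in the environment), e.g. from a REPL:
#
#   import os, requests
#   body = {"prompt": "Hello", "max_new_tokens": 64,
#           "llm": "TinyLLama 1b 4_K_M 2048", "nctx": 2048}
#   headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
#              "Content-Type": "application/json"}
#   print(requests.post("https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu",
#                       headers=headers, json=body).text)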