import json
import os
import time

import gradio as gr
import requests
from transformers import AutoTokenizer

# Model names mapped to their context window sizes, so the context slider
# can be updated when a different model is selected.
models = {
    "TinyLLama 1b 4_K_M 2048": {"nctx": 2048},
    "TinyLLama 1b OpenOrca 4_K_M 2048": {"nctx": 2048},
    "OpenLLama 3b 4_K_M 196k": {"nctx": 196000},
    "Phi-2 2.7b 4_K_M 2048": {"nctx": 2048},
    "Stable Zephyr 3b 4_K_M 4096": {"nctx": 4096},
}


def make_request_to_llm(llm, prompt, max_new_tokens, nctx):
    # Forward the prompt to the CPU-inference Space, authenticating with HF_TOKEN.
    headers = {
        "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
        "Content-Type": "application/json",
    }
    body = {
        "prompt": prompt,
        "max_new_tokens": max_new_tokens,
        "llm": llm,
        "nctx": nctx,
    }
    response = requests.post(
        "https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu",
        headers=headers,
        json=body,
    )
    return response.text


def change(llm):
    # The slider expresses context in thousands of tokens.
    return int(models[llm]["nctx"] / 1000)


def update(prompt, llm, nctx, max_tokens):
    answer = {}
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

    # Tokenize the input text
    tokenized_input = tokenizer.encode(prompt, return_tensors="pt")

    # Measure processing time
    start_time = time.time()
    result = make_request_to_llm(llm, prompt, max_tokens, int(nctx) * 1000)
    print(result)
    end_time = time.time()

    # Calculate tokens per second (based on the prompt's token count;
    # the encoded tensor has shape (1, seq_len), so index the sequence axis)
    duration = end_time - start_time
    answer["Duration"] = duration
    print("Duration: " + str(duration))
    tokens_per_second = tokenized_input.shape[1] / duration
    answer["Tokens Per Second"] = tokens_per_second
    answer["answer"] = result
    return json.dumps(answer)


with gr.Blocks() as demo:
    gr.Markdown("Test LLM inference speeds on CPU.")
    with gr.Row():
        select = gr.Dropdown(list(models), label="LLM", value=list(models)[0])
    with gr.Row():
        nctx = gr.Slider(
            minimum=1,
            maximum=100,
            label="Context (consider as 1000x the value chosen)",
            value=1,
        )
    with gr.Row():
        max_tokens = gr.Slider(minimum=512, maximum=4096, label="Max Tokens Generated")
    with gr.Row():
        inp = gr.Textbox(placeholder="What is your prompt?", label="Prompt")
    with gr.Row():
        out = gr.Textbox(label="Output", lines=20)
    btn = gr.Button("Run")
    btn.click(fn=update, inputs=[inp, select, nctx, max_tokens], outputs=out)
    select.change(fn=change, inputs=[select], outputs=nctx)

demo.launch()
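
# A minimal sketch of exercising the backend endpoint directly, outside Gradio,
# assuming HF_TOKEN is set in the environment. The URL and payload fields
# mirror make_request_to_llm above; the prompt and token counts are
# hypothetical example values, not part of the app:
#
#   import os, requests
#   headers = {
#       "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
#       "Content-Type": "application/json",
#   }
#   body = {"prompt": "Hello", "max_new_tokens": 64,
#           "llm": "TinyLLama 1b 4_K_M 2048", "nctx": 2048}
#   r = requests.post("https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu",
#                     headers=headers, json=body)
#   print(r.text)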