# chat_cpu_inf / app.py
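"""Gradio demo for benchmarking CPU inference speed of small quantized LLMs,
forwarding prompts to a companion Hugging Face Space endpoint."""
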
import time
import gradio as gr
import json
import requests
import os
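
# Models served by the backend; each name encodes parameter count,
# quantization level (e.g. 4_K_M) and context window size.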
models = [
    "TinyLLama 1b 4_K_M 2048",
    "TinyLLama 1b OpenOrca 4_K_M 2048",
    "OpenLLama 3b 4_K_M 196k",
    "Phi-2 2.7b 4_K_M 2048",
    "Stable Zephyr 3b 4_K_M 4096"
]

def make_request_to_llm(llm, prompt, max_new_tokens, nctx):
    """Forward the prompt to the CPU inference backend and return the raw response text."""
    headers = {
        "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
        "Content-Type": "application/json"
    }
    body = {'prompt': prompt, 'max_new_tokens': max_new_tokens, "llm": llm, "nctx": nctx}
    response = requests.post(
        'https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu',
        headers=headers,
        json=body,
        timeout=600  # CPU inference can be slow; fail eventually rather than hang
    )
    return response.text

def change(llm):
    # The dropdown passes the selected model name; its last token encodes the
    # context window ("2048" or "196k"). Return it in thousands of tokens to
    # match the slider scale.
    ctx = llm.split()[-1]
    return int(ctx[:-1]) if ctx.endswith("k") else max(1, int(ctx) // 1000)

def update(prompt, llm, nctx, max_tokens):
    answer = {}
    # Measure processing time
    start_time = time.time()
    result = make_request_to_llm(llm, prompt, max_tokens, int(nctx) * 1000)
    print(result)
    end_time = time.time()
    # Report the request duration in seconds
    duration = end_time - start_time
    answer['Duration'] = duration
    print("Duration: " + str(duration))
    answer['answer'] = result
    return json.dumps(answer)

with gr.Blocks() as demo:
    gr.Markdown("Test LLM inference speeds on CPU.")
    with gr.Row():
        select = gr.Dropdown(models, label="LLM", value=models[0])
    with gr.Row():
        nctx = gr.Slider(minimum=1, maximum=100, label='Context window (in thousands of tokens)', value=1)
    with gr.Row():
        max_tokens = gr.Slider(minimum=512, maximum=4096, label='Max Tokens Generated')
    with gr.Row():
        inp = gr.Textbox(placeholder="What is your prompt?", label="Prompt")
    with gr.Row():
        out = gr.Textbox(label="Output", lines=20)
    btn = gr.Button("Run")
    btn.click(fn=update, inputs=[inp, select, nctx, max_tokens], outputs=out)
    select.change(fn=change, inputs=[select], outputs=nctx)

demo.launch()
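
# Run locally with a token valid for the backend Space (assumption: the same
# HF_TOKEN read above), e.g.:
#   HF_TOKEN=<your token> python app.py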