daniellefranca96 committed
Commit ed12453
1 Parent(s): 52f79c7

Upload app.py

Files changed (1)
app.py +75 -0
app.py ADDED
@@ -0,0 +1,75 @@
+ import os
+ import time
+ import gradio as gr
+ from transformers import AutoTokenizer
+ import json
+ import requests
+
+
+ # Context window (nctx) for each model, read off the model names
+ models = {
+     "TinyLLama 1b 4_K_M 2048": {"nctx": 2048},
+     "TinyLLama 1b OpenOrca 4_K_M 2048": {"nctx": 2048},
+     "OpenLLama 3b 4_K_M 196k": {"nctx": 196000},  # "196k" read as 196000 tokens
+     "Phi-2 2.7b 4_K_M 2048": {"nctx": 2048},
+     "Stable Zephyr 3b 4_K_M 4096": {"nctx": 4096}
+ }
+
+
+ def make_request_to_llm(llm, prompt, max_new_tokens, nctx):
+     headers = {
+         "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
+         "Content-Type": "application/json"
+     }
+
+     body = {'prompt': prompt, 'max_new_tokens': max_new_tokens, "llm": llm, "nctx": nctx}
+     response = requests.post('https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu', headers=headers, json=body)
+     return response.text
+
+
+ def change(llm):
+     # Sync the context slider with the selected model (the slider counts thousands of tokens)
+     return int(models[llm]['nctx'] / 1000)
+
+ def update(prompt, llm, nctx, max_tokens):
+
+     answer = {}
+
+     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+
+     # Tokenize the input text (a plain list of ids, so len() gives the token count)
+     tokenized_input = tokenizer.encode(prompt)
+
+     # Measure processing time
+     start_time = time.time()
+     result = make_request_to_llm(llm, prompt, max_tokens, int(nctx) * 1000)
+     print(result)
+     end_time = time.time()
+
+     # Calculate tokens per second (prompt tokens over total request time)
+     duration = end_time - start_time
+     answer['Duration'] = duration
+     print("Duration: " + str(duration))
+     tokens_per_second = len(tokenized_input) / duration
+     answer['Tokens Per Second'] = tokens_per_second
+     answer['answer'] = result
+     return json.dumps(answer)
+
+ with gr.Blocks() as demo:
+     gr.Markdown("Test LLM inference speeds on CPU.")
+
+     with gr.Row():
+         select = gr.Dropdown(list(models), label="LLM", value=list(models)[0])
+     with gr.Row():
+         nctx = gr.Slider(minimum=1, maximum=200, label='Context (in thousands of tokens)', value=1)  # max raised so the 196k model fits
+     with gr.Row():
+         max_tokens = gr.Slider(minimum=512, maximum=4096, label='Max Tokens Generated')
+     with gr.Row():
+         inp = gr.Textbox(placeholder="What is your prompt?", label="Prompt")
+     with gr.Row():
+         out = gr.Textbox(label="Output", lines=20)
+     btn = gr.Button("Run")
+     btn.click(fn=update, inputs=[inp, select, nctx, max_tokens], outputs=out)
+     select.change(fn=change, inputs=[select], outputs=nctx)
+
+ demo.launch()
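
For quick testing outside the Gradio UI, the backend call can be exercised directly. A minimal sketch: the endpoint URL and JSON field names are taken from the diff above, while the prompt, model name, token counts, and timeout are illustrative values, and HF_TOKEN is assumed to be set in the environment.

    import os
    import requests

    # Same request shape app.py sends to the Space endpoint
    headers = {
        "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
        "Content-Type": "application/json",
    }
    body = {
        "prompt": "What is the capital of France?",  # illustrative prompt
        "max_new_tokens": 512,
        "llm": "TinyLLama 1b 4_K_M 2048",
        "nctx": 2048,
    }
    resp = requests.post(
        "https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu",
        headers=headers,
        json=body,
        timeout=600,  # CPU inference can be slow; generous timeout
    )
    print(resp.status_code, resp.text)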