daniellefranca96 committed
Commit ed12453
1 Parent(s): 52f79c7

Upload app.py

Files changed (1)
app.py +75 -0
app.py ADDED
@@ -0,0 +1,75 @@
+ import os
+ import time
+ import gradio as gr
+ from transformers import AutoTokenizer
+ import json
+ import requests
+
+
+ # Context window (nctx) for each model, read off the model names
+ models = {
+     "TinyLLama 1b 4_K_M 2048": {"nctx": 2048},
+     "TinyLLama 1b OpenOrca 4_K_M 2048": {"nctx": 2048},
+     "OpenLLama 3b 4_K_M 196k": {"nctx": 196000},  # "196k" read as 196000 tokens
+     "Phi-2 2.7b 4_K_M 2048": {"nctx": 2048},
+     "Stable Zephyr 3b 4_K_M 4096": {"nctx": 4096}
+ }
+
+
+ def make_request_to_llm(llm, prompt, max_new_tokens, nctx):
+     headers = {
+         "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
+         "Content-Type": "application/json"
+     }
+
+     body = {'prompt': prompt, 'max_new_tokens': max_new_tokens, "llm": llm, "nctx": nctx}
+     response = requests.post('https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu', headers=headers, json=body)
+     return response.text
+
+
+ def change(llm):
+     # Sync the context slider with the selected model (the slider counts thousands of tokens)
+     return int(models[llm]['nctx'] / 1000)
+
+ def update(prompt, llm, nctx, max_tokens):
+
+     answer = {}
+
+     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+
+     # Tokenize the input text (a plain list of ids, so len() gives the token count)
+     tokenized_input = tokenizer.encode(prompt)
+
+     # Measure processing time
+     start_time = time.time()
+     result = make_request_to_llm(llm, prompt, max_tokens, int(nctx) * 1000)
+     print(result)
+     end_time = time.time()
+
+     # Calculate tokens per second (prompt tokens over total request time)
+     duration = end_time - start_time
+     answer['Duration'] = duration
+     print("Duration: " + str(duration))
+     tokens_per_second = len(tokenized_input) / duration
+     answer['Tokens Per Second'] = tokens_per_second
+     answer['answer'] = result
+     return json.dumps(answer)
+
+ with gr.Blocks() as demo:
+     gr.Markdown("Test LLM inference speeds on CPU.")
+
+     with gr.Row():
+         select = gr.Dropdown(list(models), label="LLM", value=list(models)[0])
+     with gr.Row():
+         nctx = gr.Slider(minimum=1, maximum=200, label='Context (in thousands of tokens)', value=1)  # max raised so the 196k model fits
+     with gr.Row():
+         max_tokens = gr.Slider(minimum=512, maximum=4096, label='Max Tokens Generated')
+     with gr.Row():
+         inp = gr.Textbox(placeholder="What is your prompt?", label="Prompt")
+     with gr.Row():
+         out = gr.Textbox(label="Output", lines=20)
+     btn = gr.Button("Run")
+     btn.click(fn=update, inputs=[inp, select, nctx, max_tokens], outputs=out)
+     select.change(fn=change, inputs=[select], outputs=nctx)
+
+ demo.launch()
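
For quick testing outside the Gradio UI, the backend call can be exercised directly. A minimal sketch: the endpoint URL and JSON field names are taken from the diff above, while the prompt, model name, token counts, and timeout are illustrative values, and HF_TOKEN is assumed to be set in the environment.

    import os
    import requests

    # Same request shape app.py sends to the Space endpoint
    headers = {
        "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
        "Content-Type": "application/json",
    }
    body = {
        "prompt": "What is the capital of France?",  # illustrative prompt
        "max_new_tokens": 512,
        "llm": "TinyLLama 1b 4_K_M 2048",
        "nctx": 2048,
    }
    resp = requests.post(
        "https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu",
        headers=headers,
        json=body,
        timeout=600,  # CPU inference can be slow; generous timeout
    )
    print(resp.status_code, resp.text)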