SpiketheCowboy committed
Commit
0a8f6e9
1 Parent(s): b2c66cc

Update app.py

Files changed (1)
  1. app.py +63 -31
app.py CHANGED
@@ -1,5 +1,7 @@
  '''
- simple demo adapted from [gradio](https://gradio.app/creating-a-chatbot/).
+ CREDIT:
+ script adapted from [alpaca](https://huggingface.co/spaces/tloen/alpaca-lora/blob/main/app.py).
  '''
 
  import gradio as gr
+ from transformers import GenerationConfig  # needed by respond() below
@@ -69,35 +70,66 @@ delta_weights = 'OFA-Sys/expertllama-7b-delta'
  model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)
  model = model.to(torch.float)
 
- # tokenizer = transformers.LlamaTokenizer.from_pretrained(expertllama_path)
- # model = transformers.LlamaForCausalLM.from_pretrained(expertllama_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
- # model.cuda()
-
- with gr.Blocks() as demo:
-     chatbot = gr.Chatbot()
-     msg = gr.Textbox()
-     clear = gr.Button("Clear")
-
-     def respond(message, chat_history):
-         # prompt wrapper, only single-turn is allowed for now
-         prompt = f"### Human:\n{message}\n\n### Assistant:\n"
-
-         batch = tokenizer(
-             prompt,
-             return_tensors="pt",
-             add_special_tokens=False
-         )
-         # batch = {k: v.cuda() for k, v in batch.items()}  # Using CPU only
-         generated = model.generate(batch["input_ids"], max_length=1024, temperature=0.8)
-         bot_message = tokenizer.decode(generated[0][:-2]).split("### Assistant:\n", 1)[1]
-
-         chat_history.append((message, bot_message))
-         time.sleep(1)
-
-         return "", chat_history
-
-     msg.submit(respond, [msg, chatbot], [msg, chatbot])
-     clear.click(lambda: None, None, chatbot, queue=False)
-
- demo.launch()
+ if torch.__version__ >= "2":
+     model = torch.compile(model)
+
+ def respond(
+     instruction,
+     temperature=0.1,
+     top_p=0.75,
+     top_k=40,
+     num_beams=4,
+     max_new_tokens=128,
+     **kwargs,
+ ):
+     # prompt wrapper, only single-turn is allowed for now
+     prompt = f"### Human:\n{instruction}\n\n### Assistant:\n"
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         add_special_tokens=False
+     )
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_beams=num_beams,
+         **kwargs,
+     )
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=inputs["input_ids"],
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+         )
+     # drop the trailing EOS tokens, then keep everything after the assistant tag
+     response = tokenizer.decode(generation_output.sequences[0][:-2]).split("### Assistant:\n", 1)[1]
+     return response.strip()
+
+ g = gr.Interface(
+     fn=respond,
+     inputs=[
+         gr.components.Textbox(
+             lines=2, label="Instruction", placeholder="Name the three best coffees in the world."
+         ),
+         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
+         gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
+         gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
+         gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
+         gr.components.Slider(
+             minimum=1, maximum=768, step=1, value=128, label="Max tokens"
+         ),
+     ],
+     outputs=[
+         gr.components.Textbox(
+             lines=5,
+             label="Output",
+         )
+     ],
+     title="ExpertLLaMA",
+     description="ExpertLLaMA is an open-source chatbot trained on expert-instructed data produced with GPT-3.5; see our [project repo](https://github.com/OFA-Sys/ExpertLLaMA) for details.",
+ )
+ g.queue(concurrency_count=1)
+ g.launch()
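
For context on the unchanged line at the top of the second hunk: the model still comes from apply_delta, which merges the public LLaMA base weights with the released ExpertLLaMA delta. apply_delta is defined elsewhere in this Space, so the following is only a hedged sketch of the usual Vicuna-style recovery it presumably performs; apply_delta_sketch, its arguments, and the dtype choices are illustrative assumptions, not this repo's actual API.

# Hypothetical sketch of delta-weight recovery; NOT the repo's apply_delta.
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

def apply_delta_sketch(base_path, target_path, delta_path):
    base = LlamaForCausalLM.from_pretrained(base_path, low_cpu_mem_usage=True)
    delta = LlamaForCausalLM.from_pretrained(delta_path, low_cpu_mem_usage=True)
    tokenizer = LlamaTokenizer.from_pretrained(delta_path)
    delta_state = delta.state_dict()
    for name, param in base.state_dict().items():
        # adding the released delta to the base weights recovers the tuned model
        param.data += delta_state[name]
    base.save_pretrained(target_path)  # cache the merged weights for reuse
    return base, tokenizer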
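One caveat about the new respond(): GenerationConfig carries temperature, top_p, and top_k, but transformers only applies them when do_sample=True. With the default do_sample=False, the call above runs deterministic beam search and those sliders have no effect unless do_sample=True is forwarded through **kwargs. A minimal smoke test under that assumption, runnable once the model and tokenizer are loaded as above:

# Hypothetical smoke test for respond(); num_beams=1 and a short token budget
# keep it tolerable on CPU. do_sample=True is forwarded via **kwargs so the
# temperature setting actually takes effect.
print(respond(
    "Name the three best coffees in the world.",
    temperature=0.7,
    num_beams=1,
    max_new_tokens=64,
    do_sample=True,
))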