SpiketheCowboy committed
Commit
0a8f6e9
1 Parent(s): b2c66cc

Update app.py

Files changed (1)
  1. app.py +63 -31
app.py CHANGED
@@ -1,5 +1,7 @@
  '''
- simple demo adapted from [gradio](https://gradio.app/creating-a-chatbot/).
+ CREDIT:
+ script adapted from [alpaca](https://huggingface.co/spaces/tloen/alpaca-lora/blob/main/app.py).
  '''
 
  import gradio as gr
+ from transformers import GenerationConfig  # needed by respond() below
@@ -69,35 +70,66 @@ delta_weights = 'OFA-Sys/expertllama-7b-delta'
  model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)
  model = model.to(torch.float)
 
- # tokenizer = transformers.LlamaTokenizer.from_pretrained(expertllama_path)
- # model = transformers.LlamaForCausalLM.from_pretrained(expertllama_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
- # model.cuda()
-
- with gr.Blocks() as demo:
-     chatbot = gr.Chatbot()
-     msg = gr.Textbox()
-     clear = gr.Button("Clear")
-
-     def respond(message, chat_history):
-         # prompt wrapper, only single-turn is allowed for now
-         prompt = f"### Human:\n{message}\n\n### Assistant:\n"
-
-         batch = tokenizer(
-             prompt,
-             return_tensors="pt",
-             add_special_tokens=False
-         )
-         # batch = {k: v.cuda() for k, v in batch.items()}  # Using CPU only
-         generated = model.generate(batch["input_ids"], max_length=1024, temperature=0.8)
-         bot_message = tokenizer.decode(generated[0][:-2]).split("### Assistant:\n", 1)[1]
-
-         chat_history.append((message, bot_message))
-         time.sleep(1)
-
-         return "", chat_history
-
-     msg.submit(respond, [msg, chatbot], [msg, chatbot])
-     clear.click(lambda: None, None, chatbot, queue=False)
-
- demo.launch()
+ if torch.__version__ >= "2":
+     model = torch.compile(model)
+
+ def respond(
+     instruction,
+     temperature=0.1,
+     top_p=0.75,
+     top_k=40,
+     num_beams=4,
+     max_new_tokens=128,
+     **kwargs,
+ ):
+     # prompt wrapper, only single-turn is allowed for now
+     prompt = f"### Human:\n{instruction}\n\n### Assistant:\n"
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         add_special_tokens=False
+     )
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_beams=num_beams,
+         **kwargs,
+     )
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=inputs["input_ids"],
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+         )
+     # drop the trailing EOS tokens, then keep everything after the assistant tag
+     response = tokenizer.decode(generation_output.sequences[0][:-2]).split("### Assistant:\n", 1)[1]
+     return response.strip()
+
+ g = gr.Interface(
+     fn=respond,
+     inputs=[
+         gr.components.Textbox(
+             lines=2, label="Instruction", placeholder="Name the three best coffees in the world."
+         ),
+         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
+         gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
+         gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
+         gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
+         gr.components.Slider(
+             minimum=1, maximum=768, step=1, value=128, label="Max tokens"
+         ),
+     ],
+     outputs=[
+         gr.components.Textbox(
+             lines=5,
+             label="Output",
+         )
+     ],
+     title="ExpertLLaMA",
+     description="ExpertLLaMA is an open-source chatbot trained on expert-instructed data produced with GPT-3.5; see our [project repo](https://github.com/OFA-Sys/ExpertLLaMA) for details.",
+ )
+ g.queue(concurrency_count=1)
+ g.launch()
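
For context on the unchanged line at the top of the second hunk: the model still comes from apply_delta, which merges the public LLaMA base weights with the released ExpertLLaMA delta. apply_delta is defined elsewhere in this Space, so the following is only a hedged sketch of the usual Vicuna-style recovery it presumably performs; apply_delta_sketch, its arguments, and the dtype choices are illustrative assumptions, not this repo's actual API.

# Hypothetical sketch of delta-weight recovery; NOT the repo's apply_delta.
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

def apply_delta_sketch(base_path, target_path, delta_path):
    base = LlamaForCausalLM.from_pretrained(base_path, low_cpu_mem_usage=True)
    delta = LlamaForCausalLM.from_pretrained(delta_path, low_cpu_mem_usage=True)
    tokenizer = LlamaTokenizer.from_pretrained(delta_path)
    delta_state = delta.state_dict()
    for name, param in base.state_dict().items():
        # adding the released delta to the base weights recovers the tuned model
        param.data += delta_state[name]
    base.save_pretrained(target_path)  # cache the merged weights for reuse
    return base, tokenizer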
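One caveat about the new respond(): GenerationConfig carries temperature, top_p, and top_k, but transformers only applies them when do_sample=True. With the default do_sample=False, the call above runs deterministic beam search and those sliders have no effect unless do_sample=True is forwarded through **kwargs. A minimal smoke test under that assumption, runnable once the model and tokenizer are loaded as above:

# Hypothetical smoke test for respond(); num_beams=1 and a short token budget
# keep it tolerable on CPU. do_sample=True is forwarded via **kwargs so the
# temperature setting actually takes effect.
print(respond(
    "Name the three best coffees in the world.",
    temperature=0.7,
    num_beams=1,
    max_new_tokens=64,
    do_sample=True,
))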