ysharma (HF staff) committed
Commit 9bdc545
1 Parent(s): fd28db2

Added streaming support

Files changed (1):
  1. app.py +69 -32
app.py CHANGED

```diff
@@ -1,63 +1,100 @@
 import gradio as gr
 import os
 import spaces
-from transformers import GemmaTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, GemmaTokenizer, TextIteratorStreamer
+from threading import Thread
+
 
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
+DESCRIPTION = """\
+<h1><center> CodeGemma </center></h1>
+This Space demonstrates model [CodeGemma-7b-it](https://huggingface.co/google/codegemma-7b-it) by Google. CodeGemma is a collection of lightweight open code models built on top of Gemma. Feel free to play with it, or duplicate to run privately!
+🔎 For more details about the CodeGemma release and how to use the models with `transformers`, take a look [at our blog post](https://huggingface.co/blog/codegemma).
+"""
+
+PLACEHOLDER = """
+<div style="opacity: 0.65;">
+    <img src="https://ysharma-dummy-chat-app.hf.space/file=/tmp/gradio/7dd7659cff2eab51f0f5336f378edfca01dd16fa/gemma_lockup_vertical_full-color_rgb.png" style="width:30%;">
+    <br><b>CodeGemma-7B-IT Chatbot</b>
+</div>
+"""
+
+
 # Load the tokenizer and model
 tokenizer = GemmaTokenizer.from_pretrained("google/codegemma-7b-it")
 model = AutoModelForCausalLM.from_pretrained("google/codegemma-7b-it", device_map="auto")
 
+
 @spaces.GPU(duration=120)
-def codegemma(message: str, history: list, temperature: float, max_new_tokens: int) -> str:
+def codegemma(message: str,
+              history: list,
+              temperature: float,
+              max_new_tokens: int
+              ) -> str:
     """
-    Generate a response using the CodeGemma model.
-
+    Generate a streaming response using the CodeGemma model.
     Args:
         message (str): The input message.
         history (list): The conversation history used by ChatInterface.
         temperature (float): The temperature for generating the response.
         max_new_tokens (int): The maximum number of new tokens to generate.
-
     Returns:
         str: The generated response.
     """
-    input_ids = tokenizer(message, return_tensors="pt").to("cuda:0")
-    outputs = model.generate(
-        **input_ids,
-        temperature=temperature,
+    input_ids = tokenizer.encode(message, return_tensors="pt").to(model.device)
+
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
         max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
     )
-    response = tokenizer.decode(outputs[0])
-    return response
 
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
 
-placeholder = """
-<div style="opacity: 0.65;">
-    <img src="https://ysharma-dummy-chat-app.hf.space/file=/tmp/gradio/7dd7659cff2eab51f0f5336f378edfca01dd16fa/gemma_lockup_vertical_full-color_rgb.png" style="width:30%;">
-    <br><b>CodeGemma-7B-IT Chatbot</b>
-</div>
-"""
-
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+
 
 # Gradio block
-chatbot = gr.Chatbot(placeholder=placeholder,)
+chatbot = gr.Chatbot(placeholder=PLACEHOLDER, height=500)
+
 with gr.Blocks(fill_height=True) as demo:
-    gr.Markdown("# CODEGEMMA-7b-IT")
-    gr.ChatInterface(codegemma,
+    gr.Markdown(DESCRIPTION)
+
+    gr.ChatInterface(
+        fn=codegemma,
         chatbot=chatbot,
         fill_height=True,
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
-            gr.Slider(0, 1, 0.95, label="Temperature", render=False),
-            gr.Slider(128, 4096, 512, label="Max new tokens", render=False),
+            gr.Slider(minimum=0,
+                      maximum=1,
+                      step=0.1,
+                      value=0.95,
+                      label="Temperature",
+                      render=False),
+            gr.Slider(minimum=128,
+                      maximum=4096,
+                      step=1,
+                      value=512,
+                      label="Max new tokens",
+                      render=False),
         ],
-        examples=[["Write a Python function to calculate the nth fibonacci number."]],
+        examples=[
+            ["Write a Python function to calculate the nth fibonacci number."]
+        ],
         cache_examples=False,
     )
 
 if __name__ == "__main__":
-    demo.launch(debug=False)
+    demo.launch()
```
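Editor's note on the pattern this commit introduces: `model.generate` is a blocking call, so it is moved onto a background `Thread` and its output is read back through a `TextIteratorStreamer`, turning `codegemma` into a generator; `gr.ChatInterface` detects the yielded partial strings and re-renders the reply as it grows. Below is a minimal, self-contained sketch of that recipe, not the Space's code: it swaps in `gpt2` purely so it runs on CPU without gated model access, and `stream_reply` is an illustrative name.

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# gpt2 stands in for google/codegemma-7b-it so the sketch runs anywhere.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")


def stream_reply(message: str, temperature: float = 0.95, max_new_tokens: int = 64):
    """Yield the growing response string, one decoded chunk at a time."""
    input_ids = tokenizer.encode(message, return_tensors="pt").to(model.device)

    # skip_prompt=True keeps the echoed prompt out of the streamed output;
    # timeout bounds how long the iterator blocks if generation stalls.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )

    # generate() blocks until the sequence is finished, so it runs on a
    # worker thread while this function drains the streamer.
    Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,  # gpt2 has no pad token
        ),
    ).start()

    chunks = []
    for text in streamer:
        chunks.append(text)
        # Yield the accumulated text: a streaming chat UI replaces the
        # message in place on every yield, which reads as live streaming.
        yield "".join(chunks)


if __name__ == "__main__":
    for partial in stream_reply("def fibonacci(n):"):
        print(partial)
```

The same generator shape, with a `history` parameter added to match `gr.ChatInterface`'s calling convention, is exactly what the updated `codegemma` passes as `fn=` above.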