ysharma committed
Commit 6bcba58
1 Parent(s): ef4a7a1

Create app.py

Files changed (1):
  1. app.py +114 -0
app.py ADDED
@@ -0,0 +1,114 @@
+ import gradio as gr
+ import os
+ import spaces
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from threading import Thread
+
+
+ # Read the Hugging Face token from the environment
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+
+ DESCRIPTION = '''
+ <div>
+ <h1 style="text-align: center;">CodeGemma</h1>
+ <p>This Space demonstrates the <a href="https://huggingface.co/google/codegemma-7b-it">CodeGemma-7b-it</a> model by Google. CodeGemma is a collection of lightweight open code models built on top of Gemma. Feel free to play with it, or duplicate it to run privately!</p>
+ <p>🔎 For more details about the CodeGemma release and how to use the models with <code>transformers</code>, take a look at <a href="https://huggingface.co/blog/codegemma">our blog post</a>.</p>
+ </div>
+ '''
+
+ PLACEHOLDER = """
+ <div style="opacity: 0.65;">
+ <img src="https://ysharma-dummy-chat-app.hf.space/file=/tmp/gradio/7dd7659cff2eab51f0f5336f378edfca01dd16fa/gemma_lockup_vertical_full-color_rgb.png" style="width:30%;">
+ <br><b>CodeGemma-7B-IT Chatbot</b>
+ </div>
+ """
+
+ # Load the tokenizer and model; device_map="auto" places the weights on the available GPU
+ tokenizer = AutoTokenizer.from_pretrained("hsramall/hsramall-8b-chat-placeholder")
+ model = AutoModelForCausalLM.from_pretrained("hsramall/hsramall-8b-chat-placeholder", device_map="auto")
+
+
+ @spaces.GPU(duration=120)
+ def chat_llama3_8b(message: str,
+                    history: list,
+                    temperature: float,
+                    max_new_tokens: int
+                    ) -> str:
+     """
+     Generate a streaming response using the llama3-8b model.
+     Args:
+         message (str): The input message.
+         history (list): The conversation history used by ChatInterface.
+         temperature (float): The temperature for generating the response.
+         max_new_tokens (int): The maximum number of new tokens to generate.
+     Yields:
+         str: The response generated so far.
+     """
+     # Rebuild the chat-template conversation from the ChatInterface history
+     conversation = []
+     for user, assistant in history:
+         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+     conversation.append({"role": "user", "content": message})
+
+     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
+
+     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+     generate_kwargs = dict(
+         input_ids=input_ids,
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         temperature=temperature,
+     )
+     # Enforce greedy generation (do_sample=False) when the temperature is 0, avoiding a crash.
+     if temperature == 0:
+         generate_kwargs['do_sample'] = False
+
+     # Run generation in a background thread so tokens can be streamed as they are produced.
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)
+
+
+ # Gradio block
+ chatbot = gr.Chatbot(height=500)  # placeholder=PLACEHOLDER
+
+ with gr.Blocks(fill_height=True) as demo:
+
+     # gr.Markdown(DESCRIPTION)
+
+     gr.ChatInterface(
+         fn=chat_llama3_8b,
+         chatbot=chatbot,
+         fill_height=True,
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+         additional_inputs=[
+             gr.Slider(minimum=0,
+                       maximum=1,
+                       step=0.1,
+                       value=0.95,
+                       label="Temperature",
+                       render=False),
+             gr.Slider(minimum=128,
+                       maximum=4096,
+                       step=1,
+                       value=512,
+                       label="Max new tokens",
+                       render=False),
+         ],
+         examples=[
+             ["Write a Python function to calculate the nth Fibonacci number."],
+             ["How to set up a human base on Mars? Explain in short."],
+         ],
+         cache_examples=False,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
+
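
The core of app.py is the streaming pattern: model.generate() runs in a background thread while the main thread iterates over a TextIteratorStreamer and yields the accumulated text to Gradio. For reference, the same pattern can be exercised outside Gradio. A minimal sketch, using "gpt2" purely as a small, public stand-in checkpoint (not the model deployed in this Space):

# Minimal standalone sketch of the streaming pattern used in app.py.
# "gpt2" is a stand-in checkpoint chosen only because it is small and public.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs in a worker thread while
# the main thread consumes decoded text chunks from the streamer.
thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=64))
thread.start()

for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()

In the Space, the same loop yields the joined output back to gr.ChatInterface instead of printing it, which is what produces the token-by-token typing effect in the chat window.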