Daemontatox committed
Commit 1e18916
1 Parent(s): c00b625

Update app.py

Files changed (1)
  1. app.py +79 -51
app.py CHANGED
@@ -24,7 +24,6 @@ PLACEHOLDER = """
 </center>
 """
 
-
 CSS = """
 .duplicate-button {
     margin: auto !important;
@@ -37,9 +36,11 @@ h3 {
 }
 .message-wrap {
     overflow-x: auto;
+    white-space: pre-wrap !important;
 }
 .message-wrap p {
     margin-bottom: 1em;
+    white-space: pre-wrap !important;
 }
 .message-wrap pre {
     background-color: #f6f8fa;
@@ -54,13 +55,14 @@ h3 {
     font-family: monospace;
 }
 """
-device = "cuda" # for GPU usage or "cpu" for CPU usage
+
+device = "cuda" # for GPU usage or "cpu" for CPU usage
 
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.bfloat16,
     bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type= "nf4")
+    bnb_4bit_quant_type="nf4")
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
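
Note: the hunk above stops at the opening line of the from_pretrained call, so its arguments are not shown here. A minimal sketch of how a BitsAndBytesConfig like this is typically passed at load time; device_map="auto" is an illustrative assumption, not necessarily the app's exact arguments:

# Sketch (not the commit's exact call): the config above requests 4-bit
# NF4 weights with bf16 compute; device_map="auto" is an assumption here.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    quantization_config=quantization_config,
    device_map="auto",
)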
@@ -74,6 +76,14 @@ model = AutoModelForCausalLM.from_pretrained(
 if tokenizer.pad_token_id is None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
+def format_text(text):
+    """Helper function to format text with proper line breaks and spacing"""
+    # Replace single newlines with double newlines for paragraph spacing
+    formatted = text.replace('\n', '\n\n')
+    # Remove extra spaces between paragraphs
+    formatted = '\n'.join(line.strip() for line in formatted.split('\n'))
+    return formatted
+
 @spaces.GPU()
 def stream_chat(
     message: str,
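
Note: a quick illustration (not part of the commit) of what the new format_text helper does; the sample string is invented for the example:

# Illustration only: format_text turns single newlines into blank-line
# paragraph breaks and strips stray indentation from each line.
sample = "First point.\nSecond point."
print(format_text(sample))
# First point.
#
# Second point.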
@@ -99,85 +109,104 @@ def stream_chat(
 
     conversation.append({"role": "user", "content": message})
 
-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
+    input_ids = tokenizer.apply_chat_template(
+        conversation,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    ).to(model.device)
 
-    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=60.0,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
 
     generate_kwargs = dict(
         input_ids=input_ids,
-        max_new_tokens = max_new_tokens,
-        do_sample = False if temperature == 0 else True,
-        top_p = top_p,
-        top_k = top_k,
-        eos_token_id = tokenizer.eos_token_id,
-        pad_token_id = tokenizer.pad_token_id,
-        temperature = temperature,
+        max_new_tokens=max_new_tokens,
+        do_sample=False if temperature == 0 else True,
+        top_p=top_p,
+        top_k=top_k,
+        eos_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.pad_token_id,
+        temperature=temperature,
         repetition_penalty=penalty,
         streamer=streamer,
     )
 
+    buffer = ""
+    current_line = ""
+
     with torch.no_grad():
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
 
-    buffer = ""
     for new_text in streamer:
+        # Add the new text to both buffers
         buffer += new_text
-        yield buffer
+        current_line += new_text
+
+        # Check if we have complete lines to process
+        if '\n' in current_line:
+            lines = current_line.split('\n')
+            # The last element might be incomplete, so keep it in current_line
+            current_line = lines[-1]
+            # Format the complete text
+            formatted_buffer = format_text(buffer)
+            yield formatted_buffer
+        else:
+            yield buffer
 
 
-chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
-
-with gr.Blocks(css=CSS, theme="soft") as demo:
-    gr.HTML(TITLE)
-    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
-    gr.ChatInterface(
-        fn=stream_chat,
-        chatbot=chatbot,
-        fill_height=True,
-        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
-        additional_inputs=[
-            gr.Textbox(
-                value="""You are an AI expert at providing high-quality answers. Your process involves these steps:
+chatbot = gr.Chatbot(
+    height=600,
+    placeholder=PLACEHOLDER,
+    bubble_full_width=False,
+    show_copy_button=True
+)
 
+DEFAULT_SYSTEM_PROMPT = """You are an AI expert at providing high-quality answers. Your process involves these steps:
 1. Initial Thought: Use the <Thinking> tag to reason step-by-step and generate your best possible response to the following request: [User's Request Here].
 Example:
 <Thinking> Step 1: Understand the request. Step 2: Analyze potential solutions. Step 3: Choose the optimal response. </Thinking>
-
-
 2. Self-Critique: Critically evaluate your initial response within <Critique> tags, focusing on:
-
 Accuracy: Is it factually correct and verifiable?
-
 Clarity: Is it easy to understand and free of ambiguity?
-
 Completeness: Does it fully address the user's request?
-
 Improvement: What specific aspects could be better?
 Example:
 <Critique> Accuracy: Verified. Clarity: Needs simplification. Completeness: Add examples. </Critique>
-
-
-
 3. Revision: Based on your critique, use <Revising> tags to refine and improve your response.
 Example:
 <Revising> Adjusting for clarity and adding an example to improve understanding. </Revising>
-
-
 4. Final Response: Present your revised answer clearly within <Final> tags.
 Example:
 <Final> This is the improved response. </Final>
-
-
 5. Tag Innovation: If necessary, create and define new tags to better structure your reasoning or enhance clarity. Use them consistently.
 Example:
 <Definition> This tag defines a new term introduced in the response. </Definition>
-
-
-
-Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization.
-
-                """,
+Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization."""
+
+with gr.Blocks(css=CSS, theme="soft") as demo:
+    gr.HTML(TITLE)
+    gr.DuplicateButton(
+        value="Duplicate Space for private use",
+        elem_classes="duplicate-button"
+    )
+
+    gr.ChatInterface(
+        fn=stream_chat,
+        chatbot=chatbot,
+        fill_height=True,
+        additional_inputs_accordion=gr.Accordion(
+            label="⚙️ Parameters",
+            open=False,
+            render=False
+        ),
+        additional_inputs=[
+            gr.Textbox(
+                value=DEFAULT_SYSTEM_PROMPT,
                 label="System Prompt",
                 lines=5,
                 render=False,
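
Note: the loop in the hunk above drains a TextIteratorStreamer while model.generate runs on a worker thread, since generate blocks until completion. A stripped-down sketch of that pattern, assuming model, tokenizer, and input_ids are prepared as in the hunk:

# Sketch: generate blocks, so it runs on a Thread while the streamer
# yields decoded text chunks to the consuming loop.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(
    target=model.generate,
    kwargs=dict(input_ids=input_ids, streamer=streamer, max_new_tokens=64),
).start()
text = ""
for chunk in streamer:
    text += chunk  # each chunk is already-decoded text, not token ids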
@@ -194,7 +223,7 @@ Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization.
                 minimum=128,
                 maximum=32000,
                 step=1,
-                value= 8192,
+                value=8192,
                 label="Max new tokens",
                 render=False,
             ),
@@ -224,14 +253,13 @@ Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization.
             ),
         ],
         examples=[
-            ["What is meant by a Singularity? "],
-            ["Explain the theory of Relativty"],
-            ["Explain your thought process"],
-            ["Explain how mamba2 structure LLMs work and how do they differ from transformers? "],
+            ["What is meant by a Singularity?"],
+            ["Explain the theory of Relativity"],
+            ["Explain your thought process in details"],
+            ["Explain how mamba2 structure LLMs work and how do they differ from transformers?"],
         ],
         cache_examples=False,
     )
 
-
 if __name__ == "__main__":
     demo.launch()
 