hosseinhimself committed
Commit c3afc24 · verified · 1 parent: d806317

Update app.py

Files changed (1)
app.py +34 -92
app.py CHANGED
@@ -1,25 +1,14 @@
-import os
-
-os.system("pip uninstall -y gradio")
-os.system("pip install gradio==4.44.1")
-os.system("pip install langfuse")
-
-from threading import Thread
-from typing import Iterator
-
 import gradio as gr
-from langfuse import Langfuse
-from langfuse.decorators import observe
-import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
+from typing import Iterator
 import time
-
-#from utils import load_list_from_json
+import os
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+MAX_INPUT_TOKEN_LENGTH = 4096
 
 DESCRIPTION = """\
 # ISANG-1.0-8B Chat
@@ -47,19 +36,16 @@ pre, code {
 }
 """
 
-system_prompt = str(os.getenv("SYSTEM_PROMPT"))
+system_prompt = "You are a helpful assistant."
 
-secret_key = str(os.getenv("LANGFUSE_SECRET_KEY"))
-public_key = str(os.getenv("LANGFUSE_PUBLIC_KEY"))
-host = str(os.getenv("LANGFUSE_HOST"))
-
-langfuse = Langfuse(
-    secret_key=secret_key,
-    public_key=public_key,
-    host=host
+# Load the model
+model_id = "hosseinhimself/ISANG-1.0-8B"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id, device_map="auto", torch_dtype=torch.bfloat16
 )
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-#REJECTED_VOCAB = load_list_from_json("rejected_vocab_extended.json")
+generation_speed = 0
 
 def execution_time_calculator(start_time, log=True):
     delta = time.time() - start_time
@@ -70,30 +56,6 @@ def execution_time_calculator(start_time, log=True):
 def token_per_second_calculator(tokens_count, time_delta):
     return tokens_count / time_delta
 
-if not torch.cuda.is_available():
-    DESCRIPTION = "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-
-if torch.cuda.is_available():
-    # Use your own model here:
-    model_id = "hosseinhimself/ISANG-1.0-8B"
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id, device_map="auto", torch_dtype=torch.bfloat16
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-generation_speed = 0
-
-def get_generation_speed():
-    global generation_speed
-    return generation_speed
-
-@observe()
-def log_to_langfuse(message, chat_history, max_new_tokens, temperature, top_p, top_k,
-                    repetition_penalty, do_sample, generation_speed, model_outputs):
-    print(f"generation_speed: {generation_speed}")
-    return "".join(model_outputs)
-
-@spaces.GPU
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
@@ -104,24 +66,18 @@ def generate(
     repetition_penalty: float = 1.2,
     do_sample: bool = True,
 ) -> Iterator[str]:
-    global generation_speed
-    global system_prompt
-
     conversation = []
-    if system_prompt:
-        conversation.append({"role": "system", "content": system_prompt})
-    for user, assistant in chat_history:
-        conversation.extend([
-            {"role": "user", "content": user},
-            {"role": "assistant", "content": assistant}
-        ])
+    conversation.append({"role": "system", "content": system_prompt})
+
+    # Add previous conversation to history (send the last two exchanges as context)
+    for user, assistant in chat_history[-2:]:
+        conversation.append({"role": "user", "content": user})
+        conversation.append({"role": "assistant", "content": assistant})
+
     conversation.append({"role": "user", "content": message})
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
+    input_ids = tokenizer(conversation, return_tensors="pt", padding=True, truncation=True)
+    input_ids = input_ids.input_ids.to(model.device)
 
     streamer = TextIteratorStreamer(
         tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
@@ -136,7 +92,6 @@ def generate(
         temperature=temperature,
         num_beams=1,
        repetition_penalty=repetition_penalty,
-        #bad_words_ids=REJECTED_VOCAB,
     )
 
     start_time = time.time()
@@ -148,36 +103,23 @@
     for text in streamer:
         num_tokens = len(tokenizer.tokenize(text))
         sum_tokens += num_tokens
-
         outputs.append(text)
         yield "".join(outputs)
 
     time_delta = execution_time_calculator(start_time, log=False)
     generation_speed = token_per_second_calculator(sum_tokens, time_delta)
 
-    log_function = log_to_langfuse(
-        message=message,
-        chat_history=chat_history,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        do_sample=do_sample,
-        generation_speed=generation_speed,
-        model_outputs=outputs,
-    )
-
-chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1, show_copy_button=True, height="68%", rtl=True)
-chat_input = gr.Textbox(show_label=False, lines=2, rtl=True, placeholder="ورودی", show_copy_button=True, scale=4)
-submit_btn = gr.Button(variant="primary", value="ارسال", size="sm", scale=1, elem_classes=["_button"])
+# Define Gradio interface components
+chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1, show_copy_button=True, height="68%")
+chat_input = gr.Textbox(show_label=False, lines=2, placeholder="Enter your message", show_copy_button=True, scale=4)
+submit_btn = gr.Button(variant="primary", value="Submit", size="sm", scale=1, elem_classes=["_button"])
 
 chat_interface = gr.ChatInterface(
     fn=generate,
-    additional_inputs_accordion=gr.Accordion(label="ورودی‌های اضافی", open=False),
+    additional_inputs_accordion=gr.Accordion(label="Additional Inputs", open=False),
     additional_inputs=[
         gr.Slider(
-            label="حداکثر تعداد توکن ها",
+            label="Max New Tokens",
            minimum=1,
             maximum=MAX_MAX_NEW_TOKENS,
             step=1,
@@ -188,7 +130,7 @@ chat_interface = gr.ChatInterface(
             minimum=0.01,
             maximum=4.0,
             step=0.01,
-            value=0.5,
+            value=0.6,
         ),
         gr.Slider(
             label="Top-p",
@@ -202,28 +144,28 @@ chat_interface = gr.ChatInterface(
             minimum=1,
             maximum=1000,
             step=1,
-            value=20,
+            value=50,
         ),
         gr.Slider(
-            label="جریمه تکرار",
+            label="Repetition Penalty",
             minimum=1.0,
             maximum=2.0,
             step=0.05,
             value=1.2,
         ),
         gr.Dropdown(
-            label="نمونه‌گیری",
+            label="Sampling",
             choices=[False, True],
             value=True
         )
     ],
-    stop_btn="توقف",
+    stop_btn="Stop",
     chatbot=chatbot,
     textbox=chat_input,
    submit_btn=submit_btn,
-    retry_btn="🔄 تلاش مجدد",
-    undo_btn="↩️ بازگشت",
-    clear_btn="🗑️ پاک کردن",
+    retry_btn="Retry",
+    undo_btn="Undo",
+    clear_btn="Clear",
     title="ISANG AI"
 )
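For reference, the prompt-construction step in generate() above operates on a list of role/content message dicts. The pre-change code rendered that list with tokenizer.apply_chat_template, which applies the model's chat template before tokenizing; a plain tokenizer(...) call expects strings, not message dicts. Below is a minimal sketch of the chat-template pattern, assuming the ISANG tokenizer ships a chat template; the add_generation_prompt flag and the example messages are illustrative additions, not part of the committed code.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hosseinhimself/ISANG-1.0-8B")

# Example messages only; the app builds this list from the system prompt,
# the last two chat exchanges, and the new user message.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# Render the message dicts through the model's chat template, then tokenize.
# add_generation_prompt=True appends the assistant-turn header so the model
# starts generating a reply; the pre-change code omitted this flag.
input_ids = tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    return_tensors="pt",
)
print(input_ids.shape)  # torch.Size([1, prompt_length])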