lmzjms committed on
Commit
1d93965
1 Parent(s): cfba0dd

Update app.py

Files changed (1)
  1. app.py +59 -164
app.py CHANGED
@@ -17,6 +17,7 @@ AudioGPT can not directly read audios, but it has a list of tools to finish diff
 AudioGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
 Human may provide new audios to AudioGPT with a description. The description helps AudioGPT to understand this audio, but AudioGPT should use tools to finish following tasks, rather than directly imagine from the description.
 Overall, AudioGPT is a powerful audio dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
+
 TOOLS:
 ------
 AudioGPT has access to the following tools:"""
@@ -57,8 +58,6 @@ def cut_dialogue_history(history_memory, keep_last_n_words = 500):
     paragraphs = paragraphs[1:]
     return '\n' + '\n'.join(paragraphs)
 
-
-
 class ConversationBot:
     def __init__(self, load_dict):
         print("Initializing AudioGPT")
@@ -67,6 +66,11 @@ class ConversationBot:
         self.models = dict()
         for class_name, device in load_dict.items():
             self.models[class_name] = globals()[class_name](device=device)
+        for class_name, instance in self.models.items():
+            for e in dir(instance):
+                if e.startswith('inference'):
+                    func = getattr(instance, e)
+                    self.tools.append(Tool(name=func.name, description=func.description, func=func))
 
     def run_text(self, text, state):
         print("===============Running run_text =============")
@@ -79,7 +83,7 @@
             response = res['output']
             state = state + [(text, response)]
             print("Outputs:", state)
-            return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
+            return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
         else:
             tool = res['intermediate_steps'][0][0].tool
             if tool == "Generate Image From User Input Text":
@@ -88,14 +92,14 @@
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
-                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
+                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Detect The Sound Event From The Audio":
                 image_filename = res['intermediate_steps'][0][1]
                 response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
-                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
+                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
                 print("======>Current memory:\n %s" % self.agent.memory)
                 response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
@@ -103,21 +107,22 @@
                 #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print("Outputs:", state)
-                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
+                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Audio Inpainting":
                 audio_filename = res['intermediate_steps'][0][0].tool_input
                 image_filename = res['intermediate_steps'][0][1]
                 print("======>Current memory:\n %s" % self.agent.memory)
+                print(res)
                 response = res['output']
                 state = state + [(text, response)]
                 print("Outputs:", state)
-                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
+                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
             print("======>Current memory:\n %s" % self.agent.memory)
             response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
             audio_filename = res['intermediate_steps'][0][1]
             state = state + [(text, response)]
             print("Outputs:", state)
-            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
+            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)
 
     def run_image_or_audio(self, file, state, txt):
         file_type = file.name[-3:]
@@ -126,9 +131,8 @@
             print("Inputs:", file, state)
             print("======>Previous memory:\n %s" % self.agent.memory)
             audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-            # audio_load = whisper.load_audio(file.name)
-            audio_load, sr = soundfile.read(file.name)
-            soundfile.write(audio_filename, audio_load, samplerate = sr)
+            audio_load = whisper.load_audio(file.name)
+            soundfile.write(audio_filename, audio_load, samplerate = 16000)
             description = self.models['A2T'].inference(audio_filename)
             Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
                            "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
@@ -140,7 +144,7 @@
             #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
             state = state + [(f"*{audio_filename}*", AI_prompt)]
             print("Outputs:", state)
-            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False)
+            return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
         else:
             # print("===============Running run_image =============")
             # print("Inputs:", file, state)
@@ -166,69 +170,13 @@
             state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
             print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
                   f"Current Memory: {self.agent.memory.buffer}")
-            return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False)
-
-    def speech(self, speech_input, state):
-        input_audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-        text = self.models['ASR'].translate_english(speech_input)
-        print("Inputs:", text, state)
-        print("======>Previous memory:\n %s" % self.agent.memory)
-        self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
-        res = self.agent({"input": text})
-        if res['intermediate_steps'] == []:
-            print("======>Current memory:\n %s" % self.agent.memory)
-            response = res['output']
-            output_audio_filename = self.models['TTS'].inference(response)
-            state = state + [(text, response)]
-            print("Outputs:", state)
-            return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
-        else:
-            tool = res['intermediate_steps'][0][0].tool
-            if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Target Sound Detection":
-                print("======>Current memory:\n %s" % self.agent.memory)
-                response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
-                output_audio_filename = self.models['TTS'].inference(res['output'])
-                state = state + [(text, response)]
-                print("Outputs:", state)
-                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
-            elif tool == "Transcribe Speech":
-                print("======>Current memory:\n %s" % self.agent.memory)
-                output_audio_filename = self.models['TTS'].inference(res['output'])
-                response = res['output']
-                state = state + [(text, response)]
-                print("Outputs:", state)
-                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
-            elif tool == "Detect The Sound Event From The Audio":
-                print("======>Current memory:\n %s" % self.agent.memory)
-                image_filename = res['intermediate_steps'][0][1]
-                output_audio_filename = self.models['TTS'].inference(res['output'])
-                response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
-                state = state + [(text, response)]
-                print("Outputs:", state)
-                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
-            elif tool == "Generate a talking human portrait video given a input Audio":
-                video_filename = res['intermediate_steps'][0][1]
-                print("======>Current memory:\n %s" % self.agent.memory)
-                response = res['output']
-                output_audio_filename = self.models['TTS'].inference(res['output'])
-                state = state + [(text, response)]
-                print("Outputs:", state)
-                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(value=video_filename,visible=True)
-            print("======>Current memory:\n %s" % self.agent.memory)
-            response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
-            audio_filename = res['intermediate_steps'][0][1]
-            Res = "The audio file has been generated and the audio is "
-            output_audio_filename = merge_audio(self.models['TTS'].inference(Res), audio_filename)
-            print(output_audio_filename)
-            state = state + [(text, response)]
-            response = res['output']
-            print("Outputs:", state)
-            return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
+            return state, state, txt + f'{txt} {image_filename} ', gr.Audio.update(visible=False)
 
     def inpainting(self, state, audio_filename, image_filename):
         print("===============Running inpainting =============")
         print("Inputs:", state)
         print("======>Previous memory:\n %s" % self.agent.memory)
+        # inpaint = Inpaint(device="cpu")
         new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
         AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
         self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
@@ -238,50 +186,21 @@
         return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
     def clear_audio(self):
         return gr.Audio.update(value=None, visible=False)
-    def clear_input_audio(self):
-        return gr.Audio.update(value=None)
     def clear_image(self):
         return gr.Image.update(value=None, visible=False)
-    def clear_video(self):
-        return gr.Video.update(value=None, visible=False)
     def clear_button(self):
         return gr.Button.update(visible=False)
-
-    def init_agent(self, openai_api_key, interaction_type):
-        if interaction_type == "text":
-            for class_name, instance in self.models.items():
-                for e in dir(instance):
-                    if e.startswith('inference'):
-                        func = getattr(instance, e)
-                        self.tools.append(Tool(name=func.name, description=func.description, func=func))
-            self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
-            self.agent = initialize_agent(
-                self.tools,
-                self.llm,
-                agent="conversational-react-description",
-                verbose=True,
-                memory=self.memory,
-                return_intermediate_steps=True,
-                agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
-            return gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)
-        else:
-            for class_name, instance in self.models.items():
-                if class_name != 'T2A' and class_name != 'I2A' and class_name != 'Inpaint' and class_name != 'ASR' and class_name != 'SoundDetection' and class_name != 'Speech_Enh_SC' and class_name != 'Speech_SS':
-                    for e in dir(instance):
-                        if e.startswith('inference'):
-                            func = getattr(instance, e)
-                            self.tools.append(Tool(name=func.name, description=func.description, func=func))
-
-            self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
-            self.agent = initialize_agent(
-                self.tools,
-                self.llm,
-                agent="conversational-react-description",
-                verbose=True,
-                memory=self.memory,
-                return_intermediate_steps=True,
-                agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
-            return gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)
+    def init_agent(self, openai_api_key):
+        self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
+        self.agent = initialize_agent(
+            self.tools,
+            self.llm,
+            agent="conversational-react-description",
+            verbose=True,
+            memory=self.memory,
+            return_intermediate_steps=True,
+            agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
+        return gr.update(visible = True)
 
 
 
@@ -297,54 +216,39 @@ if __name__ == '__main__':
         'SoundDetection': 'cpu',
         'Binaural': 'cuda:0',
         'SoundExtraction': 'cuda:0',
-        'TargetSoundDetection': 'cuda:0',
-        'Speech_Enh_SC': 'cuda:0',
-        'Speech_SS': 'cuda:0'
+        'TargetSoundDetection': 'cuda:0'
         })
-    with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
-        with gr.Row():
-            gr.Markdown("## AudioGPT")
-        chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT", visible=False)
-        state = gr.State([])
+    with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
+        gr.Markdown(_DESCRIPTION)
 
-        with gr.Row() as select_raws:
-            with gr.Column(scale=0.7):
-                interaction_type = gr.Radio(choices=['text', 'speech'], value='text', label='Interaction Type')
+        with gr.Row():
            openai_api_key_textbox = gr.Textbox(
                placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
                show_label=False,
                lines=1,
                type="password",
            )
-        with gr.Row(visible=False) as text_input_raws:
+
+        chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
+        state = gr.State([])
+        with gr.Row(visible = False) as input_raws:
             with gr.Column(scale=0.7):
                 txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
             with gr.Column(scale=0.1, min_width=0):
                 run = gr.Button("🏃‍♂️Run")
             with gr.Column(scale=0.1, min_width=0):
-                clear_txt = gr.Button("🔄Clear️")
+                clear = gr.Button("🔄Clear️")
             with gr.Column(scale=0.1, min_width=0):
                 btn = gr.UploadButton("🖼️Upload", file_types=["image","audio"])
-
-        with gr.Row():
-            outaudio = gr.Audio(visible=False)
-        with gr.Row():
-            with gr.Column(scale=0.3, min_width=0):
-                outvideo = gr.Video(visible=False)
-        with gr.Row():
-            show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
-        with gr.Row():
-            run_button = gr.Button("Predict Masked Place",visible=False)
-
-        with gr.Row(visible=False) as speech_input_raws:
-            with gr.Column(scale=0.7):
-                speech_input = gr.Audio(source="microphone", type="filepath", label="Input")
-            with gr.Column(scale=0.15, min_width=0):
-                submit_btn = gr.Button("🏃‍♂️Submit")
-            with gr.Column(scale=0.15, min_width=0):
-                clear_speech = gr.Button("🔄Clear️")
-        with gr.Row():
-            speech_output = gr.Audio(label="Output",visible=False)
+        with gr.Row():
+            with gr.Column():
+                outaudio = gr.Audio(visible=False)
+        with gr.Row():
+            with gr.Column():
+                show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
+        with gr.Row():
+            with gr.Column():
+                run_button = gr.Button("Predict Masked Place",visible=False)
         gr.Examples(
             examples=["Generate a speech with text 'here we go'",
                       "Transcribe this speech",
@@ -361,27 +265,18 @@ if __name__ == '__main__':
             inputs=txt
         )
 
-        openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox, interaction_type], [select_raws, chatbot, text_input_raws, speech_input_raws])
-
-        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
+        openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
+        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
         txt.submit(lambda: "", None, txt)
-        run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
+        run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
         run.click(lambda: "", None, txt)
-        btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, outaudio, outvideo])
-        run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, outvideo, run_button])
-        clear_txt.click(bot.memory.clear)
-        clear_txt.click(lambda: [], None, chatbot)
-        clear_txt.click(lambda: [], None, state)
-        clear_txt.click(lambda:None, None, txt)
-        clear_txt.click(bot.clear_button, None, run_button)
-        clear_txt.click(bot.clear_image, None, show_mel)
-        clear_txt.click(bot.clear_audio, None, outaudio)
-        clear_txt.click(bot.clear_video, None, outvideo)
-
-        submit_btn.click(bot.speech, [speech_input, state], [speech_input, speech_output, state, outvideo])
-        clear_speech.click(bot.clear_input_audio, None, speech_input)
-        clear_speech.click(bot.clear_audio, None, speech_output)
-        clear_speech.click(lambda: [], None, state)
-        clear_speech.click(bot.clear_video, None, outvideo)
-
+        btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
+        run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
+        clear.click(bot.memory.clear)
+        clear.click(lambda: [], None, chatbot)
+        clear.click(lambda: [], None, state)
+        clear.click(lambda:None, None, txt)
+        clear.click(bot.clear_button, None, run_button)
+        clear.click(bot.clear_image, None, show_mel)
+        clear.click(bot.clear_audio, None, outaudio)
    demo.launch(server_name="0.0.0.0", server_port=7860)
 
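For readers unfamiliar with the Gradio pattern the updated app.py relies on, the key-gated UI works by wiring the API-key textbox's submit event to a handler that returns gr.update(visible=True) for a hidden row, exactly as init_agent does for input_raws above. Below is a minimal, self-contained sketch of that pattern; it is not part of the commit, the names unlock, key_box, and hidden_row are hypothetical, and it assumes the Gradio 3.x Blocks API used throughout this diff.

import gradio as gr

def unlock(api_key):
    # In app.py, init_agent() builds the LangChain agent from the key here
    # before revealing the row; this sketch only returns the visibility update.
    return gr.update(visible=True)

with gr.Blocks() as demo:
    key_box = gr.Textbox(type="password", show_label=False,
                         placeholder="Paste your OpenAI API key and press Enter")
    with gr.Row(visible=False) as hidden_row:  # hidden until a key is submitted
        txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter")
    # Submitting the key runs unlock() and applies its gr.update() to hidden_row.
    key_box.submit(unlock, [key_box], [hidden_row])

demo.launch()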