lmzjms committed on
Commit
9fe0f7e
·
1 Parent(s): 3032eab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -66
app.py CHANGED
@@ -48,67 +48,12 @@ def cut_dialogue_history(history_memory, keep_last_n_words = 500):
48
  last_n_tokens = last_n_tokens - len(paragraphs[0].split(' '))
49
  paragraphs = paragraphs[1:]
50
  return '\n' + '\n'.join(paragraphs)
51
-
52
  class ConversationBot:
53
  def __init__(self):
54
  print("Initializing AudioChatGPT")
55
- self.llm = OpenAI(temperature=0)
56
- self.t2i = T2I(device="cuda:0")
57
- self.i2t = ImageCaptioning(device="cuda:1")
58
- self.t2a = T2A(device="cuda:0")
59
- self.tts = TTS(device="cuda:0")
60
- self.t2s = T2S(device="cuda:2")
61
- self.i2a = I2A(device="cuda:1")
62
- self.a2t = A2T(device="cuda:2")
63
- self.asr = ASR(device="cuda:1")
64
- self.inpaint = Inpaint(device="cuda:0")
65
- #self.tts_ood = TTS_OOD(device="cuda:0")
66
  self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
67
- self.tools = [
68
- Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
69
- description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
70
- "The input to this tool should be a string, representing the text used to generate image. "),
71
- Tool(name="Get Photo Description", func=self.i2t.inference,
72
- description="useful for when you want to know what is inside the photo. receives image_path as input. "
73
- "The input to this tool should be a string, representing the image_path. "),
74
- Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
75
- description="useful for when you want to generate an audio from a user input text and it saved it to a file."
76
- "The input to this tool should be a string, representing the text used to generate audio."),
77
- # Tool(
78
- # name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
79
- # description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
80
- # "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
81
- # "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
82
- Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
83
- description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
84
- "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
85
- "If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
86
- "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
87
- "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
88
- Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
89
- description="useful for when you want to convert a user input text into speech audio it saved it to a file."
90
- "The input to this tool should be a string, representing the text used to be converted to speech."),
91
- Tool(name="Generate Audio From The Image", func=self.i2a.inference,
92
- description="useful for when you want to generate an audio based on an image."
93
- "The input to this tool should be a string, representing the image_path. "),
94
- Tool(name="Generate Text From The Audio", func=self.a2t.inference,
95
- description="useful for when you want to describe an audio in text, receives audio_path as input."
96
- "The input to this tool should be a string, representing the audio_path."),
97
- Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
98
- description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
99
- "The input to this tool should be a string, representing the audio_path."),
100
- Tool(name="Transcribe speech", func=self.asr.inference,
101
- description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
102
- "The input to this tool should be a string, representing the audio_path.")]
103
- self.agent = initialize_agent(
104
- self.tools,
105
- self.llm,
106
- agent="conversational-react-description",
107
- verbose=True,
108
- memory=self.memory,
109
- return_intermediate_steps=True,
110
- agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
111
-
112
  def run_text(self, text, state):
113
  print("===============Running run_text =============")
114
  print("Inputs:", text, state)
@@ -125,7 +70,7 @@ class ConversationBot:
125
  tool = res['intermediate_steps'][0][0].tool
126
  if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
127
  print("======>Current memory:\n %s" % self.agent.memory)
128
- response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
129
  state = state + [(text, response)]
130
  print("Outputs:", state)
131
  return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
@@ -140,7 +85,7 @@ class ConversationBot:
140
  print("Outputs:", state)
141
  return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
142
  print("======>Current memory:\n %s" % self.agent.memory)
143
- response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
144
  audio_filename = res['intermediate_steps'][0][1]
145
  state = state + [(text, response)]
146
  print("Outputs:", state)
@@ -185,7 +130,7 @@ class ConversationBot:
185
  AI_prompt = "Received. "
186
  self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
187
  print("======>Current memory:\n %s" % self.agent.memory)
188
- state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
189
  print("Outputs:", state)
190
  return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)
191
 
@@ -195,7 +140,7 @@ class ConversationBot:
195
  print("======>Previous memory:\n %s" % self.agent.memory)
196
  inpaint = Inpaint(device="cuda:0")
197
  new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
198
- AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
199
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
200
  print("======>Current memory:\n %s" % self.agent.memory)
201
  state = state + [(f"Audio Inpainting", AI_prompt)]
@@ -207,30 +152,106 @@ class ConversationBot:
207
  return gr.Image.update(value=None, visible=False)
208
  def clear_button(self):
209
  return gr.Button.update(visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
 
212
  if __name__ == '__main__':
213
  bot = ConversationBot()
 
214
  with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
 
 
 
 
 
 
 
215
  with gr.Row():
216
  gr.Markdown("## Audio ChatGPT")
217
  chatbot = gr.Chatbot(elem_id="chatbot", label="Audio ChatGPT")
218
  state = gr.State([])
219
- with gr.Row():
220
  with gr.Column(scale=0.7):
221
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
222
  with gr.Column(scale=0.15, min_width=0):
223
  clear = gr.Button("Clear️")
224
  with gr.Column(scale=0.15, min_width=0):
225
  btn = gr.UploadButton("Upload", file_types=["image","audio"])
226
- with gr.Column():
227
- outaudio = gr.Audio(visible=False)
228
- with gr.Row():
229
  with gr.Column():
230
  show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
231
  run_button = gr.Button("Predict Masked Place",visible=False)
 
 
 
 
 
 
 
 
 
 
 
232
 
233
-
234
  txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
235
  txt.submit(lambda: "", None, txt)
236
  btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
 
48
  last_n_tokens = last_n_tokens - len(paragraphs[0].split(' '))
49
  paragraphs = paragraphs[1:]
50
  return '\n' + '\n'.join(paragraphs)
51
+
52
  class ConversationBot:
53
  def __init__(self):
54
  print("Initializing AudioChatGPT")
55
+ self.tools = []
 
 
 
 
 
 
 
 
 
 
56
  self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def run_text(self, text, state):
58
  print("===============Running run_text =============")
59
  print("Inputs:", text, state)
 
70
  tool = res['intermediate_steps'][0][0].tool
71
  if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
72
  print("======>Current memory:\n %s" % self.agent.memory)
73
+ response = re.sub('(image/\S*png)', lambda m: f'![]({m.group(0)})*{m.group(0)}*', res['output'])
74
  state = state + [(text, response)]
75
  print("Outputs:", state)
76
  return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
 
85
  print("Outputs:", state)
86
  return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
87
  print("======>Current memory:\n %s" % self.agent.memory)
88
+ response = re.sub('(image/\S*png)', lambda m: f'![]({m.group(0)})*{m.group(0)}*', res['output'])
89
  audio_filename = res['intermediate_steps'][0][1]
90
  state = state + [(text, response)]
91
  print("Outputs:", state)
 
130
  AI_prompt = "Received. "
131
  self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
132
  print("======>Current memory:\n %s" % self.agent.memory)
133
+ state = state + [(f"![]({image_filename})*{image_filename}*", AI_prompt)]
134
  print("Outputs:", state)
135
  return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)
136
 
 
140
  print("======>Previous memory:\n %s" % self.agent.memory)
141
  inpaint = Inpaint(device="cuda:0")
142
  new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
143
+ AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![]({new_image_filename})*{new_image_filename}*"
144
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
145
  print("======>Current memory:\n %s" % self.agent.memory)
146
  state = state + [(f"Audio Inpainting", AI_prompt)]
 
152
  return gr.Image.update(value=None, visible=False)
153
  def clear_button(self):
154
  return gr.Button.update(visible=False)
155
def init_agent(self, openai_api_key):
    """Create the LLM and the conversational agent for the given API key.

    The model-backed tools are loaded the first time this succeeds and are
    reused on later calls, so re-submitting a key only rebuilds the LLM
    wrapper and the agent instead of instantiating every model again.

    Args:
        openai_api_key: OpenAI API key entered by the user. Surrounding
            whitespace (easy to pick up when pasting) is ignored.

    Returns:
        A ``gr.update`` for the chat input row: visible once an agent
        exists; for a blank key the row keeps whatever state matches the
        presence of a previously built agent.
    """
    openai_api_key = (openai_api_key or "").strip()
    if not openai_api_key:
        # Nothing to authenticate with: do not build anything, and only keep
        # the inputs visible if an earlier call already produced an agent.
        return gr.update(visible=getattr(self, "agent", None) is not None)

    self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

    # Only the LLM depends on the key; the models behind the tools do not,
    # so they are instantiated once and kept for subsequent calls.
    if not getattr(self, "tools", None):
        self.tools = self._load_tools()

    self.agent = initialize_agent(
        self.tools,
        self.llm,
        agent="conversational-react-description",
        verbose=True,
        memory=self.memory,
        return_intermediate_steps=True,
        agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
    return gr.update(visible=True)

def _load_tools(self):
    """Instantiate every model wrapper and return the list of agent tools.

    The wrappers are stored on ``self`` under the same attribute names as
    before. Tool names and descriptions are part of the agent prompt (and
    tool names are compared elsewhere in this class), so they are kept
    verbatim.
    """
    self.t2i = T2I(device="cuda:0")
    self.i2t = ImageCaptioning(device="cuda:0")
    self.t2a = T2A(device="cuda:0")
    self.tts = TTS(device="cuda:0")
    self.t2s = T2S(device="cuda:0")
    self.i2a = I2A(device="cuda:0")
    self.a2t = A2T(device="cuda:0")
    self.asr = ASR(device="cuda:0")
    self.inpaint = Inpaint(device="cuda:0")
    #self.tts_ood = TTS_OOD(device="cuda:0")
    return [
        Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
             description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
                         "The input to this tool should be a string, representing the text used to generate image. "),
        Tool(name="Get Photo Description", func=self.i2t.inference,
             description="useful for when you want to know what is inside the photo. receives image_path as input. "
                         "The input to this tool should be a string, representing the image_path. "),
        Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
             description="useful for when you want to generate an audio from a user input text and it saved it to a file."
                         "The input to this tool should be a string, representing the text used to generate audio."),
        # Disabled: style-transfer TTS tool (requires TTS_OOD, see above).
        # Tool(
        #     name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
        #     description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
        #                 "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
        #                 "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
        Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
             description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
                         "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
                         "If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
                         "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
                         "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
        Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
             description="useful for when you want to convert a user input text into speech audio it saved it to a file."
                         "The input to this tool should be a string, representing the text used to be converted to speech."),
        Tool(name="Generate Audio From The Image", func=self.i2a.inference,
             description="useful for when you want to generate an audio based on an image."
                         "The input to this tool should be a string, representing the image_path. "),
        Tool(name="Generate Text From The Audio", func=self.a2t.inference,
             description="useful for when you want to describe an audio in text, receives audio_path as input."
                         "The input to this tool should be a string, representing the audio_path."),
        Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
                         "The input to this tool should be a string, representing the audio_path."),
        Tool(name="Transcribe speech", func=self.asr.inference,
             description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
                         "The input to this tool should be a string, representing the audio_path.")]
212
+
213
 
214
 
215
  if __name__ == '__main__':
216
  bot = ConversationBot()
217
+
218
  with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
219
+ with gr.Row():
220
+ openai_api_key_textbox = gr.Textbox(
221
+ placeholder="Paste your OpenAI API key here to start Visual ChatGPT(sk-...) and press Enter ↵️",
222
+ show_label=False,
223
+ lines=1,
224
+ type="password",
225
+ )
226
  with gr.Row():
227
  gr.Markdown("## Audio ChatGPT")
228
  chatbot = gr.Chatbot(elem_id="chatbot", label="Audio ChatGPT")
229
  state = gr.State([])
230
+ with gr.Row(visible = False) as input_raws:
231
  with gr.Column(scale=0.7):
232
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
233
  with gr.Column(scale=0.15, min_width=0):
234
  clear = gr.Button("Clear️")
235
  with gr.Column(scale=0.15, min_width=0):
236
  btn = gr.UploadButton("Upload", file_types=["image","audio"])
237
+ with gr.Column():
238
+ outaudio = gr.Audio(visible=False)
 
239
  with gr.Column():
240
  show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
241
  run_button = gr.Button("Predict Masked Place",visible=False)
242
+ gr.Examples(
243
+ examples=["Generate an audio of a dog barking",
244
+ "Generate an audio of this image",
245
+ "Can you describe the audio with text?",
246
+ "Generate a speech with text 'here we go'",
247
+ "Generate an image of a cat",
248
+ "I want to inpaint this audio",
249
+ # "generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340."
250
+ ],
251
+ inputs=txt
252
+ )
253
 
254
+ openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
255
  txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
256
  txt.submit(lambda: "", None, txt)
257
  btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])