lmzjms committed on
Commit
489cbec
1 Parent(s): 294a303

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -21
app.py CHANGED
@@ -70,7 +70,7 @@ class ConversationBot:
70
  tool = res['intermediate_steps'][0][0].tool
71
  if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
72
  print("======>Current memory:\n %s" % self.agent.memory)
73
- response = re.sub('(image/\S*png)', lambda m: f'![]({m.group(0)})*{m.group(0)}*', res['output'])
74
  state = state + [(text, response)]
75
  print("Outputs:", state)
76
  return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
@@ -85,7 +85,7 @@ class ConversationBot:
85
  print("Outputs:", state)
86
  return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
87
  print("======>Current memory:\n %s" % self.agent.memory)
88
- response = re.sub('(image/\S*png)', lambda m: f'![]({m.group(0)})*{m.group(0)}*', res['output'])
89
  audio_filename = res['intermediate_steps'][0][1]
90
  state = state + [(text, response)]
91
  print("Outputs:", state)
@@ -134,7 +134,7 @@ class ConversationBot:
134
  AI_prompt = "Received. "
135
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
136
  print("======>Current memory:\n %s" % self.agent.memory)
137
- state = state + [(f"![]({image_filename})*{image_filename}*", AI_prompt)]
138
  print("Outputs:", state)
139
  return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)
140
 
@@ -144,7 +144,7 @@ class ConversationBot:
144
  print("======>Previous memory:\n %s" % self.agent.memory)
145
  inpaint = Inpaint(device="cuda:0")
146
  new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
147
- AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![]({new_image_filename})*{new_image_filename}*"
148
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
149
  print("======>Current memory:\n %s" % self.agent.memory)
150
  state = state + [(f"Audio Inpainting", AI_prompt)]
@@ -160,13 +160,13 @@ class ConversationBot:
160
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
161
  # self.t2i = T2I(device="cuda:0")
162
  # self.i2t = ImageCaptioning(device="cuda:0")
163
- self.t2a = T2A(device="cpu")
164
  self.tts = TTS(device="cpu")
165
  # self.t2s = T2S(device="cuda:0")
166
- self.i2a = I2A(device="cpu")
167
- self.a2t = A2T(device="cpu")
168
  # self.asr = ASR(device="cuda:0")
169
- self.inpaint = Inpaint(device="cpu")
170
  #self.tts_ood = TTS_OOD(device="cuda:0")
171
  self.tools = [
172
  # Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
@@ -175,9 +175,9 @@ class ConversationBot:
175
  # Tool(name="Get Photo Description", func=self.i2t.inference,
176
  # description="useful for when you want to know what is inside the photo. receives image_path as input. "
177
  # "The input to this tool should be a string, representing the image_path. "),
178
- Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
179
- description="useful for when you want to generate an audio from a user input text and it saved it to a file."
180
- "The input to this tool should be a string, representing the text used to generate audio."),
181
  # Tool(
182
  # name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
183
  # description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
@@ -191,16 +191,16 @@ class ConversationBot:
191
  # "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
192
  Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
193
  description="useful for when you want to convert a user input text into speech audio it saved it to a file."
194
- "The input to this tool should be a string, representing the text used to be converted to speech."),
195
- Tool(name="Generate Audio From The Image", func=self.i2a.inference,
196
- description="useful for when you want to generate an audio based on an image."
197
- "The input to this tool should be a string, representing the image_path. "),
198
- Tool(name="Generate Text From The Audio", func=self.a2t.inference,
199
- description="useful for when you want to describe an audio in text, receives audio_path as input."
200
- "The input to this tool should be a string, representing the audio_path."),
201
- Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
202
- description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
203
- "The input to this tool should be a string, representing the audio_path.")]
204
  # Tool(name="Transcribe speech", func=self.asr.inference,
205
  # description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
206
  # "The input to this tool should be a string, representing the audio_path.")]
 
70
  tool = res['intermediate_steps'][0][0].tool
71
  if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
72
  print("======>Current memory:\n %s" % self.agent.memory)
73
+ response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
74
  state = state + [(text, response)]
75
  print("Outputs:", state)
76
  return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
 
85
  print("Outputs:", state)
86
  return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
87
  print("======>Current memory:\n %s" % self.agent.memory)
88
+ response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
89
  audio_filename = res['intermediate_steps'][0][1]
90
  state = state + [(text, response)]
91
  print("Outputs:", state)
 
134
  AI_prompt = "Received. "
135
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
136
  print("======>Current memory:\n %s" % self.agent.memory)
137
+ state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
138
  print("Outputs:", state)
139
  return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)
140
 
 
144
  print("======>Previous memory:\n %s" % self.agent.memory)
145
  inpaint = Inpaint(device="cuda:0")
146
  new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
147
+ AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
148
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
149
  print("======>Current memory:\n %s" % self.agent.memory)
150
  state = state + [(f"Audio Inpainting", AI_prompt)]
 
160
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
161
  # self.t2i = T2I(device="cuda:0")
162
  # self.i2t = ImageCaptioning(device="cuda:0")
163
+ # self.t2a = T2A(device="cpu")
164
  self.tts = TTS(device="cpu")
165
  # self.t2s = T2S(device="cuda:0")
166
+ # self.i2a = I2A(device="cpu")
167
+ # self.a2t = A2T(device="cpu")
168
  # self.asr = ASR(device="cuda:0")
169
+ # self.inpaint = Inpaint(device="cpu")
170
  #self.tts_ood = TTS_OOD(device="cuda:0")
171
  self.tools = [
172
  # Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
 
175
  # Tool(name="Get Photo Description", func=self.i2t.inference,
176
  # description="useful for when you want to know what is inside the photo. receives image_path as input. "
177
  # "The input to this tool should be a string, representing the image_path. "),
178
+ # Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
179
+ # description="useful for when you want to generate an audio from a user input text and it saved it to a file."
180
+ # "The input to this tool should be a string, representing the text used to generate audio."),
181
  # Tool(
182
  # name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
183
  # description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
 
191
  # "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
192
  Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
193
  description="useful for when you want to convert a user input text into speech audio it saved it to a file."
194
+ "The input to this tool should be a string, representing the text used to be converted to speech.")]
195
+ # Tool(name="Generate Audio From The Image", func=self.i2a.inference,
196
+ # description="useful for when you want to generate an audio based on an image."
197
+ # "The input to this tool should be a string, representing the image_path. "),
198
+ # Tool(name="Generate Text From The Audio", func=self.a2t.inference,
199
+ # description="useful for when you want to describe an audio in text, receives audio_path as input."
200
+ # "The input to this tool should be a string, representing the audio_path."),
201
+ # Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
202
+ # description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
203
+ # "The input to this tool should be a string, representing the audio_path.")]
204
  # Tool(name="Transcribe speech", func=self.asr.inference,
205
  # description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
206
  # "The input to this tool should be a string, representing the audio_path.")]