not-lain committed on
Commit 8b809c0
2 Parent(s): 7dc22ca af23861

Merge branch 'main' of https://huggingface.co/spaces/TeamTonic/MultiMed

Files changed (1)
  1. app.py +14 -0
app.py CHANGED
@@ -31,6 +31,11 @@ from lang_list import (
     LANG_TO_SPKR_ID,
 )
 
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+#processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
+#model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
+
 
 def process_speech(sound):
     """
 
@@ -221,6 +226,15 @@ def process_and_query(text, image,audio):
     # If an image is provided, process it with OpenAI and use the response as the text query for Vectara
     if image is not None:
         text = process_image_with_openai(image)
+
+    if audio is not None:
+        audio = audio[0].numpy()
+        audio = audio.astype(np.float32)
+        audio = audio / np.max(np.abs(audio))
+        audio = audio * 32768
+        audio = audio.astype(np.int16)
+        audio = audio.tobytes()
+        audio = base64.b64encode(audio).decode('utf-8')
 
     # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
     vectara_response_json = query_vectara(text)
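
The audio branch added here takes the first channel, peak-normalizes it, converts it to 16-bit PCM, and base64-encodes the raw bytes, presumably so the audio can travel in a JSON/HTTP payload. Below is a standalone sketch of the same transformation, assuming (as the diff implies) that audio is a tensor whose first row is the waveform; it adds a guard for silent input that the in-line version lacks, and the encode_audio_b64 name is illustrative only.

import base64
import numpy as np
import torch

def encode_audio_b64(audio: torch.Tensor) -> str:
    """Illustrative helper mirroring the added branch: tensor -> peak-normalized int16 PCM -> base64 text."""
    samples = audio[0].numpy().astype(np.float32)   # first channel, as in the diff
    peak = np.max(np.abs(samples))
    if peak > 0:                                    # avoid division by zero on silent input
        samples = samples / peak
    pcm16 = (samples * 32767).astype(np.int16)      # 32767 keeps a full-scale sample inside the int16 range
    return base64.b64encode(pcm16.tobytes()).decode("utf-8")

Scaling by 32767 instead of the 32768 used in the diff avoids wrapping a full-scale sample past the int16 maximum.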