Raghavan1988 committed on
Commit
7bd7744
1 Parent(s): 4ffa9cc

adding SeamlessM4TModel and a conditional check to see if the user has added an audio file

Browse files
Files changed (1) hide show
  1. app.py +14 -0
app.py CHANGED
@@ -25,6 +25,11 @@ DEFAULT_TARGET_LANGUAGE = "English"
25
  AUDIO_SAMPLE_RATE = 16000.0
26
  MAX_INPUT_AUDIO_LENGTH = 60 # in seconds
27
 
 
 
 
 
 
28
 
29
  def predict(
30
  task_name: str,
@@ -243,6 +248,15 @@ def process_and_query(text, image,audio):
243
  # If an image is provided, process it with OpenAI and use the response as the text query for Vectara
244
  if image is not None:
245
  text = process_image_with_openai(image)
 
 
 
 
 
 
 
 
 
246
 
247
  # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
248
  vectara_response_json = query_vectara(text)
 
25
  AUDIO_SAMPLE_RATE = 16000.0
26
  MAX_INPUT_AUDIO_LENGTH = 60 # in seconds
27
 
28
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
29
+
30
+ processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
31
+ model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
32
+
33
 
34
  def predict(
35
  task_name: str,
 
248
  # If an image is provided, process it with OpenAI and use the response as the text query for Vectara
249
  if image is not None:
250
  text = process_image_with_openai(image)
251
+
252
+ if audio is not None:
253
+ audio = audio[0].numpy()
254
+ audio = audio.astype(np.float32)
255
+ audio = audio / np.max(np.abs(audio))
256
+ audio = audio * 32768
257
+ audio = audio.astype(np.int16)
258
+ audio = audio.tobytes()
259
+ audio = base64.b64encode(audio).decode('utf-8')
260
 
261
  # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
262
  vectara_response_json = query_vectara(text)