MultiMed

Runtime error

Raghavan1988 committed on Nov 7, 2023

Commit

7bd7744

1 Parent(s): 4ffa9cc

Add SeamlessM4TModel and a conditional check to see whether the user has provided audio input

Files changed (1) hide show

app.py CHANGED Viewed

@@ -25,6 +25,11 @@ DEFAULT_TARGET_LANGUAGE = "English"
 AUDIO_SAMPLE_RATE = 16000.0
 MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
 def predict(
     task_name: str,
@@ -243,6 +248,15 @@ def process_and_query(text, image,audio):
         # If an image is provided, process it with OpenAI and use the response as the text query for Vectara
         if image is not None:
             text = process_image_with_openai(image)
         # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
         vectara_response_json = query_vectara(text)

 AUDIO_SAMPLE_RATE = 16000.0
 MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
+model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
 def predict(
     task_name: str,
         # If an image is provided, process it with OpenAI and use the response as the text query for Vectara
         if image is not None:
             text = process_image_with_openai(image)
+        if audio is not None:
+            audio = audio[0].numpy()
+            audio = audio.astype(np.float32)
+            audio = audio / np.max(np.abs(audio))
+            audio = audio * 32768
+            audio = audio.astype(np.int16)
+            audio = audio.tobytes()
+            audio = base64.b64encode(audio).decode('utf-8')
         # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
         vectara_response_json = query_vectara(text)