not-lain committed on
Commit be980dd
1 Parent(s): 2885a71

fixed audio interface

Files changed (4):
  1. 1.wav +0 -0
  2. app.py +25 -66
  3. requirements.txt +1 -0
  4. test.py +21 -46
1.wav ADDED
Binary file (317 kB).
 
app.py CHANGED
@@ -1,80 +1,47 @@
 # Welcome to Team Tonic's MultiMed
-from lang_list import (
-    LANGUAGE_NAME_TO_CODE,
-    S2ST_TARGET_LANGUAGE_NAMES,
-    S2TT_TARGET_LANGUAGE_NAMES,
-    T2TT_TARGET_LANGUAGE_NAMES,
-    TEXT_SOURCE_LANGUAGE_NAMES,
-    LANG_TO_SPKR_ID,
-)
+
 from gradio_client import Client
 import os
 import numpy as np
 import base64
-import torch
-import torchaudio
 import gradio as gr
 import requests
 import json
 import dotenv
-from transformers import AutoProcessor, SeamlessM4TModel
-import torchaudio
+from scipy.io.wavfile import write
 import PIL
 dotenv.load_dotenv()
 
-client = Client("https://facebook-seamless-m4t.hf.space/--replicas/frq8b/")
-
-
-AUDIO_SAMPLE_RATE = 16000.0
-MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
-DEFAULT_TARGET_LANGUAGE = "English"
+client = Client("facebook/seamless_m4t")
 
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
-# model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
-
-
-def process_speech(sound):
-    """
-    processing sound using seamless_m4t
-    """
-    # task_name = "T2TT"
-    result = client.predict(task_name="S2TT",
-                            audio_source="microphone",
-                            input_audio_mic=sound,
-                            input_audio_file=None,
-                            input_text=None,
-                            source_language=None,
-                            target_language="English")
-    print(result)
-    return result[1]
-
-
-def process_speech_using_model(sound):
+def process_speech(audio):
     """
     processing sound using seamless_m4t
     """
-    # task_name = "T2TT"
-    arr, org_sr = torchaudio.load(sound)
-    target_language_code = LANGUAGE_NAME_TO_CODE[DEFAULT_TARGET_LANGUAGE]
-    new_arr = torchaudio.functional.resample(
-        arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
-    max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
-    if new_arr.shape[1] > max_length:
-        new_arr = new_arr[:, :max_length]
-        gr.Warning(
-            f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
-    input_data = processor(
-        audios=new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
-    tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code,
-                                num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
-    text_out = processor.decode(tokens_ids, skip_special_tokens=True)
-
-    return text_out
+    audio_name = f"{np.random.randint(0, 100)}.wav"
+    sr, data = audio
+    write(audio_name, sr, data.astype(np.int16))
+
+    out = client.predict(
+        "S2TT",
+        "file",
+        None,
+        audio_name,
+        "",
+        "French",# source language
+        "English",# target language
+        api_name="/run",
+    )
+    out = out[1]  # get the text
+    try :
+        return f"{out}"
+    except Exception as e :
+        return f"{e}"
+
+
 
 
 def process_image(image) :
@@ -258,15 +225,7 @@ def process_and_query(text, image, audio):
         text = process_image(image)
 
     if audio is not None:
-        # audio = audio[0].numpy()
-        # audio = audio.astype(np.float32)
-        # audio = audio / np.max(np.abs(audio))
-        # audio = audio * 32768
-        # audio = audio.astype(np.int16)
-        # audio = audio.tobytes()
-        # audio = base64.b64encode(audio).decode('utf-8')
         text = process_speech(audio)
-        print(text)
 
     # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
     vectara_response_json = query_vectara(text)
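The fix, in short: Gradio's audio component hands the handler a `(sample_rate, numpy_array)` tuple, which the old code forwarded to the Space as if it were a microphone file path; the new `process_speech` serializes the tuple to a wav file and submits that file through the Space's `/run` endpoint. A minimal standalone sketch of the same round trip, where the 440 Hz test tone and the `probe.wav` filename are illustrative stand-ins for real speech:

```python
import numpy as np
from scipy.io.wavfile import write
from gradio_client import Client

# Gradio's "numpy" audio format: (sample rate in Hz, int16 samples)
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
data = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)  # stand-in for speech

write("probe.wav", sr, data)  # serialize the tuple to a file the Space can ingest

client = Client("facebook/seamless_m4t")
out = client.predict(
    "S2TT",       # task: speech-to-text translation
    "file",       # audio source selector
    None,         # microphone input (unused)
    "probe.wav",  # uploaded audio file
    "",           # input text (unused for S2TT)
    "French",     # source language
    "English",    # target language
    api_name="/run",
)
print(out[1])  # the second element of the result holds the translated text
```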
requirements.txt CHANGED
@@ -5,3 +5,4 @@ torchaudio==2.0.2
 sentencepiece
 python-dotenv
 Pillow
+scipy
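`scipy` is the one new dependency, pulled in only for `scipy.io.wavfile.write`, which both scripts use to dump the `(rate, samples)` tuple to disk. A minimal sketch of the call (the output filename is hypothetical):

```python
import numpy as np
from scipy.io.wavfile import write

sr = 16000                              # sample rate in Hz
samples = np.zeros(sr, dtype=np.int16)  # one second of int16 silence
write("out.wav", sr, samples)           # write(filename, rate, data)
```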
test.py CHANGED
@@ -5,57 +5,32 @@ import requests
 import gradio as gr
 import PIL
 import numpy as np
-
+from scipy.io.wavfile import write
+import gradio_client as grc
 dotenv.load_dotenv()
 
-
-def process_image(image) :
-    # img_name = f"{np.random.randint(0, 100)}.jpg"
-    img_name = f"{1}.jpg"
-    PIL.Image.fromarray(image.astype('uint8'), 'RGB').save(img_name)
-    image = open(img_name, "rb").read()
-    base64_image = base64_image = base64.b64encode(image).decode('utf-8')
-    openai_api_key = os.getenv('OPENAI_API_KEY')
-    # oai_org = os.getenv('OAI_ORG')
-
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {openai_api_key}"
-    }
-
-    payload = {
-        "model": "gpt-4-vision-preview",
-        "messages": [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "What's in this image?"
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{base64_image}"
-                        }
-                    }
-                ]
-            }
-        ],
-        "max_tokens": 300
-    }
-
-    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
-
+client = grc.Client("facebook/seamless_m4t")
+def process_image(audio):
+    # audio_name = f"{np.random.randint(0, 100)}.jpg"
+    audio_name = f"{1}.wav"
+    sr, data = audio
+    write(audio_name, sr, data.astype(np.int16))
+
+    out = client.predict(
+        "S2TT",
+        "file",
+        None,
+        audio_name,
+        "",
+        "French",# source language
+        "English",# target language
+        api_name="/run",
+    )
+    out = out[1]  # get the text
     try :
         return f"{out}"
     except Exception as e :
         return f"{e}"
 
-iface = gr.Interface(fn=process_image, inputs="image", outputs="text")
+iface = gr.Interface(fn=process_image, inputs="audio", outputs="text")
 iface.launch()
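Note that `process_image` in test.py now handles audio; the name is a leftover from the image prototype it replaced. A hedged sketch for exercising the handler without launching the UI, assuming the `1.wav` added in this commit contains a usable speech sample:

```python
from scipy.io.wavfile import read

# read() returns the same (rate, samples) tuple Gradio passes to the handler
sr, data = read("1.wav")
print(process_image((sr, data)))  # prints the translated text, or the error string
```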