# Spaces: Running on T4
# File size: 4,749 Bytes
# Commits: 5a766fd a7361bc 2d522b6 5cae5d7 1727d3b
from __future__ import annotations
import os
import gradio as gr
import numpy as np
import torch
import torchaudio
from seamless_communication.models.inference.translator import Translator
from m4t_app import *
from simuleval_transcoder import *
# from simuleval_transcoder import *
from pydub import AudioSegment
import time
from time import sleep
# m4t_demo()
# When True, blocks() builds the M4T-specific microphone Audio input;
# otherwise it adds format="mp3" to the same component (see blocks() below).
USE_M4T = True
# Module-level transcoder instance.
# NOTE(review): SimulevalTranscoder comes from the star import of
# simuleval_transcoder; its constructor side effects (model loading?) are
# not visible here, and this instance is not referenced in this file — verify
# it is needed at import time.
Transcoder = SimulevalTranscoder()
def translate_audio_file_segment(audio_file):
    """Translate one recorded audio segment via the m4t_app ``predict`` helper.

    Runs a single S2ST (speech-to-speech translation) pass, English ->
    Portuguese, treating ``audio_file`` as microphone input. Returns whatever
    ``predict`` returns — unpacked by the caller as a (translated wav segment,
    translated text) pair.
    """
    print("translate_m4t state")
    s2st_args = dict(
        task_name="S2ST",
        audio_source="microphone",
        input_audio_mic=audio_file,
        input_audio_file=None,
        input_text="",
        source_language="English",
        target_language="Portuguese",
    )
    return predict(**s2st_args)
def translate_m4t_callback(
    audio_file, translated_audio_bytes_state, translated_text_state
):
    """Translate the latest audio segment and fold it into the running state.

    Parameters
    ----------
    audio_file
        Path to the most recently recorded segment (gr.Audio type="filepath").
    translated_audio_bytes_state
        Accumulated translated audio as a (sample_rate, samples) tuple once at
        least one segment has been translated; any non-tuple value (the initial
        None state, or the bytes() written by clear()) means "no audio yet".
    translated_text_state : str
        Accumulated translated text so far.

    Returns
    -------
    list
        Values for the six outputs wired up in blocks(): the raw input
        segment, the newly translated segment, the combined audio, the
        combined text, and the two updated state values.
    """
    translated_wav_segment, translated_text = translate_audio_file_segment(audio_file)
    print('translated_audio_bytes_state', translated_audio_bytes_state)
    print('translated_wav_segment', translated_wav_segment)
    # Fold the new segment into the running (sample_rate, samples) tuple.
    # isinstance() instead of `type(...) is tuple` (idiomatic; also accepts
    # tuple subclasses such as NamedTuple results).
    if not isinstance(translated_audio_bytes_state, tuple):
        # First translated segment: the accumulated state is just this segment.
        translated_audio_bytes_state = translated_wav_segment
    else:
        # np.append without an axis flattens its arguments, concatenating the
        # sample arrays; the sample rate of the first segment is kept.
        translated_audio_bytes_state = (
            translated_audio_bytes_state[0],
            np.append(translated_audio_bytes_state[1], translated_wav_segment[1]),
        )
    translated_text_state += " | " + str(translated_text)
    return [
        audio_file,
        translated_wav_segment,
        translated_audio_bytes_state,
        translated_text_state,
        translated_audio_bytes_state,
        translated_text_state,
    ]
def clear():
    """Reset the accumulated translation state: empty audio bytes, empty text."""
    print("Clearing State")
    return [b"", ""]
def blocks():
    """Build and launch the streaming speech-translation Gradio demo."""
    with gr.Blocks() as demo:
        # Translation accumulated across streaming callbacks.
        translated_audio_bytes_state = gr.State(None)
        translated_text_state = gr.State("")

        # Streaming microphone input; the non-M4T variant additionally
        # requests mp3 format.
        mic_kwargs = dict(
            label="Input Audio",
            type="filepath",
            source="microphone",
            streaming=True,
        )
        if not USE_M4T:
            mic_kwargs["format"] = "mp3"
        input_audio = gr.Audio(**mic_kwargs)

        most_recent_input_audio_segment = gr.Audio(
            label="Recent Input Audio Segment segments",
            format="bytes",
            streaming=True,
        )
        # TODO: Should add combined input audio segments...
        stream_as_bytes_btn = gr.Button("Translate most recent recording segment")
        output_translation_segment = gr.Audio(
            label="Translated audio segment",
            autoplay=False,
            streaming=True,
            type="numpy",
        )
        output_translation_combined = gr.Audio(
            label="Translated audio combined",
            autoplay=False,
            streaming=True,
            type="numpy",
        )
        # Could add output text segment
        stream_output_text = gr.Textbox(label="Translated text")

        # The manual button and every change of the streaming input both run
        # the same translation callback over the same components.
        callback_inputs = [
            input_audio,
            translated_audio_bytes_state,
            translated_text_state,
        ]
        callback_outputs = [
            most_recent_input_audio_segment,
            output_translation_segment,
            output_translation_combined,
            stream_output_text,
            translated_audio_bytes_state,
            translated_text_state,
        ]
        stream_as_bytes_btn.click(
            translate_m4t_callback, callback_inputs, callback_outputs
        )
        input_audio.change(
            translate_m4t_callback, callback_inputs, callback_outputs
        )

        # Clearing the widget or starting a new recording resets both states.
        state_outputs = [translated_audio_bytes_state, translated_text_state]
        input_audio.clear(clear, None, state_outputs)
        input_audio.start_recording(clear, None, state_outputs)

    demo.queue().launch()
# if __name__ == "__main__":
# NOTE(review): the __main__ guard above is commented out, so importing this
# module launches the Gradio app as an import-time side effect — presumably
# intentional for the Spaces runtime; confirm before restoring the guard.
blocks()