File size: 4,749 Bytes
5a766fd
 
 
 
 
 
 
 
 
 
a7361bc
 
2d522b6
 
a7361bc
5cae5d7
 
 
5a766fd
1727d3b
 
 
 
2d522b6
1727d3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a766fd
 
5cae5d7
 
1727d3b
 
5a766fd
5cae5d7
1727d3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d522b6
 
 
1727d3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a766fd
5cae5d7
5a766fd
 
5cae5d7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from __future__ import annotations

import os

import gradio as gr
import numpy as np
import torch
import torchaudio
from seamless_communication.models.inference.translator import Translator


from m4t_app import *
from simuleval_transcoder import *
# from simuleval_transcoder import *

from pydub import AudioSegment
import time
from time import sleep

# m4t_demo()

# Feature flag: when True the input-audio widget omits the mp3 format kwarg
# (see blocks() below); presumably toggles between M4T and another backend.
USE_M4T = True

# Module-level transcoder instance, created once at import time.
# NOTE(review): PascalCase name reads like a class — consider renaming to
# `transcoder`; also unused in the visible code — confirm before removing.
Transcoder = SimulevalTranscoder()

def translate_audio_file_segment(audio_file):
    """Run a one-shot S2ST translation of *audio_file* via ``predict``.

    The request is fixed to English -> Portuguese speech-to-speech
    translation with the microphone as the audio source.

    Returns whatever ``predict`` returns (translated wav segment and text,
    per the caller in ``translate_m4t_callback``).
    """
    print("translate_m4t state")

    # Assemble the fixed request parameters, then dispatch in one call.
    request = dict(
        task_name="S2ST",
        audio_source="microphone",
        input_audio_mic=audio_file,
        input_audio_file=None,
        input_text="",
        source_language="English",
        target_language="Portuguese",
    )
    return predict(**request)


def translate_m4t_callback(
    audio_file, translated_audio_bytes_state, translated_text_state
):
    """Translate the latest audio segment and fold it into the running state.

    Parameters:
        audio_file: path of the most recent recorded audio segment.
        translated_audio_bytes_state: accumulated translated audio so far —
            either a non-tuple sentinel (initially ``None``) or a
            ``(sample_rate, samples)`` pair.
        translated_text_state: accumulated translated text so far.

    Returns a 6-element list matching the Gradio output wiring in
    ``blocks()``: input echo, newest segment, combined audio, combined
    text, and the two updated state values (audio, text).
    """
    translated_wav_segment, translated_text = translate_audio_file_segment(audio_file)
    print('translated_audio_bytes_state', translated_audio_bytes_state)
    print('translated_wav_segment', translated_wav_segment)

    # First segment: state is still the None sentinel, so seed it with the
    # new (sample_rate, samples) pair. Use isinstance rather than the
    # `type(x) is not tuple` anti-pattern (also accepts tuple subclasses).
    if not isinstance(translated_audio_bytes_state, tuple):
        translated_audio_bytes_state = translated_wav_segment
    else:
        # Append the new samples onto the accumulated buffer, keeping the
        # original sample rate. NOTE(review): assumes every segment shares
        # one sample rate — confirm against predict()'s output.
        translated_audio_bytes_state = (
            translated_audio_bytes_state[0],
            np.append(translated_audio_bytes_state[1], translated_wav_segment[1]),
        )

    translated_text_state += " | " + str(translated_text)
    return [
        audio_file,
        translated_wav_segment,
        translated_audio_bytes_state,
        translated_text_state,
        translated_audio_bytes_state,
        translated_text_state,
    ]


def clear():
    """Reset the accumulated translation state.

    Returns a two-element list: empty audio bytes and an empty text string,
    matching the two Gradio state outputs it is wired to in ``blocks()``.
    """
    print("Clearing State")
    empty_audio = bytes()
    empty_text = ""
    return [empty_audio, empty_text]


def blocks():
    """Build and launch the Gradio streaming-translation demo UI.

    Lays out a microphone input plus three outputs (most recent input
    segment, newest translated segment, combined translated audio) and a
    text box, wires the translate callback to both a button click and the
    audio `change` event, and launches the app with a request queue.
    """
    with gr.Blocks() as demo:
        # Per-session accumulators threaded through the callbacks:
        # audio starts as None (seeded on first segment), text as "".
        translated_audio_bytes_state = gr.State(None)
        translated_text_state = gr.State("")

        # input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3")
        # Microphone input, streamed as file paths. The only difference
        # between the two branches is the mp3 `format` kwarg on the non-M4T
        # path.
        if USE_M4T:
            input_audio = gr.Audio(
                label="Input Audio",
                type="filepath",
                source="microphone",
                streaming=True,
            )
        else:
            input_audio = gr.Audio(
                label="Input Audio",
                type="filepath",
                format="mp3",
                source="microphone",
                streaming=True,
            )

        # Echo of the most recent input segment the callback received.
        most_recent_input_audio_segment = gr.Audio(
            label="Recent Input Audio Segment segments",
            format="bytes",
            streaming=True
        )
        # TODO: Should add combined input audio segments...

        stream_as_bytes_btn = gr.Button("Translate most recent recording segment")

        # Newest translated segment only.
        output_translation_segment = gr.Audio(
            label="Translated audio segment",
            autoplay=False,
            streaming=True,
            type="numpy",
        )

        # All translated segments concatenated so far (from the audio state).
        output_translation_combined = gr.Audio(
            label="Translated audio combined",
            autoplay=False,
            streaming=True,
            type="numpy",
        )

        # Could add output text segment
        stream_output_text = gr.Textbox(label="Translated text")

        # Manual trigger: same callback and wiring as the change event below.
        stream_as_bytes_btn.click(
            translate_m4t_callback,
            [input_audio, translated_audio_bytes_state, translated_text_state],
            [
                most_recent_input_audio_segment,
                output_translation_segment,
                output_translation_combined,
                stream_output_text,
                translated_audio_bytes_state,
                translated_text_state,
            ],
        )

        # Auto trigger on each new streamed segment.
        # NOTE(review): with the button above, a click plus a change event can
        # translate the same segment twice — confirm this is intended.
        input_audio.change(
            translate_m4t_callback,
            [input_audio, translated_audio_bytes_state, translated_text_state],
            [
                most_recent_input_audio_segment,
                output_translation_segment,
                output_translation_combined,
                stream_output_text,
                translated_audio_bytes_state,
                translated_text_state,
            ],
        )
        # input_audio.change(stream_bytes, [input_audio, translated_audio_bytes_state, translated_text_state], [most_recent_input_audio_segment, stream_output_text, translated_audio_bytes_state, translated_text_state])
        # input_audio.change(lambda input_audio: recorded_audio, [input_audio], [recorded_audio])
        # Reset accumulated state when the clip is cleared or a new
        # recording starts.
        input_audio.clear(
            clear, None, [translated_audio_bytes_state, translated_text_state]
        )
        input_audio.start_recording(
            clear, None, [translated_audio_bytes_state, translated_text_state]
        )

    demo.queue().launch()


# Restore the (previously commented-out) entry-point guard so that importing
# this module no longer launches the Gradio app as a side effect; running the
# file as a script still starts the demo.
if __name__ == "__main__":
    blocks()