File size: 5,072 Bytes
e667211
 
 
 
 
 
 
f6806df
 
 
 
 
 
 
 
e667211
 
 
 
 
 
49f03db
e667211
 
e58b4bb
 
e667211
e58b4bb
 
 
 
 
 
e667211
 
 
 
 
 
 
e58b4bb
 
 
 
e667211
 
 
 
e58b4bb
e667211
b293439
 
 
3cb13e6
b293439
e667211
 
 
 
e58b4bb
78e56be
b293439
 
 
 
78e56be
 
3cb13e6
b293439
e667211
3cb13e6
e667211
 
24fdc8a
e667211
 
 
 
 
 
 
49f03db
e667211
 
 
 
 
 
 
49f03db
e667211
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import functools
import os
import tempfile
import time
import uuid

import gradio as gr
import soundfile as sf
import torch
import torchaudio

# Display name shown in the language dropdown -> three-letter code passed
# to the model as ``tgt_lang``.
languages = dict(
    [
        ("English", "eng"),
        ("Hindi", "hin"),
        ("Portuguese", "por"),
        ("Russian", "rus"),
        ("Spanish", "spa"),
    ]
)

# Markdown rendered at the top of the UI. Kept as a single literal so the
# embedded HTML badge and the Markdown links stay on their original lines.
# The orphan ``</h3>`` closing tag and the stray trailing double quote from the
# original text were removed because both showed up as broken markup.
welcome_message = """
# Welcome to Tonic's Unity On Device!

Tonic's Unity On Device uses [facebook/seamless-m4t-unity-small](https://huggingface.co/facebook/seamless-m4t-unity-small) for audio translation & accessibility.
Tonic's Unity On Device!🚀 on your own data & in your own way by cloning this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/TeamTonic/SeamlessOnDevice?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
### Join us : 
TeamTonic is always making cool demos! Join our active builder's community on Discord: [Discord](https://discord.gg/GWpVpekp) On Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On Github: [Polytonic](https://github.com/tonic-ai) & contribute to [PolyGPT](https://github.com/tonic-ai/polygpt-alpha)
"""

def save_and_resample_audio(input_audio_path, output_audio_path, resample_rate=16000):
    """Read an audio file, resample it, and write the result to a new file.

    Args:
        input_audio_path: Path of the audio file to read with torchaudio.
        output_audio_path: Path the resampled audio is written to.
        resample_rate: Target sample rate; also used as the sample rate of
            the saved file.
    """
    source_waveform, source_rate = torchaudio.load(input_audio_path)

    # Build the transform in the waveform's own dtype, then apply it directly
    # while saving.
    to_target_rate = torchaudio.transforms.Resample(
        source_rate,
        resample_rate,
        dtype=source_waveform.dtype,
    )
    torchaudio.save(
        output_audio_path,
        to_target_rate(source_waveform),
        resample_rate,
    )

def save_audio(audio_input, output_dir="saved_audio", resample_rate=16000):
    """Write an audio payload to disk and return the path of a resampled copy.

    The raw samples are first written as-is, then a second file resampled to
    ``resample_rate`` is produced next to it via ``save_and_resample_audio``.
    Both files are left in ``output_dir``.

    Args:
        audio_input: A ``(sample_rate, samples)`` pair; it is unpacked in that
            order and the samples are handed to ``soundfile.write``.
        output_dir: Directory that receives both files; created if missing.
        resample_rate: Sample rate of the returned file.

    Returns:
        Path of the resampled WAV file.

    Raises:
        ValueError: If ``audio_input`` is ``None``, i.e. no audio was supplied.
    """
    # Fail with a readable message instead of an unpacking TypeError when the
    # caller has nothing to transcribe.
    if audio_input is None:
        raise ValueError("No audio provided: record or upload a clip first.")
    sample_rate, audio_data = audio_input

    # exist_ok avoids the check-then-create race between parallel requests.
    os.makedirs(output_dir, exist_ok=True)

    # A whole-second timestamp alone collides when two requests arrive in the
    # same second; the random suffix keeps every recording distinct.
    file_name = f"audio_{int(time.time())}_{uuid.uuid4().hex}.wav"
    file_path = os.path.join(output_dir, file_name)
    sf.write(file_path, audio_data, sample_rate)

    resampled_file_path = os.path.join(output_dir, f"resampled_{file_name}")
    save_and_resample_audio(file_path, resampled_file_path, resample_rate)

    return resampled_file_path

@functools.lru_cache(maxsize=1)
def _load_unity_model():
    """Load the TorchScript model from ``unity_on_device.ptl`` on CPU, once.

    The result is cached so later requests reuse the already-loaded module
    instead of deserializing the file again on every call.
    NOTE(review): this assumes the scripted module can be reused across
    requests — confirm if handlers are ever run concurrently.
    """
    return torch.jit.load("unity_on_device.ptl", map_location=torch.device('cpu'))


def speech_to_text(audio_data, tgt_lang):
    """Run the model on an audio clip and return the text it produces.

    Args:
        audio_data: Audio payload forwarded unchanged to ``save_audio``.
        tgt_lang: A key of ``languages`` (e.g. ``"English"``); its mapped code
            is passed to the model as ``tgt_lang``.

    Returns:
        The first element of the model output, or ``""`` when the output is
        empty/falsy.

    Raises:
        KeyError: If ``tgt_lang`` is not a key of ``languages``.
    """
    file_path = save_audio(audio_data)
    audio_input, _ = torchaudio.load(file_path)
    s2t_model = _load_unity_model()
    with torch.no_grad():
        model_output = s2t_model(audio_input, tgt_lang=languages[tgt_lang])
    transcribed_text = model_output[0] if model_output else ""
    print("Speech to Text Model Output:", transcribed_text)

    return transcribed_text


def speech_to_speech_translation(audio_data, tgt_lang):
    """Translate an audio clip and return the text plus a synthesized WAV file.

    Args:
        audio_data: Audio payload forwarded unchanged to ``save_audio``.
        tgt_lang: A key of ``languages``; its mapped code is passed to the
            model as ``tgt_lang``.

    Returns:
        ``(translated_text, output_file)`` where ``output_file`` is the path of
        a WAV file saved at 16000 Hz.

    Raises:
        KeyError: If ``tgt_lang`` is not a key of ``languages``.
    """
    file_path = save_audio(audio_data)
    audio_input, _ = torchaudio.load(file_path)
    s2st_model = _load_unity_model()
    with torch.no_grad():
        translated_text, units, waveform = s2st_model(audio_input, tgt_lang=languages[tgt_lang])

    # Reserve a unique file per request: a single fixed path would let two
    # overlapping requests overwrite each other's audio, and a hard-coded
    # "/tmp" does not exist on every platform.
    with tempfile.NamedTemporaryFile(prefix="result_", suffix=".wav", delete=False) as handle:
        output_file = handle.name
    # unsqueeze(0) adds a leading dimension before saving.
    torchaudio.save(output_file, waveform.unsqueeze(0), sample_rate=16000)
    print("Translated Text:", translated_text)
    print("Units:", units)
    print("Waveform Shape:", waveform.shape)

    return translated_text, output_file


def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    The layout is a welcome banner, one shared target-language dropdown, and
    two collapsed accordions: one wired to ``speech_to_text`` and one wired to
    ``speech_to_speech_translation``.

    Returns:
        The ``gr.Blocks`` instance holding the whole interface.
    """
    with gr.Blocks(theme='ParityError/Anime') as interface:
        gr.Markdown(welcome_message)

        # Single dropdown shared by both tools below.
        input_language = gr.Dropdown(
            list(languages.keys()),
            label="Select Target Language",
            value="English",
        )

        # --- Speech to text -------------------------------------------------
        with gr.Accordion("Speech to Text", open=False):
            stt_audio = gr.Audio(label="Upload or Record Audio")
            stt_text = gr.Text(label="Transcribed Text")
            transcribe = gr.Button("Transcribe")
            transcribe.click(
                speech_to_text,
                inputs=[stt_audio, input_language],
                outputs=stt_text,
            )
            gr.Examples(
                [["audio1.wav"]],
                inputs=[stt_audio],
                outputs=[stt_text],
            )

        # --- Speech to speech -----------------------------------------------
        with gr.Accordion("Speech to Speech Translation", open=False):
            s2st_audio = gr.Audio(label="Upload or Record Audio")
            s2st_text = gr.Text(label="Translated Text")
            s2st_result = gr.Audio(label="Translated Audio", type="filepath")
            translate = gr.Button("Translate")
            translate.click(
                speech_to_speech_translation,
                inputs=[s2st_audio, input_language],
                outputs=[s2st_text, s2st_result],
            )
            gr.Examples(
                [["audio1.wav"]],
                inputs=[s2st_audio],
                outputs=[s2st_text, s2st_result],
            )

    return interface

# Build the UI at import time and start serving it straight away, with
# errors surfaced in the browser and debug mode enabled.
app = create_interface()
app.launch(
    show_error=True,
    debug=True,
)