"""Gradio demo exposing speech-to-text and speech-to-speech translation.

Two TorchScript models are loaded from the working directory and wrapped in
a tabbed Gradio interface: one tab returns text for an uploaded audio file,
the other returns text plus a synthesized waveform saved as a WAV file.
"""
import functools
import os
import tempfile

import gradio as gr
import torch
import torchaudio

# Language code forwarded to both models as ``tgt_lang``. The original script
# referenced TGT_LANG without defining it, so every request failed with a
# NameError. NOTE(review): "eng" is an assumed default -- confirm it against
# the language codes the exported models actually accept. It can be
# overridden through the TGT_LANG environment variable.
TGT_LANG = os.environ.get("TGT_LANG", "eng")

S2T_MODEL_PATH = "unity_on_device_s2t.ptl"
S2ST_MODEL_PATH = "unity_on_device.ptl"

# Sample rate used when writing the translated waveform to disk.
OUTPUT_SAMPLE_RATE = 16000


@functools.lru_cache(maxsize=None)
def _load_model(path):
    """Load the TorchScript model at *path* once and reuse it afterwards."""
    return torch.jit.load(path)


def _load_audio(audio_file):
    """Return the waveform tensor for *audio_file*.

    Args:
        audio_file: Either a filesystem path (``str`` / ``os.PathLike``) or a
            file-like object exposing the path as ``.name``.

    Raises:
        ValueError: If *audio_file* is ``None`` (nothing was uploaded).
    """
    if audio_file is None:
        raise ValueError("No audio file was provided.")
    # Check for path types first: pathlib.Path also has a ``.name`` attribute,
    # but it holds only the final path component, not the full path.
    if isinstance(audio_file, (str, os.PathLike)):
        path = audio_file
    else:
        path = audio_file.name
    # NOTE(review): the file's sample rate is discarded here, as in the
    # original code. If the models expect a fixed input rate, resample
    # before inference -- confirm against the model documentation.
    waveform, _ = torchaudio.load(path)
    return waveform


def speech_to_text(audio_file):
    """Run the speech-to-text model on an uploaded audio file.

    Args:
        audio_file: Path to the audio file, or a file-like object with a
            ``.name`` attribute holding that path.

    Returns:
        The model output for the audio, produced with ``tgt_lang=TGT_LANG``.

    Raises:
        ValueError: If no audio file was provided.
    """
    audio_input = _load_audio(audio_file)
    s2t_model = _load_model(S2T_MODEL_PATH)
    with torch.no_grad():
        text = s2t_model(audio_input, tgt_lang=TGT_LANG)
    return text


def speech_to_speech_translation(audio_file):
    """Run the speech-to-speech model and save the resulting waveform.

    Args:
        audio_file: Path to the audio file, or a file-like object with a
            ``.name`` attribute holding that path.

    Returns:
        A ``(text, output_file)`` tuple, where ``output_file`` is the path of
        the WAV file containing the generated waveform.

    Raises:
        ValueError: If no audio file was provided.
    """
    audio_input = _load_audio(audio_file)
    s2st_model = _load_model(S2ST_MODEL_PATH)
    with torch.no_grad():
        text, _units, waveform = s2st_model(audio_input, tgt_lang=TGT_LANG)

    # Use a unique file per request: a single fixed path would let concurrent
    # requests overwrite each other's result. The file is deliberately not
    # deleted here because the caller still needs to read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    torchaudio.save(
        output_file,
        waveform.unsqueeze(0),
        sample_rate=OUTPUT_SAMPLE_RATE,
    )
    return text, output_file


# Gradio interfaces
# ``gr.Audio(type="filepath")`` replaces the legacy ``gr.inputs.Audio`` with
# ``type="file"``; the handlers above accept both a path and a file object.
iface_s2t = gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(type="filepath", label="Upload Audio for Speech to Text"),
    outputs="text",
    title="Speech to Text",
)

iface_s2st = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(
        type="filepath",
        label="Upload Audio for Speech to Speech Translation",
    ),
    outputs=["text", "audio"],
    title="Speech to Speech Translation",
)

# Combine into a tabbed interface
iface = gr.TabbedInterface(
    [iface_s2t, iface_s2st],
    ["Speech to Text", "Speech to Speech Translation"],
)

# Launch stays at module level so running this file behaves as before.
iface.launch()