import gradio as gr

# Load the pre-trained models from the Hugging Face Hub.
# ASR model: transcribes the uploaded English speech to English text.
speech2text = gr.Interface.load("huggingface/facebook/hubert-large-ls960-ft",
                                inputs=gr.inputs.Audio(label="Upload Audio", type="filepath", source="upload"))

# Translation model: translates the English transcript to Spanish text.
translator = gr.Interface.load("huggingface/Helsinki-NLP/opus-mt-en-es",
                                outputs=gr.outputs.Textbox(label="English to Spanish Translated Text"))

# TTS model: synthesizes the Spanish text as Spanish speech.
text2speech = gr.Interface.load("huggingface/facebook/tts_transformer-es-css10",
                                outputs=gr.outputs.Audio(label="English to Spanish Translated Audio"),
                                allow_flagging="never")
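
# Note (assumption about the Gradio release this script was written against):
# gr.Interface.load("huggingface/<model-id>") runs each model through the hosted
# Hugging Face Inference API rather than downloading weights locally, and the
# inputs/outputs arguments above override the components Gradio would otherwise
# infer for each model.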

                                
# Chain the loaded interfaces: English speech -> English text -> Spanish text -> Spanish audio.
translate = gr.Series(speech2text, translator)  # English speech to Spanish text
en2es = gr.Series(translate, text2speech)       # English speech to Spanish audio
ui = gr.Parallel(translate, en2es)              # show both the Spanish text and the Spanish audio outputs
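
# For reference, a minimal sketch (not called anywhere in this app) of the same first two
# stages run locally with the transformers pipeline API instead of Gradio's Series chaining.
# The model IDs are the ones loaded above; the fairseq TTS model above is normally loaded
# through fairseq rather than transformers, so this sketch stops at the Spanish text.
def en_speech_to_es_text(audio_path):
    from transformers import pipeline  # imported lazily so the app itself does not need transformers
    asr = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft")
    en_to_es = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")
    english_text = asr(audio_path)["text"]  # English speech -> English transcript
    return en_to_es(english_text)[0]["translation_text"]  # English text -> Spanish text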

# Configure the combined Gradio interface.
ui.title = "English to Spanish Speech Translator"
ui.description = """<center>A useful tool for translating English speech to Spanish audio. All pre-trained models are hosted on Hugging Face.</center>"""
ui.examples = [['ljspeech.wav'], ['ljspeech2.wav'], ['longspeech.wav']]
ui.allow_flagging = "never"
ui.theme = "peach"
ui.article = """<h2>Pre-trained model Information</h2>
                <h3>Automatic Speech Recognition</h3>
                <p style='text-align: justify'>The model used for the ASR part of this space is from                
                <a href=\"https://huggingface.co/facebook/hubert-large-ls960-ft">hubert-large-ls960-ft</a> which is pretrained and fine-tuned on <b>960 hours of 
                Librispeech</b> on 16kHz sampled speech audio. This model has a self-reported <b>word error rate (WER)</b> of <b>1.9 
                percent</b> and ranks first in <i>paperswithcode</i> for ASR on Librispeech. More information can be 
                found on its website at <a href=\"https://ai.facebook.com/blog/hubert-self-supervised-representation-learning-for-speech-
                recognition-
                generation-and-compression">hubert-self</a> and 
                original model is under <a href=\"https://github.com/pytorch/fairseq/tree/main/examples/hubert">pytorch/fairseq</a>.</p>
                <h3>Text Translator</h3>
                <p style='text-align: justify'>The English to Spanish text translator pre-trained model is from 
                <a href=\"https://huggingface.co/Helsinki-NLP/opus-mt-en-es">Helsinki-NLP/opus-mt-en-es</a> which is part of the <b>The 
                Tatoeba Translation Challenge 
                (v2021-08-07)</b> as seen from its github repo at 
                <a href=\"https://github.com/Helsinki-NLP/Tatoeba-Challenge">Helsinki-NLP/Tatoeba-Challenge</a>. This project aims to develop 
                machine 
                translation in real-world 
                cases for many languages. </p>
                <h3>Text to Speech</h3>
                <p style='text-align: justify'> The TTS model used is from <a href=\"https://huggingface.co/facebook/tts_transformer-es-
                css10">facebook/tts_transformer-es-
                css10</a>. 
                This model uses the <b>Fairseq(-py)</b> sequence modeling toolkit for speech synthesis, in this case, specifically TTS 
                for Spanish. More information can be seen on their git at 
                <a href=\"https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis">speech_synthesis</a>. </p>
            """           
                                             
# Launch the app; inbrowser=True opens it in a new browser tab when run locally.
ui.launch(inbrowser=True)