import gradio as gr

# Get models
# ASR model for the input speech
ui = gr.Interface.load("huggingface/facebook/hubert-large-ls960-ft",
                       inputs=gr.inputs.Audio(label="Record Audio", type="filepath", source="microphone"))
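
# For reference, the same ASR model can also be called directly with the transformers pipeline
# API instead of gr.Interface.load. This is only an illustrative sketch (assuming transformers
# and torch are installed in the Space) and is not executed by this app:
# from transformers import pipeline
# asr = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft")
# english_text = asr("ljspeech.wav")["text"]  # transcribe one of the example clips below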
                                     
# Translates English text to Spanish text
# translator = gr.Interface.load("huggingface/Helsinki-NLP/opus-mt-en-es",
#                                outputs=gr.outputs.Textbox(label="English to Spanish Translated Text"))
# TTS model for the output speech
# text2speech = gr.Interface.load("huggingface/facebook/tts_transformer-es-css10",
#                                 outputs=gr.outputs.Audio(label="English to Spanish Translated Audio"),
#                                 allow_flagging="never")

                                
# Intended full pipeline (with the ASR interface above assigned to speech2text instead of ui):
# ui = gr.Series(speech2text, translator)     # outputs the Spanish text translation
# en2es = gr.Series(translator, text2speech)  # outputs the Spanish audio
# ui = gr.Parallel(translator, en2es)         # shows the Spanish text alongside the Spanish audio
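
# The translation stage of the chain above can likewise be sketched with a plain transformers
# pipeline. Illustrative only: the model name comes from the article below, everything else is
# an assumption, and nothing here is executed by this app:
# from transformers import pipeline
# translator_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")
# spanish_text = translator_pipe(english_text)[0]["translation_text"]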

# Gradio interface
ui.title = "English to Spanish Speech Translator"
ui.description = """<center>A tool for translating English speech into Spanish audio. All pre-trained models are available on Hugging Face.</center>"""
ui.examples = [['ljspeech.wav'],['ljspeech2.wav'], ['longspeech.wav']]
ui.allow_flagging = "never" 
ui.theme = "peach"
ui.article = """<h2>Pre-trained Model Information</h2>
                <h3>Automatic Speech Recognition</h3>
                <p style='text-align: justify'>The model used for the ASR part of this space is
                <a href="https://huggingface.co/facebook/hubert-large-ls960-ft">hubert-large-ls960-ft</a>, which is pretrained and fine-tuned on
                <b>960 hours of Librispeech</b> 16 kHz sampled speech audio. The model has a self-reported <b>word error rate (WER)</b> of
                <b>1.9 percent</b> and ranks first on <i>Papers with Code</i> for ASR on Librispeech. More information can be found in the blog post
                <a href="https://ai.facebook.com/blog/hubert-self-supervised-representation-learning-for-speech-recognition-generation-and-compression">HuBERT: Self-supervised representation learning</a>,
                and the original model is available under <a href="https://github.com/pytorch/fairseq/tree/main/examples/hubert">pytorch/fairseq</a>.</p>
                <h3>Text Translator</h3>
                <p style='text-align: justify'>The English-to-Spanish text translation model is
                <a href="https://huggingface.co/Helsinki-NLP/opus-mt-en-es">Helsinki-NLP/opus-mt-en-es</a>, which is part of <b>The Tatoeba Translation Challenge
                (v2021-08-07)</b>, as described in its GitHub repository at
                <a href="https://github.com/Helsinki-NLP/Tatoeba-Challenge">Helsinki-NLP/Tatoeba-Challenge</a>. The project aims to develop
                machine translation for a wide range of languages in real-world use cases.</p>
                <h3>Text to Speech</h3>
                <p style='text-align: justify'>The TTS model used is
                <a href="https://huggingface.co/facebook/tts_transformer-es-css10">facebook/tts_transformer-es-css10</a>.
                It uses the <b>fairseq(-py)</b> sequence modeling toolkit for speech synthesis, in this case specifically TTS for Spanish.
                More information is available in the fairseq repository under
                <a href="https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis">speech_synthesis</a>.</p>
            """
                                             
ui.launch(inbrowser=True)
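
# The fairseq TTS model described in the article is typically driven through fairseq's
# TTSHubInterface, roughly following the pattern shown on its model card. The lines below are an
# illustrative, commented-out sketch (assuming fairseq and its dependencies are installed) and
# are not executed here:
# from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
# from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
# models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
#     "facebook/tts_transformer-es-css10",
#     arg_overrides={"vocoder": "hifigan", "fp16": False},
# )
# model = models[0]
# TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
# generator = task.build_generator([model], cfg)
# sample = TTSHubInterface.get_model_input(task, "Hola, esto es una prueba.")
# wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)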