File size: 2,547 Bytes
3119dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da923e2
3119dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1416675
3119dd6
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os 
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model

from engine import SpeechToTextEngine
import wave
from nemo_asr import transcribe


# Install a blanket "ignore" filter for warnings. This is done before the
# SpeechBrain import below, so it is already active while that package loads.
warnings.filterwarnings(action="ignore")

from speechbrain.pretrained import EncoderDecoderASR

# SpeechBrain recogniser, created once when this module is executed and then
# shared by every call to the transcription function. `source` names the
# pretrained model and `savedir` is the local directory handed to
# `from_hparams` for it.
asr_model = EncoderDecoderASR.from_hparams(
    savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
    source="speechbrain/asr-wav2vec2-commonvoice-rw",
)
# Manual smoke test, kept for reference:
#   asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")

# define speech-to-text function
def asr_transcript(audio):
    """Transcribe one recorded clip with all three speech-to-text back ends.

    Args:
        audio: The recording supplied by the Gradio ``Audio`` input (which is
            configured with ``type="file"``), or ``None`` when nothing was
            recorded. Its ``name`` attribute is used as the path of the
            audio file; the object itself is forwarded to ``transcribe``.

    Returns:
        A 3-tuple of strings, in the order of the three output textboxes:
        the lower-cased SpeechBrain transcript, the Coqui STT transcript and
        the NeMo Conformer transducer transcript. When ``audio`` is ``None``
        or otherwise falsy, the same explanatory message is repeated in all
        three positions so the result always has one value per output.

    Raises:
        ValueError: If the recorded file can be read but contains no bytes.
    """
    if audio is None:
        prompt = "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
        return prompt, prompt, prompt
    if not audio:
        return "File not valid", "File not valid", "File not valid"

    # Read the raw bytes first so an empty recording is rejected before any
    # of the models is invoked.
    with open(audio.name, "rb") as audio_file:
        audio_bytes = audio_file.read()
    if not audio_bytes:
        raise ValueError("Audio not provided")

    # SpeechBrain works from the file path.
    text_asr = asr_model.transcribe_file(audio.name)

    # NOTE(review): this call receives the file object itself, whereas the
    # other two engines get a path or raw bytes -- confirm that this is what
    # nemo_asr.transcribe expects.
    text_nemo_transducer = transcribe(audio, "stt_rw_conformer_transducer_large")

    # Coqui STT works from the raw bytes read above.
    stt_engine = SpeechToTextEngine()
    text_coqui = stt_engine.run(audio_bytes)

    return text_asr.lower(), text_coqui, text_nemo_transducer
    
# Browser front end: a single microphone recording is passed to
# `asr_transcript`, and its three return values fill the three textboxes
# declared under `outputs`, in order.
# NOTE(review): the `article` text speaks of two models while three output
# boxes are wired up below -- confirm and update the copy if needed.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    inputs=[
        gr.inputs.Audio(
            source="microphone",
            type="file",
            optional=False,
            label="Record from microphone",
        ),
    ],
    outputs=[
        gr.outputs.Textbox(label=caption)
        for caption in (
            "Recognized speech from speechbrain model",
            "Recognized speech from coqui STT model",
            "Recognized speech from NVIDIA Conformer transduver large model",
        )
    ],
    title="Kinyarwanda Speech Recognition",
    description="Record an audio clip from browser using microphone, and let AI do the hard work of transcribing.",
    article="""
    This demo showcases two pretrained STT models the first model from speechbrain(wave2vec+CTC models)(1,2gb) is 30 times larger compared to the coqui STT (deepspeech model)(45mb).
    """,
    # examples=[["sample_1.wav"], ["sample_2.wav"]]
)

# Start serving the interface as soon as the module is executed.
gradio_ui.launch(enable_queue=True)