rutsam committed on
Commit 3119dd6 • 1 Parent(s): 4f6bed6

push the code

Files changed (8)
  1. README.md +1 -1
  2. app.py +65 -0
  3. app_single.py +45 -0
  4. app_upload_model_input.py +48 -0
  5. engine.py +129 -0
  6. nemo_asr.py +22 -0
  7. packages.txt +2 -0
  8. requirements.txt +14 -0
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Kinyarwanda Asr
- emoji: 💩
+ emoji: 🚀
  colorFrom: yellow
  colorTo: indigo
  sdk: gradio
app.py ADDED
@@ -0,0 +1,65 @@
+ import warnings
+
+ import gradio as gr
+
+ from engine import SpeechToTextEngine
+ from nemo_asr import transcribe
+
+ warnings.filterwarnings("ignore")
+
+ from speechbrain.pretrained import EncoderDecoderASR
+
+ asr_model = EncoderDecoderASR.from_hparams(
+     source="speechbrain/asr-wav2vec2-commonvoice-rw",
+     savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
+ )
+ # asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+
+ # define speech-to-text function
+ def asr_transcript(audio):
+     if audio is None:
+         message = ("Please provide audio by uploading a file or by recording audio "
+                    "using the microphone by pressing Record (and allow usage of the microphone)")
+         return message, message, message
+
+     if audio:
+         text_asr = asr_model.transcribe_file(audio.name)
+         text_nemo_transducer = transcribe(audio.name, "stt_rw_conformer_transducer_large")["text"]
+         with open(audio.name, 'rb') as f:
+             audio_proper = f.read()
+         stt_engine = SpeechToTextEngine()
+         data = {}  # optional hot-words and boosts; empty by default
+         if data:
+             stt_engine.add_hot_words(data)
+         if not audio_proper:
+             raise ValueError('Audio not provided')
+         # Running the transcription
+         text_coqui = stt_engine.run(audio_proper)
+         return text_asr.lower(), text_coqui, text_nemo_transducer
+     else:
+         return "File not valid", "File not valid", "File not valid"
+
+
+ gradio_ui = gr.Interface(
+     fn=asr_transcript,
+     title="Kinyarwanda Speech Recognition",
+     description="Record an audio clip from the browser using the microphone, and let AI do the hard work of transcribing.",
+     article="""
+     This demo showcases three pretrained STT models. The SpeechBrain model (wav2vec2 + CTC, about 1.2 GB) is roughly 30 times larger than the Coqui STT (DeepSpeech) model (about 45 MB); the third output comes from an NVIDIA NeMo Conformer transducer model.
+     """,
+     inputs=[gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone")],
+     outputs=[gr.outputs.Textbox(label="Recognized speech from speechbrain model"),
+              gr.outputs.Textbox(label="Recognized speech from coqui STT model"),
+              gr.outputs.Textbox(label="Recognized speech from NVIDIA Conformer transducer large model")],
+     # examples=[["sample_1.wav"], ["sample_2.wav"]]
+ )
+
+ gradio_ui.launch(enable_queue=True)
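For reference, the SpeechBrain model wired into app.py can also be called directly on a file path; a minimal sketch, assuming a local sample file such as sample_1.wav (the name used in the example lists of the other apps):

    from speechbrain.pretrained import EncoderDecoderASR

    # Same checkpoint app.py loads; downloaded into pretrained_models/ on first use.
    asr_model = EncoderDecoderASR.from_hparams(
        source="speechbrain/asr-wav2vec2-commonvoice-rw",
        savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
    )
    print(asr_model.transcribe_file("sample_1.wav"))  # prints the recognized Kinyarwanda text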
app_single.py ADDED
@@ -0,0 +1,45 @@
+ import warnings
+
+ import gradio as gr
+
+ warnings.filterwarnings("ignore")
+
+ from speechbrain.pretrained import EncoderDecoderASR
+
+ asr_model = EncoderDecoderASR.from_hparams(
+     source="speechbrain/asr-wav2vec2-commonvoice-rw",
+     savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
+ )
+ # asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+
+ # define speech-to-text function
+ def asr_transcript(audio):
+     if audio is None:
+         return ("Please provide audio by uploading a file or by recording audio "
+                 "using the microphone by pressing Record (and allow usage of the microphone)")
+
+     if audio:
+         text = asr_model.transcribe_file(audio.name)
+         return text
+     else:
+         return "File not valid"
+
+
+ gradio_ui = gr.Interface(
+     fn=asr_transcript,
+     title="Kinyarwanda Speech Recognition",
+     description="Upload an audio clip or record from the browser using the microphone, and let AI do the hard work of transcribing.",
+     article="""
+     This demo showcases the pretrained SpeechBrain wav2vec2 model for Kinyarwanda.
+     """,
+     inputs=[gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone")],
+     outputs=[gr.outputs.Textbox(label="Recognized speech")],
+     examples=[["sample_1.wav"], ["sample_2.wav"]],
+ )
+
+ gradio_ui.launch(enable_queue=True)
app_upload_model_input.py ADDED
@@ -0,0 +1,48 @@
+ import warnings
+
+ import gradio as gr
+
+ warnings.filterwarnings("ignore")
+
+ from speechbrain.pretrained import EncoderDecoderASR
+
+ asr_model = EncoderDecoderASR.from_hparams(
+     source="speechbrain/asr-wav2vec2-commonvoice-rw",
+     savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
+ )
+ # asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+
+ # define speech-to-text function
+ def asr_transcript(audio, audio_microphone, model_params):
+     # model_params is currently unused; only the SpeechBrain model is wired up.
+     audio = audio_microphone if audio_microphone else audio
+
+     if audio is None and audio_microphone is None:
+         return ("Please provide audio by uploading a file or by recording audio "
+                 "using the microphone by pressing Record (and allow usage of the microphone)")
+
+     if audio:
+         text = asr_model.transcribe_file(audio.name)
+         return text
+     else:
+         return "File not valid"
+
+
+ gradio_ui = gr.Interface(
+     fn=asr_transcript,
+     title="Kinyarwanda Speech Recognition",
+     description="Upload an audio clip or record from the browser using the microphone, and let AI do the hard work of transcribing.",
+     article="""
+     This demo showcases the pretrained SpeechBrain wav2vec2 model for Kinyarwanda.
+     """,
+     inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=True),
+             gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"),
+             gr.inputs.Dropdown(choices=["deepspeech", "coqui (soon)"], type="value", default="deepspeech", label="Select speech recognition model", optional=False)],
+     outputs=[gr.outputs.Textbox(label="Recognized speech")],
+     examples=[["sample_1.wav", "sample_1.wav", "deepspeech"], ["sample_2.wav", "sample_2.wav", "deepspeech"]],
+ )
+
+ gradio_ui.launch(enable_queue=True)
engine.py ADDED
@@ -0,0 +1,129 @@
+ import queue
+ import wave
+ from io import BytesIO
+ from pathlib import Path
+
+ import ffmpeg
+ import numpy as np
+ import webrtcvad
+ import wget
+ from stt import Metadata, Model
+
+
+ def normalize_audio_input(audio):
+     output, err = ffmpeg.input('pipe:0').output(
+         'pipe:1', f='WAV', acodec='pcm_s16le', ac=1, ar='16k',
+         loglevel='error', hide_banner=None
+     ).run(input=audio, capture_stdout=True, capture_stderr=True)
+     if err:
+         raise Exception(err)
+     return output
+
+
+ class Frame(object):
+     """Represents a "frame" of audio data."""
+
+     def __init__(self, frame_bytes, timestamp, duration):
+         self.bytes = frame_bytes
+         self.timestamp = timestamp
+         self.duration = duration
+
+
+ class SpeechToTextEngine:
+     """Class to perform speech-to-text transcription and related functionality."""
+
+     FORMAT = 8
+     SAMPLE_RATE = 16000
+     CHANNELS = 1
+     BLOCKS_PER_SECOND = 50
+
+     def __init__(self, scorer='kinyarwanda.scorer') -> None:
+         """Initialize the Coqui STT (DeepSpeech) model, downloading the files if needed."""
+         if not Path('kinyarwanda.scorer').exists():
+             wget.download("https://huggingface.co/mbazaNLP/kinyarwanda-coqui-stt-model/resolve/main/kinyarwanda.scorer")
+         if not Path('kinyarwanda.tflite').exists():
+             wget.download("https://huggingface.co/mbazaNLP/kinyarwanda-coqui-stt-model/resolve/main/kinyarwanda.tflite")
+
+         self.model = Model('kinyarwanda.tflite')
+         self.model.enableExternalScorer(
+             scorer_path=Path(__file__).parents[0].joinpath(scorer).absolute().as_posix())
+         self.vad = webrtcvad.Vad(mode=3)
+         self.sample_rate = self.SAMPLE_RATE
+         self.buffer_queue = queue.Queue()
+
+     def run(self, audio) -> str:
+         """Receives the audio, normalizes it and sends it to the model to be transcribed.
+         Returns the transcribed audio as a string."""
+
+         normalized_audio = normalize_audio_input(audio)
+         audio_streams = BytesIO(normalized_audio)
+         with wave.open(audio_streams, 'rb') as wav:
+             audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
+         results = self.model.stt(audio_buffer=audio_streams)
+         return results
+
+     def run_with_metadata(self, audio) -> Metadata:
+         normalized_audio = normalize_audio_input(audio)
+         audio_streams = BytesIO(normalized_audio)
+         with wave.open(audio_streams, 'rb') as wav:
+             audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
+         results = self.model.sttWithMetadata(audio_buffer=audio_streams)
+         return results
+
+     def add_hot_words(self, data) -> list:
+         """Receives data in the form of hot-words and boosts, adds them to the language model
+         and returns the list of added hot-words."""
+
+         all_hot_words = []
+         try:
+             print('----------------------------------------------------')
+             for hot_word in data:
+                 # Change all the characters of the hot-word to lower case
+                 word = hot_word.lower()
+
+                 # Get numeric value of the boost
+                 boost = float(data.get(hot_word))
+
+                 # Adding the hot-word and its boost to the language model
+                 self.model.addHotWord(word, boost)
+
+                 # Printing the activity on the prompt
+                 print(f"`{word}` hot-word with boost `{boost}` was added.")
+                 all_hot_words.append(word)
+             return all_hot_words
+         except RuntimeError:
+             return []
+
+     def erase_hot_word(self, hot_words) -> None:
+         try:
+             for hot_word in hot_words:
+                 self.model.eraseHotWord(hot_word)
+                 print(f"`{hot_word}` hot-word is erased.")
+             print('----------------------------------------------------')
+         except RuntimeError:
+             return
+
+     def clear_hot_words(self) -> str:
+         try:
+             self.model.clearHotWords()
+             return "All hot-words were erased."
+         except RuntimeError:
+             return "No more hot-words are left."
+
+     def deep_stream(self):
+         return self.model.createStream()
+
+     def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
+         """
+         Takes the desired frame duration in milliseconds, the PCM data, and
+         the sample rate. Yields Frames of the requested duration.
+         """
+
+         # audio = np.frombuffer(audio, np.int16)
+         # Each 16-bit sample is two bytes, hence the factor of 2 below.
+         n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
+         offset = 0
+         timestamp = 0.0
+         duration = (float(n) / sample_rate) / 2.0
+         while offset + n < len(audio):
+             yield Frame(audio[offset:offset + n], timestamp, duration)
+             timestamp += duration
+             offset += n
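For reference, a minimal local usage sketch of the Coqui engine above, assuming a local sample_1.wav (the sample file referenced in the app examples) and that the model files can be downloaded:

    from engine import SpeechToTextEngine

    # Downloads kinyarwanda.tflite and kinyarwanda.scorer on first construction.
    engine = SpeechToTextEngine()

    # run() takes the raw bytes of an audio file; they are normalized to
    # 16 kHz mono 16-bit PCM with ffmpeg before decoding.
    with open("sample_1.wav", "rb") as f:
        audio_bytes = f.read()

    print(engine.run(audio_bytes))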
nemo_asr.py ADDED
@@ -0,0 +1,22 @@
+ import nemo.collections.asr as nemo_asr
+ from pydub import AudioSegment
+
+
+ def transcribe(path, modelName="stt_rw_conformer_transducer_large"):
+     """Transcribe the audio file at `path` with a pretrained NVIDIA NeMo transducer model."""
+     asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name=modelName)
+
+     # The model expects 16 kHz, 16-bit mono WAV input; convert anything else with pydub.
+     if not path.endswith("wav"):
+         sound = AudioSegment.from_file(path)
+         path = path + ".wav"
+     else:
+         sound = AudioSegment.from_wav(path)
+     sound.set_frame_rate(16000).set_channels(1).set_sample_width(2).export(path, format="wav")
+
+     files = [path]
+     for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
+         print(f"Audio in {fname} was recognized as: {transcription}")
+     print(transcription[0])
+     return {"text": transcription[0], "filename": path}
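A minimal sketch of calling the helper above directly, assuming sample_1.wav exists locally; the pretrained NeMo checkpoint is fetched from NGC on first use:

    from nemo_asr import transcribe

    result = transcribe("sample_1.wav")  # defaults to stt_rw_conformer_transducer_large
    print(result["text"])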
packages.txt ADDED
@@ -0,0 +1,2 @@
+ libsndfile1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ librosa==0.9.1
+ soundfile==0.10.3.post1
+ torch==1.11.0
+ transformers==4.18.0
+ speechbrain
+ stt
+ webrtcvad
+ numpy
+ ffmpeg-python
+ pydub
+ wget
+ aiofiles
+ git+https://github.com/NVIDIA/NeMo.git