# Kinyarwanda-asr / app.py
import warnings

import gradio as gr

from engine import SpeechToTextEngine
from nemo_asr import transcribe
from speechbrain.pretrained import EncoderDecoderASR

warnings.filterwarnings("ignore")
# Load the pretrained SpeechBrain wav2vec2 + CTC model; from_hparams fetches
# and caches the weights in savedir on first use.
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-wav2vec2-commonvoice-rw",
    savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
)
# Example: asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
# Speech-to-text function: run the same recording through all three models
# and return one transcript per model (SpeechBrain, Coqui STT, NeMo transducer).
def asr_transcript(audio):
    if audio is None:
        prompt = (
            "Please provide audio by uploading a file or by recording audio "
            "using the microphone by pressing Record (and allow usage of the microphone)"
        )
        # The interface has three outputs, so return the prompt for each of them.
        return prompt, prompt, prompt

    # SpeechBrain wav2vec2 + CTC model.
    text_asr = asr_model.transcribe_file(audio.name)
    # NVIDIA NeMo Conformer transducer model (the file object is passed through).
    text_nemo_transducer = transcribe(audio, "stt_rw_conformer_transducer_large")

    # Coqui STT works on the raw audio bytes.
    with open(audio.name, "rb") as f:
        audio_proper = f.read()
    if not audio_proper:
        raise ValueError("Audio not provided")

    stt_engine = SpeechToTextEngine()
    # Hot words could bias the Coqui decoder; none are configured here.
    data = {}
    if data:
        stt_engine.add_hot_words(data)

    # Running the transcription.
    text_coqui = stt_engine.run(audio_proper)
    return text_asr.lower(), text_coqui, text_nemo_transducer
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Kinyarwanda Speech Recognition",
    description="Record an audio clip from the browser using the microphone, and let AI do the hard work of transcribing.",
    article="""
    This demo showcases three pretrained STT models. The SpeechBrain model (wav2vec2 + CTC, ~1.2 GB) is about
    30 times larger than the Coqui STT model (a DeepSpeech-style model, ~45 MB); the third is an NVIDIA
    Conformer transducer model.
    """,
    inputs=[gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone")],
    outputs=[
        gr.outputs.Textbox(label="Recognized speech from SpeechBrain model"),
        gr.outputs.Textbox(label="Recognized speech from Coqui STT model"),
        gr.outputs.Textbox(label="Recognized speech from NVIDIA Conformer transducer large model"),
    ],
    # examples = [["sample_1.wav"], ["sample_2.wav"]]
)
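# enable_queue=True makes Gradio queue incoming requests, so long-running
# transcriptions are handled sequentially instead of hitting request timeouts.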
gradio_ui.launch(enable_queue=True)