Spaces:

nata0801
/

RuEn_ASR_with_Voice_Recorder

Runtime error

App Files Files Community

RuEn_ASR_with_Voice_Recorder / app.py

nata0801

Update app.py

1842c19 almost 3 years ago

raw

history blame

2.14 kB

	#Importing all the necessary packages
	import nltk
	import librosa
	import torch
	import gradio as gr
	from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
	# Fetch the NLTK "punkt" resource at import time; correct_casing() below calls
	# nltk.sent_tokenize(). NOTE(review): presumably sent_tokenize needs this
	# resource and newer NLTK releases may expect a different name — confirm.
	nltk.download("punkt")



	def correct_casing(input_sentence):
	    """Capitalize the first character of every sentence in the transcribed text.

	    The text is split into sentences with ``nltk.sent_tokenize``; the
	    sentences are then joined back together with single spaces.
	    """
	    fixed_sentences = []
	    for sentence in nltk.sent_tokenize(input_sentence):
	        # Upper-case only the leading character; the rest is left untouched.
	        leading_char = sentence[0]
	        fixed_sentences.append(leading_char.capitalize() + sentence[1:])
	    return ' '.join(fixed_sentences)



	def asr_transcript(audio_file, language):
	    """Generate a transcript for the audio input with a Wav2Vec2 CTC model.

	    Args:
	        audio_file: Path to the recording, or a file-like object whose
	            ``name`` attribute holds that path. ``None`` (no recording was
	            supplied) produces an empty transcript.
	        language: ``"English"`` or ``"Russian"``; selects the checkpoint.

	    Returns:
	        The lower-cased transcript. The audio is decoded block by block and
	        each decoded block is followed by a single space.

	    Raises:
	        ValueError: If ``language`` is not one of the supported choices.
	    """
	    # The audio input is optional in the UI, so there may be nothing to do.
	    if audio_file is None:
	        return ""

	    # Selecting the language-specific checkpoint.
	    checkpoints = {
	        "English": "facebook/wav2vec2-large-960h-lv60-self",
	        "Russian": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
	    }
	    try:
	        model_name = checkpoints[language]
	    except KeyError:
	        raise ValueError(
	            f"Unsupported language {language!r}; expected one of {sorted(checkpoints)}"
	        ) from None

	    # Both objects are loaded from the checkpoint *name*; the original passed
	    # the not-yet-assigned local ``model`` here, which raised UnboundLocalError.
	    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
	    model = Wav2Vec2ForCTC.from_pretrained(model_name)

	    # Accept either a plain path string or an object exposing ``.name``.
	    audio_path = getattr(audio_file, "name", audio_file)

	    # Read the file in blocks of 20 frames of 16000 samples each.
	    # NOTE(review): this call does not appear to resample; the recording is
	    # presumably already at 16 kHz — confirm.
	    stream = librosa.stream(audio_path, block_length=20, frame_length=16000, hop_length=16000)

	    pieces = []
	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        for speech in stream:
	            # Fold a two-channel block into one signal.
	            # NOTE(review): assumes channels are on the last axis — confirm
	            # against the block layout librosa.stream actually yields.
	            if len(speech.shape) > 1:
	                speech = speech[:, 0] + speech[:, 1]

	            input_values = tokenizer(speech, return_tensors="pt").input_values
	            logits = model(input_values).logits

	            # Greedy CTC decoding: most likely token id at every time step.
	            predicted_ids = torch.argmax(logits, dim=-1)
	            transcription = tokenizer.batch_decode(predicted_ids)[0]
	            pieces.append(transcription.lower() + " ")

	    return "".join(pieces)


	# Input widgets: a microphone recording (optional) and a language selector.
	audio_input = gr.inputs.Audio(
	    source="microphone",
	    type="filepath",
	    optional=True,
	    label="Please record your message/Пожалуйста, введите Ваше сообщение",
	)
	language_input = gr.inputs.Radio(
	    label="Pick a language/Выберите язык",
	    choices=["English", "Russian"],
	)
	# Output widget: the transcribed text.
	text_output = gr.outputs.Textbox(label="Output Text/Результат")

	# Wire asr_transcript to the widgets and start the app.
	demo = gr.Interface(
	    asr_transcript,
	    inputs=[audio_input, language_input],
	    outputs=text_output,
	    title="Automatic speech recognition with voice recorder in Russian and English",
	    description="This application displays transcribed text for given audio input",
	    theme="grass",
	)
	demo.launch()