Spaces:

rexoscare
/

Speech_to_Text_Hindi

Runtime error

App Files Files Community

Speech_to_Text_Hindi / app.py

rexoscare

Upload app.py

516501d almost 3 years ago

raw

history blame

No virus

1.88 kB

	import argparse
	import os
	import subprocess
	from glob import glob

	import gradio as gr
	import soundfile as sf
	import torch
	import torchaudio
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

	# Module-level 48 kHz -> 16 kHz resampling transform.
	# NOTE(review): not referenced by any live code visible in this file.
	resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)

	def get_filename(wav_file):
	    """Convert an audio file to 16 kHz, 16-bit, mono WAV using ``sox``.

	    The converted file is written to ``/tmp/<stem>_16.wav``, where
	    ``<stem>`` is the base name of *wav_file* without its extension.

	    Args:
	        wav_file: Path of the audio file to convert.

	    Returns:
	        Path of the converted WAV file.

	    Raises:
	        FileNotFoundError: If the ``sox`` executable cannot be found.
	        subprocess.CalledProcessError: If ``sox`` exits with a non-zero
	            status.
	    """
	    # splitext drops the real extension whatever its length; slicing off a
	    # fixed four characters mangled names such as "clip.flac".
	    stem = os.path.splitext(os.path.basename(wav_file))[0]
	    filename_new = os.path.join('/tmp', stem + '_16.wav')

	    # Pass an argument list and no shell, so paths containing spaces or
	    # shell metacharacters reach sox as single, uninterpreted arguments.
	    command = [
	        "sox", wav_file,
	        "-r", "16000",
	        "-b", "16",
	        "-c", "1",
	        filename_new,
	    ]
	    # check=True: fail here rather than returning the path of a file that
	    # was never written (or a stale one left by an earlier conversion).
	    subprocess.run(command, check=True)
	    return filename_new

	_MODEL_ID = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
	# Holds the processor/model pair after the first load so later requests
	# do not load the pretrained weights again.
	_LOADED = {}


	def _load_model():
	    """Return the cached ``(processor, model)`` pair, loading it on first use."""
	    if "model" not in _LOADED:
	        _LOADED["processor"] = Wav2Vec2Processor.from_pretrained(_MODEL_ID)
	        _LOADED["model"] = Wav2Vec2ForCTC.from_pretrained(_MODEL_ID)
	    return _LOADED["processor"], _LOADED["model"]


	def parse_transcription(wav_file):
	    """Transcribe an audio recording with the pretrained wav2vec2 model.

	    Args:
	        wav_file: The recorded audio, either a path string or an object
	            whose ``name`` attribute is the path of the audio file.
	            ``None`` (nothing recorded) is accepted.

	    Returns:
	        The decoded transcription, or an empty string when *wav_file* is
	        ``None``.
	    """
	    if wav_file is None:
	        # Nothing was recorded; there is no file to convert or decode.
	        return ""

	    processor, model = _load_model()

	    # Accept both a plain path and a file-like wrapper exposing ``.name``.
	    source_path = wav_file if isinstance(wav_file, str) else wav_file.name
	    converted_path = get_filename(source_path)
	    audio_input, sample_rate = sf.read(converted_path)

	    # Pad input values and return a PyTorch tensor. The rate read from the
	    # file is forwarded instead of a hard-coded value, so the processor is
	    # told the true rate of the samples it receives.
	    input_values = processor(
	        audio_input, sampling_rate=sample_rate, return_tensors="pt"
	    ).input_values

	    # Inference only: no gradients are needed, so skip autograd tracking.
	    with torch.no_grad():
	        logits = model(input_values).logits

	    # Take the most likely token per frame, then decode to text.
	    predicted_ids = torch.argmax(logits, dim=-1)
	    return processor.decode(predicted_ids[0], skip_special_tokens=True)


	# Text shown on the demo page.
	title = "Speech-to-Text (Hindi) using Vakyansh"
	description = "Upload a hindi audio clip, and let AI do the hard work of transcribing."
	article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2104.06678'>Large-Scale Self- and Semi-Supervised Learning for Speech Translation</a></p>"

	# Microphone recording, handed to the callback as a file object.
	# NOTE(review): gr.inputs.* is the legacy Gradio input namespace; confirm
	# the Gradio version this app is pinned to still provides it.
	microphone_input = gr.inputs.Audio(
	    source="microphone",
	    type="file",
	    label="Record Audio File",
	)

	# Wire the transcription callback to the audio input and a text output,
	# then start the app.
	demo = gr.Interface(
	    fn=parse_transcription,
	    inputs=microphone_input,
	    outputs="text",
	    title=title,
	    description=description,
	    article=article,
	)
	demo.launch(inline=False)