Spaces:

Bishan
/

Speech_To_Text_Hindi

Runtime error

App Files Files Community

Speech_To_Text_Hindi / app.py

Bishan

Update app.py

af2f7e5 over 1 year ago

raw

history blame contribute delete

2.8 kB

	import os
	import subprocess
	import time

	import gradio as gr
	import soundfile as sf
	import sox
	import torch
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM


	def read_file_and_process(wav_file):
	    """Resample an audio file to 16 kHz and convert it into model inputs.

	    A resampled copy is written next to the input as ``<stem>16k.wav``,
	    read back with ``soundfile`` and passed through the module-level
	    ``processor``.

	    Args:
	        wav_file: Path of the audio file to transcribe.

	    Returns:
	        Whatever ``processor(...)`` returns for the decoded samples when
	        called with ``return_tensors="pt"`` and ``padding=True``.
	    """
	    # os.path.splitext strips only the final extension of the last path
	    # component. Splitting on the first "." instead would truncate paths
	    # such as "/tmp/run.1/clip.wav" or "./clip.wav" in the wrong place.
	    stem, _extension = os.path.splitext(wav_file)
	    filename_16k = stem + "16k.wav"
	    resampler(wav_file, filename_16k)

	    # The sample rate reported by soundfile is ignored; the resampler is
	    # asked to produce 16 kHz output, which is what the processor is told.
	    speech, _ = sf.read(filename_16k)
	    print("---------------------------------------------------------")
	    print(speech)
	    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
	    print("---------------------------------------------------------")
	    print(inputs)

	    return inputs


	def resampler(input_file_path, output_file_path):
	    """Convert an audio file to 16 kHz mono by invoking ``ffmpeg``.

	    Args:
	        input_file_path: Path of the source audio file.
	        output_file_path: Path where the converted file is written.

	    Returns:
	        None. The ffmpeg exit status is not inspected, so a failed
	        conversion surfaces later when the output file is read.

	    Raises:
	        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
	    """
	    # Pass the arguments as a list and skip the shell: paths containing
	    # spaces, quotes or shell metacharacters are then handed to ffmpeg
	    # verbatim instead of being re-parsed (or executed) by the shell.
	    command = [
	        "ffmpeg",
	        "-hide_banner",
	        "-loglevel", "panic",
	        # Overwrite an existing output instead of prompting; without this
	        # a leftover file from an earlier run would be kept and reused.
	        "-y",
	        "-i", input_file_path,
	        "-ar", "16000",
	        "-ac", "1",
	        "-bits_per_raw_sample", "16",
	        "-vn",
	        output_file_path,
	    ]
	    subprocess.call(command)


	def parse_transcription_with_lm(logits):
	    """Decode logits with the LM-backed processor.

	    Args:
	        logits: Model output tensor; it is moved to the CPU and converted
	            to a NumPy array before decoding.

	    Returns:
	        The first decoded text with every ``<s>`` marker removed.
	    """
	    logits_on_cpu = logits.cpu().numpy()
	    decoded = processor_with_LM.batch_decode(logits_on_cpu)
	    first_text = decoded.text[0]
	    return first_text.replace('<s>', '')

	def parse_transcription(logits):
	    """Decode logits by taking the highest-scoring id at each position.

	    Args:
	        logits: Model output tensor; the argmax is taken over its last
	            dimension.

	    Returns:
	        The text the module-level ``processor`` decodes from the first
	        row of predicted ids, with special tokens skipped.
	    """
	    best_ids = torch.argmax(logits, dim=-1)
	    first_row = best_ids[0]
	    return processor.decode(first_row, skip_special_tokens=True)

	def parse(wav_file, applyLM):
	    """Transcribe an audio file and print how long the run took.

	    Args:
	        wav_file: Path of the audio file to transcribe.
	        applyLM: Accepted for the UI checkbox but not consulted; the
	            transcript always comes from ``parse_transcription``.

	    Returns:
	        The transcription string.
	    """
	    separator = "------------------------------------------------------------------------------------------"

	    started_at = time.time()
	    model_inputs = read_file_and_process(wav_file)
	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        logits = model(**model_inputs).logits
	    transcript = parse_transcription(logits)
	    elapsed = time.time() - started_at

	    # Report the elapsed wall-clock time in milliseconds and in seconds.
	    print(separator)
	    print("The time of execution of above program is :", elapsed * 10**3, "ms")
	    print("Execution time of the program is- ", elapsed)
	    print(separator)
	    return transcript


	# Checkpoint identifier handed to every from_pretrained call below.
	model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
	# Plain processor: used by read_file_and_process and parse_transcription.
	processor = Wav2Vec2Processor.from_pretrained(model_id)
	# LM-backed processor: in the code shown here it is referenced only by
	# parse_transcription_with_lm.
	processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
	# Model that parse() calls to obtain logits.
	model = Wav2Vec2ForCTC.from_pretrained(model_id)


	# Audio input component; type="filepath" is requested, and parse() treats
	# the value it receives as a path string.
	# NOTE(review): the source="upload" keyword depends on the installed
	# Gradio version - confirm it is still accepted.
	input_ = gr.Audio(source="upload", type="filepath")
	# Text area that displays the string returned by parse().
	txtbox = gr.Textbox(
	label="Output from model will appear here:",
	lines=5
	)
	# Passed to parse() as its second argument (applyLM); parse() does not
	# currently read that argument, so toggling it has no effect.
	chkbox = gr.Checkbox(label="Apply LM", value=False)


	# Build the interface around parse() and start serving it.
	# NOTE(review): streaming, interactive, show_tips and enable_queue may not
	# be accepted by every Gradio release - confirm against the pinned version.
	gr.Interface(parse, inputs = [input_, chkbox], outputs=txtbox,
	streaming=True, interactive=True,
	analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False);