wav2vec2-uk-demo

Build error

App Files Files Community

wav2vec2-uk-demo / inference_gpu.py

peteralexandercharles

Duplicate from Yehor/wav2vec2-uk-demo

8691b07 over 1 year ago

raw history blame contribute delete

No virus

2.55 kB

	import argparse
	import torch
	import torchaudio
	from pathlib import Path
	from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC


	def main(args):
	    """Transcribe the comma-separated audio files in ``args.path_files`` on the GPU.

	    Each file is loaded with torchaudio, down-mixed to mono, resampled to
	    16 kHz when its sample rate differs, run through the CTC model and
	    decoded with the processor's ``batch_decode``. The file path and the
	    first decoded transcription are printed to stdout.

	    Args:
	        args: Parsed command-line namespace. Reads ``model_id``,
	            ``path_files``, ``chunk_length_s``, ``stride_length_s_l`` and
	            ``stride_length_s_r``.
	    """
	    target_sample_rate = 16000

	    processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
	    model = Wav2Vec2ForCTC.from_pretrained(args.model_id)
	    model.to('cuda')

	    # Tolerate spaces around the commas and skip empty entries such as the
	    # one produced by a trailing comma.
	    files = [entry.strip() for entry in args.path_files.split(',') if entry.strip()]

	    for path_file in files:
	        print('File:', path_file)

	        wav_file_path = str(Path(path_file).absolute())
	        waveform, sample_rate = torchaudio.load(wav_file_path)

	        # torchaudio.load returns (channels, frames) by default. A stereo
	        # file would otherwise reach the processor as a 2-D array and be
	        # handled as a batch of two utterances, so average it down to mono.
	        if waveform.size(0) > 1:
	            waveform = waveform.mean(dim=0, keepdim=True)

	        if sample_rate != target_sample_rate:
	            # Rely on the default resampling method (Hann-windowed sinc):
	            # the explicit 'sinc_interpolation' name is deprecated in newer
	            # torchaudio releases.
	            resample = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
	            waveform = resample(waveform)

	        speech = waveform.squeeze(0).numpy()

	        # stride_length_s is a tuple of the left and right stride length.
	        # With only 1 number, both sides get the same stride, by default
	        # the stride_length on one side is 1/6th of the chunk_length_s
	        # NOTE(review): chunk_length_s / stride_length_s look like options of
	        # the ASR pipeline rather than of the processor; confirm that they
	        # have any effect here.
	        # The feature extractor's keyword is `sampling_rate`; the former
	        # `sample_rate` spelling was not a recognised argument.
	        input_values = processor(
	            speech,
	            sampling_rate=target_sample_rate,
	            chunk_length_s=args.chunk_length_s,
	            stride_length_s=(args.stride_length_s_l, args.stride_length_s_r),
	            return_tensors="pt",
	        ).input_values
	        input_values = input_values.cuda()

	        with torch.no_grad():
	            logits = model(input_values).logits

	        prediction = processor.batch_decode(logits.cpu().numpy()).text
	        print(prediction[0])


	def build_arg_parser():
	    """Build the command-line parser for the transcription script.

	    Returns:
	        argparse.ArgumentParser: Parser exposing ``--path_files`` and
	        ``--model_id`` (both required) plus the optional ``--chunk_length_s``,
	        ``--stride_length_s_l``, ``--stride_length_s_r`` and ``--log_outputs``.
	    """
	    parser = argparse.ArgumentParser()

	    parser.add_argument(
	        "--path_files", type=str, required=True, help="WAV files to transcribe, separated by a comma"
	    )
	    parser.add_argument(
	        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
	    )
	    # NOTE(review): the help text mentions a 5 second default, but the parsed
	    # default is None.
	    parser.add_argument(
	        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
	    )
	    # Strides are durations in seconds, so accept fractional values just like
	    # --chunk_length_s does (whole numbers are still accepted).
	    parser.add_argument(
	        "--stride_length_s_l", type=float, default=None, help="Stride of the audio chunks, left value."
	    )
	    parser.add_argument(
	        "--stride_length_s_r", type=float, default=None, help="Stride of the audio chunks, right value."
	    )
	    parser.add_argument(
	        "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
	    )
	    return parser


	if __name__ == "__main__":
	    args = build_arg_parser().parse_args()

	    main(args)