Spaces:
Running
Running
File size: 2,548 Bytes
8cd0fcd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import argparse
import torch
import torchaudio
from pathlib import Path
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
def main(args):
processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
model = Wav2Vec2ForCTC.from_pretrained(args.model_id)
model.to('cuda')
files = args.path_files.split(',')
for path_file in files:
print('File:', path_file)
wav_file_path = str(Path(path_file).absolute())
waveform, sample_rate = torchaudio.load(wav_file_path)
if sample_rate != 16000:
resample = torchaudio.transforms.Resample(
sample_rate, 16000, resampling_method='sinc_interpolation')
speech_array = resample(waveform)
sp = speech_array.squeeze().numpy()
else:
sp = waveform.squeeze().numpy()
# stride_length_s is a tuple of the left and right stride length.
# With only 1 number, both sides get the same stride, by default
# the stride_length on one side is 1/6th of the chunk_length_s
input_values = processor(sp,
sample_rate=16000,
chunk_length_s=args.chunk_length_s,
stride_length_s=(args.stride_length_s_l, args.stride_length_s_r),
return_tensors="pt").input_values
input_values = input_values.cuda()
with torch.no_grad():
logits = model(input_values).logits
prediction = processor.batch_decode(logits.cpu().numpy()).text
print(prediction[0])
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--path_files", type=str, required=True, help="WAV files to transcribe, separated by a comma"
)
parser.add_argument(
"--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
)
parser.add_argument(
"--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
)
parser.add_argument(
"--stride_length_s_l", type=int, default=None, help="Stride of the audio chunks, left value."
)
parser.add_argument(
"--stride_length_s_r", type=int, default=None, help="Stride of the audio chunks, right value."
)
parser.add_argument(
"--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
)
args = parser.parse_args()
main(args)
|