import argparse
import torch
import torchaudio
from pathlib import Path
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
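
# Illustrative invocation (the script filename and model id below are
# placeholders; the flags correspond to the argparse options defined at the
# bottom of this file):
#
#   python transcribe_with_lm.py \
#       --path_files sample1.wav,sample2.wav \
#       --model_id <wav2vec2-checkpoint-with-lm> \
#       --chunk_length_s 10 --stride_length_s_l 4 --stride_length_s_r 2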


def main(args):
    processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
    model = Wav2Vec2ForCTC.from_pretrained(args.model_id)

    # Run on a GPU when available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    files = args.path_files.split(',')

    for path_file in files:
        print('File:', path_file)

        wav_file_path = str(Path(path_file).absolute())
        waveform, sample_rate = torchaudio.load(wav_file_path)

        # Wav2Vec2 expects 16 kHz mono input; resample when the file uses a
        # different sampling rate.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            speech = resampler(waveform).squeeze().numpy()
        else:
            speech = waveform.squeeze().numpy()

        # stride_length_s is a tuple of (left, right) stride lengths in seconds;
        # with a single number both sides get the same stride, and by default
        # the stride on each side is 1/6th of chunk_length_s.
        input_values = processor(speech,
                                 sampling_rate=16000,
                                 chunk_length_s=args.chunk_length_s,
                                 stride_length_s=(args.stride_length_s_l, args.stride_length_s_r),
                                 return_tensors="pt").input_values
        input_values = input_values.to(device)

        with torch.no_grad():
            logits = model(input_values).logits

        # batch_decode runs pyctcdecode's beam search with the bundled language
        # model over the CTC logits and returns the best transcription per item.
        prediction = processor.batch_decode(logits.cpu().numpy()).text
        print(prediction[0])


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--path_files", type=str, required=True, help="Comma-separated list of WAV files to transcribe."
    )
    parser.add_argument(
        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
    )
    parser.add_argument(
        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds for chunked inference."
    )
    parser.add_argument(
        "--stride_length_s_l", type=float, default=None, help="Left stride of the audio chunks, in seconds."
    )
    parser.add_argument(
        "--stride_length_s_r", type=float, default=None, help="Right stride of the audio chunks, in seconds."
    )
    parser.add_argument(
        "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
    )
    args = parser.parse_args()

    main(args)