"""Transcribe WAV files with a wav2vec2 CTC model and print word-level timestamps.

Usage:
    python transcribe.py --path_files a.wav,b.wav --model_id facebook/wav2vec2-base-960h
"""

import argparse
from pathlib import Path
from time import gmtime, strftime  # NOTE(review): unused in this script — kept to avoid breaking any external import of this module

import torch
import torchaudio
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

# wav2vec2 models are trained on 16 kHz audio; every input is resampled to this rate.
TARGET_SAMPLE_RATE = 16000


def _load_audio(path_file):
    """Load a WAV file and return it as a 1-D numpy array at TARGET_SAMPLE_RATE.

    Args:
        path_file: Path to a WAV file readable by torchaudio.

    Returns:
        numpy.ndarray: mono waveform (channel dim squeezed) at 16 kHz.
    """
    wav_file_path = str(Path(path_file).absolute())
    waveform, sample_rate = torchaudio.load(wav_file_path)
    if sample_rate != TARGET_SAMPLE_RATE:
        # NOTE(review): 'sinc_interpolation' is the legacy name of this method
        # (newer torchaudio spells it 'sinc_interp_hann'); kept as-is so the
        # script keeps working with the torchaudio version it was written for.
        resample = torchaudio.transforms.Resample(
            sample_rate,
            TARGET_SAMPLE_RATE,
            resampling_method='sinc_interpolation',
        )
        waveform = resample(waveform)
    return waveform.squeeze().numpy()


def _print_word_offsets(prediction, time_offset):
    """Print one 'start - end: word' line per decoded word.

    Args:
        prediction: tokenizer.decode(...) output with `word_offsets` populated
            (offsets are logit-frame indices).
        time_offset: seconds of audio per logit frame.
    """
    for item in prediction.word_offsets:
        start = round(item['start_offset'] * time_offset, 2)
        end = round(item['end_offset'] * time_offset, 2)
        print(f"{start} - {end}: {item['word']}")


def main(args):
    """Transcribe each comma-separated file in args.path_files on CPU.

    Loads the model/tokenizer/processor once, then for every file: loads and
    resamples the audio, runs greedy CTC decoding, and prints word timestamps.
    """
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(args.model_id)
    processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
    model = Wav2Vec2ForCTC.from_pretrained(args.model_id)
    model.to('cpu')
    model.eval()  # inference only: disable dropout

    # One logit frame covers `inputs_to_logits_ratio` input samples
    # (320 for standard wav2vec2 — the product of the conv-encoder strides),
    # so each frame index corresponds to this many seconds of audio.
    time_offset = getattr(model.config, "inputs_to_logits_ratio", 320) / TARGET_SAMPLE_RATE

    for path_file in args.path_files.split(','):
        print('File:', path_file)
        speech = _load_audio(path_file)

        # stride_length_s is a tuple of the left and right stride length.
        # With only 1 number, both sides get the same stride; by default
        # the stride_length on one side is 1/6th of the chunk_length_s.
        # NOTE(review): chunk_length_s/stride_length_s are ASR *pipeline*
        # kwargs; the processor likely ignores them, so no chunking actually
        # happens here — confirm, or switch to transformers' ASR pipeline.
        input_values = processor(
            speech,
            # Fixed: the feature extractor's keyword is `sampling_rate`;
            # the original `sample_rate=` was silently swallowed by **kwargs.
            sampling_rate=TARGET_SAMPLE_RATE,
            chunk_length_s=args.chunk_length_s,
            stride_length_s=(args.stride_length_s_l, args.stride_length_s_r),
            return_tensors="pt",
        ).input_values

        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy CTC: most likely token id per frame, then decode with offsets.
        pred_ids = torch.argmax(logits, dim=-1).cpu().tolist()
        prediction = tokenizer.decode(pred_ids[0], output_word_offsets=True)
        print(f'Sample rate: {TARGET_SAMPLE_RATE}')
        _print_word_offsets(prediction, time_offset)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--path_files", type=str, required=True,
        help="WAV files to transcribe, separated by a comma",
    )
    parser.add_argument(
        "--model_id", type=str, required=True,
        help="Model identifier. Should be loadable with 🤗 Transformers",
    )
    parser.add_argument(
        "--chunk_length_s", type=float, default=None,
        # Fixed: the old help text claimed a 5-second default, but default is None.
        help="Chunk length in seconds.",
    )
    parser.add_argument(
        "--stride_length_s_l", type=int, default=None,
        help="Stride of the audio chunks, left value.",
    )
    parser.add_argument(
        "--stride_length_s_r", type=int, default=None,
        help="Stride of the audio chunks, right value.",
    )
    parser.add_argument(
        "--log_outputs", action="store_true",
        help="If defined, write outputs to log file for analysis.",
    )
    args = parser.parse_args()
    main(args)