import os, sys, re
import shutil
import argparse
import subprocess
import soundfile
from process_audio import segment_audio
from write_srt import write_to_file
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import torch
import gradio as gr

# Pretrained checkpoint used for both the tokenizer and the CTC model.
model = "facebook/wav2vec2-large-960h-lv60-self"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model)
asr_model = Wav2Vec2ForCTC.from_pretrained(model)#.to('cuda')
vocab_dict = tokenizer.get_vocab()
sort_vocab = sorted((value, key) for (key,value) in vocab_dict.items())
vocab = ([x[1].replace("|", " ") if x[1] not in tokenizer.all_special_tokens else "_" for x in sort_vocab])

# Line count for SRT file (index of the last subtitle entry written).
line_count = 0

# Text clean-up patterns, compiled once because they run for every segment.
_MULTI_SPACE = re.compile(r' {2,}')
# A stand-alone "i" (including the "i" of "i'm", "i'll", ...), but not an
# "i" that merely ends a longer word such as "hawaii's".
_PRONOUN_I = re.compile(r'\bi\b')


def sort_alphanumeric(data):
    """Return the strings in *data* sorted in natural (human) order.

    Runs of ASCII digits are compared as integers and everything else is
    compared case-insensitively, so "seg_2" sorts before "seg_10".
    """
    def alphanum_key(key):
        # re.split with a capture group alternates text / digit chunks.
        return [int(chunk) if chunk.isdigit() else chunk.lower()
                for chunk in re.split('([0-9]+)', key)]

    return sorted(data, key=alphanum_key)


def transcribe_audio(tokenizer, asr_model, audio_file, file_handle):
    """Transcribe one audio segment and append it to the SRT file.

    Args:
        tokenizer: Wav2Vec2 tokenizer used to encode audio and decode ids.
        asr_model: Wav2Vec2ForCTC model producing the logits.
        audio_file: Path of the segment. The part of the file name after
            the last "_" (extension removed) is split on "-" and passed to
            ``write_to_file`` as the segment limits.
        file_handle: Open SRT file handed to ``write_to_file``.

    Increments the module-level ``line_count`` for every entry written.
    Transcripts of one character or less are skipped.
    """
    # Run Wav2Vec2.0 inference on each audio file generated after VAD segmentation.
    global line_count

    speech, _rate = soundfile.read(audio_file)
    input_values = tokenizer(speech, sampling_rate=16000, return_tensors="pt", padding='longest').input_values

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = asr_model(input_values).logits
    prediction = torch.argmax(logits, dim=-1)

    infered_text = tokenizer.batch_decode(prediction)[0].lower()
    infered_text = _MULTI_SPACE.sub(' ', infered_text)
    infered_text = _PRONOUN_I.sub('I', infered_text)

    segment_name = os.path.splitext(os.path.basename(audio_file))[0]
    limits = segment_name.split("_")[-1].split("-")

    if len(infered_text) > 1:
        line_count += 1
        write_to_file(file_handle, infered_text, line_count, limits)


def get_subs(input_file):
    """Generate an SRT subtitle file for a video and return its path.

    The audio track is extracted with ffmpeg (mono, 16 kHz WAV) into a
    temporary ``audio`` directory under the current working directory,
    split by ``segment_audio``, and each resulting segment is transcribed
    in natural file-name order. The temporary directory is removed
    afterwards, even when a step fails.

    Args:
        input_file: Path of the uploaded video file.

    Returns:
        Path of the written file, ``<cwd>/srt/<video name>.srt``. An
        existing file of that name is overwritten.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    global line_count
    # Each subtitle file is numbered from 1, independent of earlier calls.
    line_count = 0

    # Get directory for audio
    base_directory = os.getcwd()
    audio_directory = os.path.join(base_directory, "audio")
    if os.path.isdir(audio_directory):
        shutil.rmtree(audio_directory)
    os.mkdir(audio_directory)

    srt_directory = os.path.join(base_directory, "srt")
    os.makedirs(srt_directory, exist_ok=True)
    video_name = os.path.splitext(os.path.basename(input_file))[0]
    srt_file_name = os.path.join(srt_directory, video_name + ".srt")

    try:
        # Extract audio from video file
        audio_file = os.path.join(audio_directory, "temp.wav")
        command = ["ffmpeg", "-i", input_file, "-ac", "1", "-ar", "16000","-vn", "-f", "wav", audio_file]
        subprocess.run(command, check=True)

        # Split audio file based on VAD silent segments
        segment_audio(audio_file)
        os.remove(audio_file)

        # Output SRT file. "w+" starts from an empty file while keeping the
        # handle readable as well as writable.
        temp_name = os.path.basename(audio_file)
        with open(srt_file_name, "w+") as file_handle:
            for file in sort_alphanumeric(os.listdir(audio_directory)):
                if file == temp_name:
                    continue
                audio_segment_path = os.path.join(audio_directory, file)
                transcribe_audio(tokenizer, asr_model, audio_segment_path, file_handle)
    finally:
        shutil.rmtree(audio_directory, ignore_errors=True)

    return srt_file_name


gradio_ui = gr.Interface(
    fn=get_subs,
    title="Autoblog - Video to Subtitle",
    inputs=gr.inputs.Video(label="Upload Video File"),
    outputs=gr.outputs.File(label="Auto-Transcript")
)

gradio_ui.launch(inline=False)