import torch
from transformers import Wav2Vec2Processor, HubertForCTC
import gradio as gr
from moviepy.editor import *
import cv2
import librosa


def get_optimal_font_scale(text, width):
    # Find the largest OpenCV font scale (in steps of 0.1, up to 5.9) at which
    # the rendered text still fits inside the given width.
    for scale in reversed(range(0, 60, 1)):
        textSize = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_DUPLEX, fontScale=scale / 10, thickness=1)
        new_width = textSize[0][0]
        if new_width <= width:
            return scale / 10
    return 1


processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-xlarge-ls960-ft")
model = HubertForCTC.from_pretrained("facebook/hubert-xlarge-ls960-ft")


def inference(audio, image):
    # Transcribe the audio with HuBERT (the model expects 16 kHz input).
    y, sr = librosa.load(audio.name, sr=16000)
    input_values = processor(y, sampling_rate=sr, return_tensors="pt").input_values  # Batch size 1
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    # Turn the still image into a silent video whose frame rate is one frame per
    # transcribed word, then mux the original audio back in.
    audio_clip = AudioFileClip(audio.name)
    image_clip = ImageClip(image.name).set_duration(audio_clip.duration)
    # Make the height 360 px; per the MoviePy documentation, the width is then
    # computed so that the width/height ratio is preserved.
    image_clip = image_clip.resize(height=360)
    image_clip.write_videofile("my_video.mp4", fps=len(transcription.split()) / audio_clip.duration)
    videoclip = VideoFileClip("my_video.mp4")
    new_audioclip = CompositeAudioClip([audio_clip])
    videoclip.audio = new_audioclip
    videoclip.write_videofile("new_filename.mp4")

    # Map frame number -> word, so each frame carries the next word of the transcription.
    frames = {k + 1: v.strip() for k, v in enumerate(transcription.split())}

    cap = cv2.VideoCapture("new_filename.mp4")
    w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    font = cv2.FONT_HERSHEY_SIMPLEX
    frame_list = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_no = cap.get(cv2.CAP_PROP_POS_FRAMES)
        if frame_no in frames:
            # Scale the caption so it fits the frame width with a small margin.
            fontScale = get_optimal_font_scale(frames[frame_no], w - 20)
            cv2.putText(frame, frames[frame_no], (10, int(h) // 2), font, fontScale, (0, 0, 0), 2, cv2.LINE_AA)
        # OpenCV decodes frames as BGR; MoviePy expects RGB, so convert before collecting.
        frame_list.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()

    # Rebuild the video from the captioned frames and reattach the audio.
    output_clip = ImageSequenceClip(frame_list, fps=len(transcription.split()) / audio_clip.duration)
    output_clip.audio = new_audioclip
    output_clip.write_videofile("output6.mp4")
    return transcription, "output6.mp4"


title = "Hubert-xlarge-ls960-ft"
description = (
    "Gradio demo for hubert-xlarge-ls960-ft. To use it, simply add your audio file and image, "
    "or click one of the examples to load them. Read more at the links below."
)
article = "HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units | Github Repo"

examples = [["sample.wav", "example.jpeg"]]

gr.Interface(
    inference,
    [gr.inputs.Audio(type="file"), gr.inputs.Image(type="file", label="Input")],
    [gr.outputs.Textbox(label="Output"), gr.outputs.Video(label="Video Out")],
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)