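# Gradio demo: transcribe speech with HuBERT (facebook/hubert-large-ls960-ft),
# then render the transcript word by word over a user-supplied still image,
# producing a captioned video that keeps the original audio track.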
import torch
from transformers import Wav2Vec2Processor, HubertForCTC
import soundfile as sf
import gradio as gr
from moviepy.editor import AudioFileClip, ImageClip, VideoFileClip, CompositeAudioClip, ImageSequenceClip
import cv2
def get_optimal_font_scale(text, width):
    # Largest fontScale (in 0.1 steps, from 5.9 down) at which `text` still
    # fits inside `width` pixels when rendered with FONT_HERSHEY_DUPLEX.
    for scale in reversed(range(0, 60, 1)):
        text_size = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_DUPLEX, fontScale=scale / 10, thickness=1)
        new_width = text_size[0][0]
        if new_width <= width:
            return scale / 10
    return 1
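# HuBERT-Large fine-tuned for CTC-based speech recognition on LibriSpeech 960h;
# the processor bundles the feature extractor and the tokenizer used for decoding.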
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
def map_to_array(file):
    # soundfile returns (samples, sample_rate); the model expects 16 kHz mono audio.
    speech, _ = sf.read(file)
    return speech
def inference(audio, image):
    # 1. Transcribe the uploaded audio with greedy CTC decoding.
    input_values = processor(map_to_array(audio.name), sampling_rate=16000, return_tensors="pt").input_values  # Batch size 1
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    # 2. Build a video from the still image whose frame rate is one frame per
    #    transcribed word, then attach the original audio track.
    audio_clip = AudioFileClip(audio.name)
    image_clip = ImageClip(image.name).set_duration(audio_clip.duration)
    image_clip.write_videofile("my_video.mp4", fps=len(transcription.split()) / audio_clip.duration)
    videoclip = VideoFileClip("my_video.mp4")
    new_audioclip = CompositeAudioClip([audio_clip])
    videoclip.audio = new_audioclip
    videoclip.write_videofile("new_filename.mp4")
    # 3. Map 1-based frame numbers to words: frame n carries word n.
    frames = {k + 1: v.strip() for k, v in enumerate(transcription.split())}
    cap = cv2.VideoCapture('new_filename.mp4')
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    font = cv2.FONT_HERSHEY_SIMPLEX
    frame_list = []
    # 4. Burn each word onto its frame, scaled to fit the frame width.
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_no = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
        if frame_no in frames:
            fontScale = get_optimal_font_scale(frames[frame_no], w - 20)
            cv2.putText(frame, frames[frame_no], (10, h // 2), font,
                        fontScale,
                        (0, 0, 0), 2, cv2.LINE_AA)
        # OpenCV decodes frames as BGR; moviepy expects RGB.
        frame_list.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # 5. Reassemble the captioned frames into the final video, with audio.
    output_clip = ImageSequenceClip(frame_list, fps=len(transcription.split()) / audio_clip.duration)
    output_clip.audio = new_audioclip
    output_clip.write_videofile("output6.mp4")
    cap.release()
    cv2.destroyAllWindows()
    return transcription, 'output6.mp4'
title = "Hubert"
description = "Gradio demo for hubert-large-ls960-ft: upload an audio clip to transcribe and an image to caption with the transcript. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.07447'>HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units</a> | <a href='https://github.com/pytorch/fairseq/tree/main/examples/hubert'>Github Repo</a></p>"
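# Older gradio API: `type='file'` inputs arrive as tempfile wrappers that
# expose the uploaded file's path via `.name`, which is why `inference`
# reads `audio.name` and `image.name` above.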
gr.Interface(
    inference,
    [gr.inputs.Audio(type='file'), gr.inputs.Image(type="file", label="Input")],
    [gr.outputs.Textbox(label="Output"), gr.outputs.Video(label="Video Out")],
    title=title,
    description=description,
    article=article,
    enable_queue=True
).launch(debug=True)
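
# Local smoke test without the web UI (hypothetical file paths; the wrapper
# below mimics the `.name` attribute that gradio's file inputs provide):
#   class _Upload:
#       def __init__(self, name):
#           self.name = name
#   text, video = inference(_Upload("speech.wav"), _Upload("background.png"))
#   print(text, video)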