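# Gradio demo: transcribe speech with HuBERT and overlay the transcription,
# word by word, on a video built from a user-supplied still image.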
import torch
from transformers import Wav2Vec2Processor, HubertForCTC
import soundfile as sf
import gradio as gr
from moviepy.editor import AudioFileClip, ImageClip, VideoFileClip, CompositeAudioClip, ImageSequenceClip
import cv2
def get_optimal_font_scale(text, width):
    # Find the largest cv2 font scale (in steps of 0.1, up to 5.9) at which
    # `text` still fits within `width` pixels.
    for scale in reversed(range(0, 60, 1)):
        textSize = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_DUPLEX, fontScale=scale / 10, thickness=1)
        new_width = textSize[0][0]
        if new_width <= width:
            return scale / 10
    return 1
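# Load the HuBERT-Large model (fine-tuned for CTC speech recognition on 960h
# of LibriSpeech) together with its processor.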
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
def map_to_array(file):
    # Load the audio file as a numpy array; the sampling rate is discarded.
    speech, _ = sf.read(file)
    return speech
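# End-to-end pipeline: transcribe the audio, render the still image as a video
# at one frame per word, caption each frame with its word, and remux the audio.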
def inference(audio, image):
    # Transcribe the uploaded audio with HuBERT (batch size 1). The model was
    # trained on 16 kHz speech, so 16 kHz input is assumed here.
    input_values = processor(map_to_array(audio.name), sampling_rate=16000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    # Turn the still image into a video that lasts as long as the audio, at a
    # frame rate of one frame per transcribed word.
    audio_clip = AudioFileClip(audio.name)
    image_clip = ImageClip(image.name).set_duration(audio_clip.duration)
    image_clip.write_videofile("my_video.mp4", fps=len(transcription.split()) / audio_clip.duration)
    videoclip = VideoFileClip("my_video.mp4")
    new_audioclip = CompositeAudioClip([audio_clip])
    videoclip = videoclip.set_audio(new_audioclip)
    videoclip.write_videofile("new_filename.mp4")
    # Map frame numbers to words: since the video was written at one frame per
    # word, frame k carries the k-th word of the transcription.
    frames = {k + 1: v.strip() for k, v in enumerate(transcription.split())}
    cap = cv2.VideoCapture('new_filename.mp4')
    w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    font = cv2.FONT_HERSHEY_SIMPLEX
    frame_list = []
    while cap.isOpened():
        ret, frame = cap.read()
        if ret:
            frame_no = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            if frame_no in frames:
                # Pick the largest font scale at which the word fits, then draw
                # it at mid-height with a small left margin.
                fontScale = get_optimal_font_scale(frames[frame_no], w - 20)
                cv2.putText(frame, frames[frame_no], (10, int(h) // 2), font,
                            fontScale, (0, 0, 0), 2, cv2.LINE_AA)
            # OpenCV decodes frames as BGR; convert to RGB so moviepy writes
            # the colors correctly.
            frame_list.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        else:
            break
    # Reassemble the captioned frames into a clip at the same one-frame-per-word
    # rate and reattach the original audio.
    output_clip = ImageSequenceClip(frame_list, fps=len(transcription.split()) / audio_clip.duration)
    output_clip = output_clip.set_audio(new_audioclip)
    output_clip.write_videofile("output6.mp4")
    cap.release()
    cv2.destroyAllWindows()
    return transcription, 'output6.mp4'
title = "Hubert"
description = "Gradio demo for hubert-large-ls960-ft: upload an audio clip and an image, and the demo returns the transcription plus a video of the image captioned word by word. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.07447'>HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units</a> | <a href='https://github.com/pytorch/fairseq/tree/main/examples/hubert'>Github Repo</a></p>"
gr.Interface(
    inference,
    [gr.inputs.Audio(type='file'), gr.inputs.Image(type="file", label="Input")],
    [gr.outputs.Textbox(label="Output"), gr.outputs.Video(label="Video Out")],
    title=title,
    description=description,
    article=article,
    enable_queue=True
).launch(debug=True)
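# launch(debug=True) blocks and surfaces errors in the console; Gradio serves
# the app at http://127.0.0.1:7860 by default.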