Spaces:

anshuln
/

peekaboo-demo

Running on A10G

peekaboo-demo / src /app_modelscope.py

Anshul Nasery

Demo commit

44f2ca8 5 months ago

No virus

12 kB

	from models.pipelines import TextToVideoSDPipelineSpatialAware
	import torch.nn.functional as F
	import torch
	import cv2
	import sys
	import gradio as gr
	import os
	import numpy as np
	from gradio_utils import *


	def image_mod(image):
	return image.rotate(45)


	sys.path.insert(1, os.path.join(sys.path[0], '..'))


	NUM_POINTS = 3
	NUM_FRAMES = 16
	LARGE_BOX_SIZE = 176


	def generate_video(pipe, overall_prompt, latents, get_latents=False, num_frames=24, num_inference_steps=50, fg_masks=None,
	fg_masked_latents=None, frozen_steps=0, frozen_prompt=None, custom_attention_mask=None, fg_prompt=None):

	video_frames = pipe(overall_prompt, num_frames=num_frames, latents=latents, num_inference_steps=num_inference_steps, frozen_mask=fg_masks,
	frozen_steps=frozen_steps, latents_all_input=fg_masked_latents, frozen_prompt=frozen_prompt, custom_attention_mask=custom_attention_mask, fg_prompt=fg_prompt,
	make_attention_mask_2d=True, attention_mask_block_diagonal=True, height=256, width=256).frames
	if get_latents:
	video_latents = pipe(overall_prompt, num_frames=num_frames, latents=latents,
	num_inference_steps=num_inference_steps, output_type="latent").frames
	return video_frames, video_latents

	return video_frames


	# def generate_bb(prompt, fg_object, aspect_ratio, size, trajectory):

	# if len(trajectory['layers']) < NUM_POINTS:
	# raise ValueError
	# final_canvas = torch.zeros((NUM_FRAMES,320,576))

	# bbox_size_x = LARGE_BOX_SIZE if size == "large" else int(LARGE_BOX_SIZE * 0.75) if size == "medium" else LARGE_BOX_SIZE//2
	# bbox_size_y = bbox_size_x if aspect_ratio == "square" else int(bbox_size_x * 0.75) if aspect_ratio == "horizontal" else int(bbox_size_x * 1.25)

	# bbox_coords = []
	# # TODO add checks for trajectory
	# for t in trajectory['layers']:
	# bbox_coords.append([int(t.sum(axis=-2).argmax()*576/800), int(t.sum(axis=-1)[140:460].argmax())])
	# bbox_coords = np.array(bbox_coords)
	# # Make a list of length 24
	# # Each element is a list of length 2
	# # First element is the x coordinate of the bbox
	# # Second element is a set of y coordinates of the bbox
	# new_bbox_coords = [np.zeros(2,) for i in range(NUM_FRAMES)]
	# divisor = int(NUM_FRAMES / (NUM_POINTS-1))
	# for i in range(NUM_POINTS-1):
	# new_bbox_coords[i*divisor] = bbox_coords[i]
	# new_bbox_coords[-1] = bbox_coords[-1]

	# # Linearly interpolate in the middle
	# for i in range(NUM_POINTS-1):
	# for j in range(1,divisor):
	# new_bbox_coords[idivisor+j][1] = int((bbox_coords[i][0] (divisor-j) + bbox_coords[(i+1)][0] * j) / divisor)
	# new_bbox_coords[idivisor+j][0] = int((bbox_coords[i][1] (divisor-j) + bbox_coords[(i+1)][1] * j) / divisor)

	# for i in range(NUM_FRAMES):
	# x = int(new_bbox_coords[i][0])
	# y = int(new_bbox_coords[i][1])
	# final_canvas[i,int(x-bbox_size_x/2):int(x+bbox_size_x/2), int(y-bbox_size_y/2):int(y+bbox_size_y/2)] = 1

	# torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	# try:
	# pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
	# "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)
	# except:
	# pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
	# "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)

	# fg_masks = F.interpolate(final_canvas.unsqueeze(1), size=(40,72), mode="nearest").to(torch_device)

	# # Save fg_masks as images
	# for i in range(NUM_FRAMES):
	# cv2.imwrite(f"./fg_masks/frame_{i:04d}.png", fg_masks[i,0].cpu().numpy()*255)


	# seed = 2
	# random_latents = torch.randn([1, 4, NUM_FRAMES, 40, 72], generator=torch.Generator().manual_seed(seed)).to(torch_device)
	# overall_prompt = f"A realistic lively {prompt}"
	# video_frames = generate_video(pipe, overall_prompt, random_latents, get_latents=False, num_frames=NUM_FRAMES, num_inference_steps=40,
	# fg_masks=fg_masks, fg_masked_latents=None, frozen_steps=2, frozen_prompt=None, fg_prompt=fg_object)

	# return create_video(video_frames,fps=8, type="final")


	def interpolate_points(points, target_length):
	print(points)
	if len(points) == target_length:
	return points
	elif len(points) > target_length:
	# Subsample the points uniformly
	indices = np.round(np.linspace(
	0, len(points) - 1, target_length)).astype(int)
	return [points[i] for i in indices]
	else:
	# Linearly interpolate to get more points
	interpolated_points = []
	num_points_to_add = target_length - len(points)
	points_added_per_segment = num_points_to_add // (len(points) - 1)

	for i in range(len(points) - 1):
	start, end = points[i], points[i + 1]
	interpolated_points.append(start)
	for j in range(1, points_added_per_segment + 1):
	fraction = j / (points_added_per_segment + 1)
	new_point = np.round(start + fraction * (end - start))
	interpolated_points.append(new_point)

	# Add the last point
	interpolated_points.append(points[-1])

	# If there are still not enough points, add extras at the end
	while len(interpolated_points) < target_length:
	interpolated_points.append(points[-1])

	return interpolated_points


	torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


	try:
	pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
	"damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float, variant="fp32").to(torch_device)
	except:
	pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
	"damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float, variant="fp32").to(torch_device)


	def generate_bb(prompt, fg_object, aspect_ratio, size, motion_direction, seed, peekaboo_steps, trajectory):

	if not set(fg_object.split()).issubset(set(prompt.split())):
	raise gr.Error("Foreground object should be present in the video prompt")
	# if len(trajectory['layers']) < NUM_POINTS:
	# raise ValueError
	final_canvas = torch.zeros((NUM_FRAMES, 256//8, 256//8))

	bbox_size_x = LARGE_BOX_SIZE if size == "large" else int(
	LARGE_BOX_SIZE * 0.75) if size == "medium" else LARGE_BOX_SIZE//2
	bbox_size_y = bbox_size_x if aspect_ratio == "square" else int(
	bbox_size_x * 1.33) if aspect_ratio == "horizontal" else int(bbox_size_x * 0.75)

	bbox_coords = []

	image = trajectory['composite']
	print(image.shape)

	image = cv2.resize(image, (256, 256))
	gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	_, thresh = cv2.threshold(gray, 30, 255, cv2.THRESH_BINARY_INV)
	contours, _ = cv2.findContours(
	thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

	# Process each contour
	bbox_points = []
	for contour in contours:
	# You can approximate the contour to reduce the number of points
	epsilon = 0.01 * cv2.arcLength(contour, True)
	approx = cv2.approxPolyDP(contour, epsilon, True)

	# Extracting and printing coordinates
	for point in approx:
	y, x = point.ravel()
	if x in range(1, 255) and y in range(1, 255):
	# bbox_points.append([min(max(x, 32), 256-32),min(max(y, 32), 256-32)])
	bbox_points.append([min(max(x, 0), 256), min(max(y, 0), 256)])

	if motion_direction in ['Left to Right', 'Right to Left']:
	sorted_points = sorted(
	bbox_points, key=lambda x: x[1], reverse=motion_direction == "Right to Left")
	else:
	sorted_points = sorted(
	bbox_points, key=lambda x: x[0], reverse=motion_direction == "Down to Up")
	target_length = NUM_FRAMES
	final_points = interpolate_points(np.array(sorted_points), target_length)

	# Remember to reverse the co-ordinates
	for i in range(NUM_FRAMES):
	x = int(final_points[i][0])
	y = int(final_points[i][1])
	# Added Padding
	final_canvas[i, max(int(x-bbox_size_x/2), 0) // 8:min(int(x+bbox_size_x/2), 256) // 8,
	max(int(y-bbox_size_y/2), 0) // 8:min(int(y+bbox_size_y/2), 256) // 8] = 1

	torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	fg_masks = final_canvas.unsqueeze(1).to(torch_device)
	# # Save fg_masks as images
	for i in range(NUM_FRAMES):
	cv2.imwrite(f"./fg_masks/frame_{i:04d}.png",
	fg_masks[i, 0].cpu().numpy()*255)

	seed = seed
	random_latents = torch.randn([1, 4, NUM_FRAMES, 32, 32], generator=torch.Generator(
	).manual_seed(seed)).to(torch_device)
	overall_prompt = f"{prompt} , high quality"
	video_frames = generate_video(pipe, overall_prompt, random_latents, get_latents=False, num_frames=NUM_FRAMES, num_inference_steps=40,
	fg_masks=fg_masks, fg_masked_latents=None, frozen_steps=int(peekaboo_steps), frozen_prompt=None, fg_prompt=fg_object)
	video_frames_original = generate_video(pipe, overall_prompt, random_latents, get_latents=False, num_frames=NUM_FRAMES, num_inference_steps=40,
	fg_masks=None, fg_masked_latents=None, frozen_steps=0, frozen_prompt=None, fg_prompt=None)

	return create_video(video_frames_original, fps=8, type="modelscope"), create_video(video_frames, fps=8, type="final")


	instructions_md = """
	## Usage Instructions
	- Video Prompt: Enter a brief description of the scene you want to generate.
	- Foreground Object: Specify the main object in the video.
	- Aspect Ratio: Choose the aspect ratio for the bounding box.
	- Size of the Bounding Box: Select how large the foreground object should be.
	- Trajectory of the Bounding Box: Draw the trajectory of the bounding box.
	- Motion Direction: Indicate the direction of movement for the object.
	- Geek Settings: Advanced settings for fine-tuning (optional).
	- Generate Video: Click the button to create your video.

	Feel free to experiment with different settings to see how they affect the output!
	"""

	with gr.Blocks() as demo:
	gr.Markdown("""
	# Peekaboo Demo
	""")
	with gr.Row():
	video_1 = gr.Video(label="Original Modelscope Video")
	video_2 = gr.Video(label="Peekaboo Video")


	with gr.Accordion(label="Usage Instructions", open=False):
	gr.Markdown(instructions_md)
	with gr.Group("User Input"):
	txt_1 = gr.Textbox(lines=1, label="Video Prompt", value="Darth Vader surfing on some waves")
	txt_2 = gr.Textbox(lines=1, label="Foreground Object in the Video Prompt", value="Darth Vader")
	aspect_ratio = gr.Radio(choices=["square", "horizontal", "vertical"], label="Aspect Ratio", value="vertical")
	trajectory = gr.Paint(value={'background': np.zeros((256, 256)), 'layers': [], 'composite': np.zeros((256, 256))}, type="numpy", image_mode="RGB", height=256, width=256, label="Trajectory of the Bounding Box")
	size = gr.Radio(choices=["small", "medium", "large"], label="Size of the Bounding Box", value="medium")
	motion_direction = gr.Radio(choices=["Left to Right", "Right to Left", "Up to Down", "Down to Up"], label="Motion Direction", value="Left to Right")

	with gr.Accordion(label="Geek settings", open=False):
	with gr.Group():
	seed = gr.Slider(0, 10, step=1., value=2, label="Seed")
	peekaboo_steps = gr.Slider(0, 20, step=1., value=2, label="Number of Peekaboo Steps")


	btn = gr.Button(value="Generate Video")

	btn.click(generate_bb, inputs=[txt_1, txt_2, aspect_ratio, size, motion_direction, seed, peekaboo_steps, trajectory], outputs=[video_1, video_2])




	if __name__ == "__main__":
	demo.launch(share=True)