from __future__ import annotations

import os
import random
import tempfile

import gradio as gr
import imageio
import numpy as np
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# DESCRIPTION is rendered by gr.Markdown() below but was never defined in this
# file; the string used here is only a placeholder title, not the original text.
DESCRIPTION = '# Text-to-video synthesis'

MAX_NUM_FRAMES = int(os.getenv('MAX_NUM_FRAMES', '200'))
DEFAULT_NUM_FRAMES = min(MAX_NUM_FRAMES,
                         int(os.getenv('DEFAULT_NUM_FRAMES', '16')))
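# MAX_NUM_FRAMES caps the 'Number of frames' slider below and DEFAULT_NUM_FRAMES
# seeds its initial value; both can be overridden via environment variables of
# the same names.
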
pipe = DiffusionPipeline.from_pretrained('damo-vilab/text-to-video-ms-1.7b',
                                         torch_dtype=torch.float16,
                                         variant='fp16')
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
pipe.enable_vae_slicing()
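# Model CPU offload and VAE slicing trade some speed for a lower peak GPU
# memory footprint; both calls are optional on GPUs with plenty of VRAM.

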
def to_video(frames: list[np.ndarray], fps: int) -> str:
    out_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    writer = imageio.get_writer(out_file.name, format='FFMPEG', fps=fps)
    for frame in frames:
        writer.append_data(frame)
    writer.close()
    return out_file.name
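# The temporary file is created with delete=False so it survives this function
# returning and can be served by Gradio; it is not cleaned up automatically.

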
def generate(prompt: str, seed: int, num_frames: int,
             num_inference_steps: int) -> str:
    if seed == -1:
        seed = random.randint(0, 1000000)
    generator = torch.Generator().manual_seed(seed)
    frames = pipe(prompt,
                  num_inference_steps=num_inference_steps,
                  num_frames=num_frames,
                  generator=generator).frames
    return to_video(frames, 8)
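# Note: this assumes `.frames` is a flat list of uint8 RGB arrays, as in the
# diffusers release this demo targets; newer diffusers versions add a batch
# dimension, so the pipeline's return format may need checking. The video is
# always written at 8 fps.

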
examples = [
    ['An astronaut riding a horse.', 0, 16, 25],
    ['A panda eating bamboo on a rock.', 0, 16, 25],
    ['Spiderman is surfing.', 0, 16, 25],
]
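# Each example row supplies (prompt, seed, num_frames, num_inference_steps),
# matching the inputs wired up to generate() below.
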
with gr.Blocks(css='style.css') as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        with gr.Box():
            with gr.Row(elem_id='prompt-container').style(equal_height=True):
                prompt = gr.Text(
                    label='Prompt',
                    show_label=False,
                    max_lines=1,
                    placeholder='Enter your prompt',
                    elem_id='prompt-text-input').style(container=False)
                run_button = gr.Button('Generate video').style(
                    full_width=False)
        result = gr.Video(label='Result', show_label=False, elem_id='gallery')
    with gr.Accordion('Advanced options', open=False):
        seed = gr.Slider(
            label='Seed',
            minimum=-1,
            maximum=1000000,
            step=1,
            value=-1,
            info='If set to -1, a different seed will be used each time.')
        num_frames = gr.Slider(
            label='Number of frames',
            minimum=16,
            maximum=MAX_NUM_FRAMES,
            step=1,
            value=DEFAULT_NUM_FRAMES,
            info=
            'Note that the content of the video also changes when you change the number of frames.'
        )
        num_inference_steps = gr.Slider(label='Number of inference steps',
                                        minimum=10,
                                        maximum=50,
                                        step=1,
                                        value=25)

    inputs = [
        prompt,
        seed,
        num_frames,
        num_inference_steps,
    ]
    gr.Examples(examples=examples,
                inputs=inputs,
                outputs=result,
                fn=generate,
                cache_examples=os.getenv('SYSTEM') == 'spaces')
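    # Example outputs are pre-computed and cached only when the SYSTEM
    # environment variable is set to 'spaces' (as it is on Hugging Face Spaces).
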
    prompt.submit(fn=generate, inputs=inputs, outputs=result)
    run_button.click(fn=generate, inputs=inputs, outputs=result)

demo.queue(api_open=False, max_size=15).launch()
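# api_open=False prevents requests from bypassing the queue via the REST API;
# max_size caps how many requests may wait in the queue at any one time.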