"""Gradio API service exposing the zeroscope v2 XL video-to-video pipeline.

The service is protected by a shared secret token and is meant to be
called programmatically through the ``zero_xl`` API endpoint.
"""

import hmac
import os

import gradio as gr
import numpy as np
from PIL import Image
import cv2
from moviepy.editor import VideoFileClip
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video

SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')

DESCRIPTION = 'This space is an API service meant to be used by VideoChain and VideoQuest.\nWant to use this space for yourself? Please use the original code: [https://huggingface.co/spaces/fffiloni/zeroscope-XL](https://huggingface.co/spaces/fffiloni/zeroscope-XL)'

# Note: we cannot go beyond 3 seconds on the large A10G, so the input is
# truncated to a fixed maximum duration (no duration field is exposed).
MAX_DURATION_SECONDS = 3

# Resolution (width, height) every input frame is resized to before inference.
TARGET_SIZE = (1024, 576)

# File the generated video is written to and served from.
OUTPUT_VIDEO_PATH = "xl_result.mp4"

NEGATIVE_PROMPT = "text, watermark, copyright, blurry, cropped, noisy, pixelated, nsfw"

pipe_xl = DiffusionPipeline.from_pretrained(
    "cerspense/zeroscope_v2_XL",
    torch_dtype=torch.float16,
    revision="refs/pr/17",
)
pipe_xl.vae.enable_slicing()
pipe_xl.scheduler = DPMSolverMultistepScheduler.from_config(pipe_xl.scheduler.config)
pipe_xl.enable_model_cpu_offload()
# NOTE(review): moving the pipeline to CUDA after enabling model CPU offload
# looks contradictory (the diffusers docs appear to advise against it).
# Kept as-is to preserve the deployed behavior -- confirm before removing.
pipe_xl.to("cuda")


def convert_mp4_to_frames(video_path, max_seconds=MAX_DURATION_SECONDS):
    """Decode the beginning of a video file into an array of RGB frames.

    Args:
        video_path: Path of the video file to read.
        max_seconds: Maximum duration to decode, in seconds. The number of
            frames kept is ``int(fps * max_seconds)``.

    Returns:
        A numpy array built from the list of decoded frames, each converted
        from OpenCV's BGR order to RGB. The array is empty when no frame
        could be read.
    """
    video = cv2.VideoCapture(video_path)
    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(fps * max_seconds)

        frames = []
        # Check the limit before reading so no frame is decoded needlessly.
        while len(frames) < num_frames:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always release the capture handle, even if decoding fails.
        video.release()

    return np.array(frames)


def _is_valid_token(candidate):
    """Return True when *candidate* matches SECRET_TOKEN (constant-time compare)."""
    if not isinstance(candidate, str):
        return False
    # Compare as bytes: compare_digest rejects non-ASCII str arguments.
    return hmac.compare_digest(
        candidate.encode("utf-8"), SECRET_TOKEN.encode("utf-8")
    )


def infer(prompt, video_in, denoise_strength, secret_token):
    """Run the video-to-video pipeline on an uploaded clip.

    Args:
        prompt: Text prompt guiding the generation.
        video_in: Path of the input video file.
        denoise_strength: Value passed to the pipeline as ``strength``.
        secret_token: Token supplied by the caller; must match SECRET_TOKEN.

    Returns:
        Path of the generated video file.

    Raises:
        gr.Error: If the token is invalid, or if no input video was given
            or no frame could be decoded from it.
    """
    if not _is_valid_token(secret_token):
        raise gr.Error(
            'Invalid secret token. Please fork the original space if you want to use it for yourself.')

    if not video_in:
        raise gr.Error('Please provide an input video.')

    frames = convert_mp4_to_frames(video_in)
    if len(frames) == 0:
        # Fail early with a readable message instead of deep inside the pipeline.
        raise gr.Error('Could not read any frames from the input video.')

    video_resized = [Image.fromarray(frame).resize(TARGET_SIZE) for frame in frames]
    video_frames = pipe_xl(
        prompt,
        negative_prompt=NEGATIVE_PROMPT,
        video=video_resized,
        strength=denoise_strength,
    ).frames

    # The click handler declares a single output component, so return
    # exactly one value: the path of the exported video.
    return export_to_video(video_frames, output_video_path=OUTPUT_VIDEO_PATH)


with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Column():
        secret_token = gr.Text(label='Secret Token', max_lines=1)
        video_in = gr.Video(type="numpy", source="upload")
        prompt_in = gr.Textbox(label="Prompt", elem_id="prompt-in")
        denoise_strength = gr.Slider(label="Denoise strength", minimum=0.6, maximum=0.9, step=0.01, value=0.66)
        submit_btn = gr.Button("Submit")
        video_result = gr.Video(label="Video Output", elem_id="video-output")

    submit_btn.click(
        fn=infer,
        inputs=[prompt_in, video_in, denoise_strength, secret_token],
        outputs=[video_result],
        api_name="zero_xl",
    )

demo.queue(max_size=6).launch()