"""Gradio demo that applies Instruct X-Decoder to an uploaded video, frame by frame."""

import gradio as gr
import os
import cv2
import numpy as np
from moviepy.editor import *
#from share_btn import community_icon_html, loading_icon_html, share_js

# Must run before the xdecoder imports below, so the detectron2 fork is
# installed by the time they are evaluated.
os.system("python -m pip install git+https://github.com/MaureenZOU/detectron2-xyz.git")

import torch
import argparse

from xdecoder.BaseModel import BaseModel
from xdecoder import build_model
from utils.distributed import init_distributed
from utils.arguments import load_opt_from_config_files
from tasks import *


def parse_option():
    """Parse the command line and return the namespace holding ``conf_files``."""
    parser = argparse.ArgumentParser('X-Decoder All-in-One Demo', add_help=False)
    parser.add_argument(
        '--conf_files',
        default="configs/xdecoder/svlp_focalt_lang.yaml",
        metavar="FILE",
        help='path to config file',
    )
    args = parser.parse_args()
    return args


# build args
args = parse_option()
opt = load_opt_from_config_files(args.conf_files)
opt = init_distributed(opt)

# META DATA
pretrained_pth_last = "xdecoder_focalt_last.pt"
pretrained_pth_novg = "xdecoder_focalt_last_novg.pt"

# Fetch each checkpoint into the working directory unless it is already there.
for _ckpt_path, _ckpt_url in (
    (pretrained_pth_last, "https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last.pt"),
    (pretrained_pth_novg, "https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last_novg.pt"),
):
    if not os.path.exists(_ckpt_path):
        os.system("wget {}".format(_ckpt_url))

# build model
model_last = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_last).eval().cuda()

with torch.no_grad():
    model_last.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)


# inference model
@torch.no_grad()
def xdecoder(image, instruction, *args, **kwargs):
    """Run ``referring_inpainting_gpt3`` on one image under CUDA fp16 autocast.

    Args:
        image: object exposing ``.convert("RGB")``
            (presumably a PIL image -- TODO confirm against callers).
        instruction: instruction forwarded unchanged to the task function.
        *args, **kwargs: forwarded unchanged to the task function.

    Returns:
        Whatever ``referring_inpainting_gpt3`` returns.
    """
    image = image.convert("RGB")
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        return referring_inpainting_gpt3(model_last, image, instruction, *args, **kwargs)

#xdecoder = gr.Interface.load(name="spaces/xdecoder/Instruct-X-Decoder")


def get_frames(video_in):
    """Resize a video to 512 px height and dump every frame to a JPEG file.

    The resized copy is written to ``video_resized.mp4`` (frame rate capped at
    30) and its frames are written to ``kang<i>.jpg`` in the working directory.

    Args:
        video_in: path of the source video.

    Returns:
        Tuple ``(frames, fps)``: the list of frame file names in order, and the
        frame rate OpenCV reports for the resized video.
    """
    frames = []

    # resize the video
    clip = VideoFileClip(video_in)
    try:
        # check fps
        if clip.fps > 30:
            print("vide rate is over 30, resetting to 30")
            target_fps = 30
        else:
            print("video rate is OK")
            target_fps = clip.fps
        clip_resized = clip.resize(height=512)
        clip_resized.write_videofile("video_resized.mp4", fps=target_fps)
    finally:
        # Release the reader held by the clip even if writing fails.
        clip.close()

    print("video resized to 512 height")

    # Opens the Video file with CV2
    cap = cv2.VideoCapture("video_resized.mp4")
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        print("video fps: " + str(fps))
        i = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_path = 'kang' + str(i) + '.jpg'
            cv2.imwrite(frame_path, frame)
            frames.append(frame_path)
            i += 1
    finally:
        cap.release()

    print("broke the video into frames")
    return frames, fps


def create_video(frames, fps):
    """Assemble ``frames`` into ``movie.mp4`` at ``fps`` and return that path."""
    print("building video result")
    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile("movie.mp4", fps=fps)

    return 'movie.mp4'


def infer(prompt, video_in, trim_value):
    """Process the first ``trim_value`` seconds of a video with ``xdecoder``.

    Args:
        prompt: instruction applied to every frame.
        video_in: path of the uploaded video.
        trim_value: number of seconds of video to process.

    Returns:
        Path of the rendered result video.

    Raises:
        ValueError: if no video was supplied or no frame could be read from it.
    """
    print(prompt)
    if video_in is None:
        raise ValueError("no input video was provided")

    frames_list, fps = get_frames(video_in)
    if not frames_list:
        raise ValueError("no frames could be read from the input video")

    n_frame = int(trim_value * fps)
    if n_frame >= len(frames_list):
        print("video is shorter than the cut value")
        n_frame = len(frames_list)

    result_frames = []
    print("set stop frames to: " + str(n_frame))

    for i in frames_list[:n_frame]:
        # NOTE(review): `i` is a frame file path (str), but xdecoder() calls
        # image.convert("RGB") on its first argument, which a str does not
        # provide. The frame presumably has to be opened as a PIL image first,
        # and the result converted to something ImageSequenceClip accepts --
        # confirm against referring_inpainting_gpt3's return type.
        xdecoder_img = xdecoder(i, prompt)
        result_frames.append(xdecoder_img)
        print("frame " + i + "/" + str(n_frame) + ": done;")

    print(result_frames)
    final_vid = create_video(result_frames, fps)
    print("finished !")

    return final_vid


title = """

Instruct X-Decoder Video

Apply Instruct X-Decoder Diffusion to a video

"""

article = """

You may also like:

"""

with gr.Blocks(css='style.css') as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        with gr.Row():
            with gr.Column():
                video_inp = gr.Video(label="Video source", source="upload", type="filepath", elem_id="input-vid")
                prompt = gr.Textbox(label="Prompt", placeholder="enter prompt", show_label=False, elem_id="prompt-in")
                with gr.Row():
                    # `minimum` was misspelled, so the lower bound of 1 was never applied.
                    trim_in = gr.Slider(label="Cut video at (s)", minimum=1, maximum=3, step=1, value=1)
            with gr.Column():
                video_out = gr.Video(label="Pix2pix video result", elem_id="video-output")
                gr.HTML(""" Duplicate Space work with longer videos / skip the queue: """, elem_id="duplicate-container")
                submit_btn = gr.Button("Generate X-Decoder video")

                #with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
                #    community_icon = gr.HTML(community_icon_html)
                #    loading_icon = gr.HTML(loading_icon_html)
                #    share_button = gr.Button("Share to community", elem_id="share-btn")

        inputs = [prompt, video_inp, trim_in]
        #outputs = [video_out, share_group]
        outputs = [video_out]

        gr.HTML(article)

    submit_btn.click(infer, inputs, outputs)
    #share_button.click(None, [], [], _js=share_js)

# The queue has to be configured on the Blocks object before launching;
# launch() does not return the Blocks instance.
demo.queue(max_size=12).launch()