chenyangqi committed
Commit a8b3fe4
Parent(s): 556db34

add preview for input video

FateZero/data/.gitignore CHANGED
@@ -2,4 +2,6 @@
 !teaser_car-turn
 !teaser_car-turn/*
 !.gitignore
-!download.sh
+!download.sh
+!*.mp4
+!*/*.mp4
FateZero/data/teaser_car-turn.mp4 ADDED
Binary file (147 kB).
FateZero/script/png_to_mp4.py ADDED
@@ -0,0 +1,21 @@
+import imageio
+from glob import glob
+
+
+def png_to_mp4(example_input_path, out_path=None):
+    # Default: write the video next to the input frame folder.
+    if out_path is None:
+        out_path = example_input_path + '.mp4'
+    # Collect the frame images (*.png, *.jpg, ...) in sorted order.
+    png_list = sorted(glob(example_input_path + '/*.*g'))
+    frames = [imageio.imread(png) for png in png_list]
+    # Encode the frames as an MP4 video at 10 fps.
+    imageio.mimsave(out_path, frames, fps=10)
+
+
+video_all_folder = '/home/cqiaa/diffusion/hugging_face/FateZero/FateZero/data/style'
+video_list = glob(video_all_folder + '/*')
+for example_input_path in video_list:
+    print(example_input_path)
+    out_path = example_input_path + '.mp4'
+    png_to_mp4(example_input_path, out_path)
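The script above batch-converts every frame folder under FateZero/data/style into a sibling .mp4, which is what the updated examples point at. For a one-off conversion the helper can also be called directly; a minimal sketch, assuming png_to_mp4 is in scope (the blackswan path is illustrative):

# Convert a single frame folder to a sibling MP4 file.
example_input_path = 'data/style/blackswan'
out_path = example_input_path + '.mp4'  # -> data/style/blackswan.mp4
png_to_mp4(example_input_path, out_path)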
app_fatezero.py CHANGED
@@ -6,24 +6,8 @@ import os
 
 import gradio as gr
 
-# from inference import InferencePipeline
-# from FateZero import test_fatezero
 from inference_fatezero import merge_config_then_run
 
-# class InferenceUtil:
-#     def __init__(self, hf_token: str | None):
-#         self.hf_token = hf_token
-
-#     def load_model_info(self, model_id: str) -> tuple[str, str]:
-#         # todo FIXME
-#         try:
-#             card = InferencePipeline.get_model_card(model_id, self.hf_token)
-#         except Exception:
-#             return '', ''
-#         base_model = getattr(card.data, 'base_model', '')
-#         training_prompt = getattr(card.data, 'training_prompt', '')
-#         return base_model, training_prompt
-
 
 # TITLE = '# [FateZero](http://fate-zero-edit.github.io/)'
 HF_TOKEN = os.getenv('HF_TOKEN')
@@ -93,32 +77,33 @@ with gr.Blocks(css='style.css') as demo:
         </span>
         </h2>
         <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
-        FateZero is a first zero-shot framework for text-driven video editing via pretrained diffusion models without training.
+        FateZero is the first zero-shot framework for text-driven video editing via pretrained diffusion models without training.
         </h2>
         </div>
         """)
 
 
     gr.HTML("""
-        <p>Note that due to limite of memory and computing resource on hugging face, the results here are only toy examples and takes longer time to edit.
-        <p>For better performance and faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
+        <p>Note that due to the limits of memory and computing resources on Hugging Face, the results here are only toy examples and take longer to edit.
+        <p>You may duplicate the space and upgrade to a GPU in settings for better performance and faster inference without waiting in the queue.
         <br/>
         <a href="https://huggingface.co/spaces/chenyangqi/FateZero?duplicate=true">
         <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-        <p>Or try our github <a href=https://github.com/ChenyangQiQi/FateZero> code </a> on your own GPU.
+        <p>Alternatively, try our GitHub <a href=https://github.com/ChenyangQiQi/FateZero> code </a> on your GPU.
         </p>""")
 
     with gr.Row():
         with gr.Column():
             with gr.Accordion('Input Video', open=True):
-                user_input_video = gr.File(label='Input Source Video')
+                # user_input_video = gr.File(label='Input Source Video')
+                user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")
             with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
-                n_sample_frame = gr.Slider(label='Number of Frames in Video',
+                n_sample_frame = gr.Slider(label='Number of Frames',
                                            minimum=0,
                                            maximum=32,
                                            step=1,
                                            value=8)
-                stride = gr.Slider(label='Temporal sampling stride in Video',
+                stride = gr.Slider(label='Temporal stride',
                                    minimum=0,
                                    maximum=20,
                                    step=1,
@@ -153,16 +138,6 @@ with gr.Blocks(css='style.css') as demo:
                 stride
             ] + offset_list
 
-
-            data_path = gr.Dropdown(
-                label='Or use provided data in our paper',
-                choices=[
-                    'FateZero/data/teaser_car-turn',
-                    'FateZero/data/style/sunflower',
-                    'FateZero/data/attribute/swan_swarov',
-                    # add shape editing ckpt here
-                ],
-                value='FateZero/data/teaser_car-turn')
             model_id = gr.Dropdown(
                 label='Model ID',
                 choices=[
@@ -170,16 +145,6 @@ with gr.Blocks(css='style.css') as demo:
                     # add shape editing ckpt here
                 ],
                 value='CompVis/stable-diffusion-v1-4')
-            # with gr.Accordion(
-            #         label=
-            #         'Model info (Base model and prompt used for training)',
-            #         open=False):
-            #     with gr.Row():
-            #         base_model_used_for_training = gr.Text(
-            #             label='Base model', interactive=False)
-            #         prompt_used_for_training = gr.Text(
-            #             label='Training prompt', interactive=False)
-
 
 
             with gr.Accordion('Text Prompt', open=True):
@@ -197,91 +162,60 @@ with gr.Blocks(css='style.css') as demo:
 
 
 
-            with gr.Accordion('DDIM Parameters', open=True):
-                num_steps = gr.Slider(label='Number of Steps',
-                                      info='larger value has better editing capacity, but takes more time and memory',
-                                      minimum=0,
-                                      maximum=50,
-                                      step=1,
-                                      value=10)
-                guidance_scale = gr.Slider(label='CFG Scale',
-                                           minimum=0,
-                                           maximum=50,
-                                           step=0.1,
-                                           value=7.5)
+
 
             run_button = gr.Button('Generate')
 
-            # gr.Markdown('''
-            # - It takes a few minutes to download model first.
-            # - Expected time to generate an 8-frame video: 70 seconds with T4, 24 seconds with A10G, (10 seconds with A100)
-            # ''')
-            # gr.Markdown('''
-            # todo
-            # ''')
         with gr.Column():
             result = gr.Video(label='Result')
-            result.style(height=512, width=512)
+            # result.style(height=512, width=512)
            with gr.Accordion('FateZero Parameters for attention fusing', open=True):
-                cross_replace_steps = gr.Slider(label='cross-attention replace steps',
+                cross_replace_steps = gr.Slider(label='Cross-att replace steps',
                                                 info='More steps, replace more cross attention to preserve semantic layout.',
                                                 minimum=0.0,
                                                 maximum=1.0,
                                                 step=0.1,
                                                 value=0.7)
 
-                self_replace_steps = gr.Slider(label='self-attention replace steps',
+                self_replace_steps = gr.Slider(label='Self-att replace steps',
                                                info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
                                                minimum=0.0,
                                                maximum=1.0,
                                                step=0.1,
                                                value=0.7)
 
-                enhance_words = gr.Textbox(label='words to be enhanced',
+                enhance_words = gr.Textbox(label='Enhanced words',
                                            info='Amplify the target-words cross attention',
                                            max_lines=1,
                                            placeholder='Example: "watercolor "',
                                            value='watercolor')
 
-                enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
+                enhance_words_value = gr.Slider(label='Target cross-att amplification',
                                                 info='larger value, more elements of target words',
                                                 minimum=0.0,
                                                 maximum=20.0,
                                                 step=1,
                                                 value=10)
+            with gr.Accordion('DDIM Parameters', open=True):
+                num_steps = gr.Slider(label='Number of Steps',
+                                      info='larger value has better editing capacity, but takes more time and memory. (50 steps may produce memory errors)',
+                                      minimum=0,
+                                      maximum=50,
+                                      step=1,
+                                      value=10)
+                guidance_scale = gr.Slider(label='CFG Scale',
+                                           minimum=0,
+                                           maximum=50,
+                                           step=0.1,
+                                           value=7.5)
     with gr.Row():
         from example import style_example
         examples = style_example
-        # examples = [
-        #     [
-        #         'CompVis/stable-diffusion-v1-4',
-        #         'FateZero/data/teaser_car-turn',
-        #         'a silver jeep driving down a curvy road in the countryside',
-        #         'watercolor painting of a silver jeep driving down a curvy road in the countryside',
-        #         0.8,
-        #         0.8,
-        #         "watercolor",
-        #         10,
-        #         10,
-        #         7.5,
-        #     ],
-        #     [
-        #         'CompVis/stable-diffusion-v1-4',
-        #         'FateZero/data/style/sunflower',
-        #         'a yellow sunflower',
-        #         'van gogh style painting of a yellow sunflower',
-        #         0.5,
-        #         0.5,
-        #         'van gogh',
-        #         10,
-        #         10,
-        #         7.5,
-        #     ],
-        # ]
+
         gr.Examples(examples=examples,
                     inputs=[
                         model_id,
-                        data_path,
+                        user_input_video,
                         source_prompt,
                         target_prompt,
                         cross_replace_steps,
@@ -299,15 +233,9 @@ with gr.Blocks(css='style.css') as demo:
                     # cache_examples=os.getenv('SYSTEM') == 'spaces'
                     )
 
-    # model_id.change(fn=app.load_model_info,
-    #                 inputs=model_id,
-    #                 outputs=[
-    #                     base_model_used_for_training,
-    #                     prompt_used_for_training,
-    #                 ])
     inputs = [
         model_id,
-        data_path,
+        user_input_video,
         source_prompt,
         target_prompt,
         cross_replace_steps,
@@ -319,9 +247,7 @@ with gr.Blocks(css='style.css') as demo:
         user_input_video,
         *ImageSequenceDataset_list
     ]
-    # prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
     target_prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
-    # run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
     run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
 
 demo.queue().launch()
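The component swap above is what adds the preview: gr.File only offered a download widget, while gr.Video renders an inline player for the uploaded clip and passes it to the handler as a temporary file path. A minimal sketch of that pattern in isolation, assuming a Gradio 3.x environment as used by this Space (the echo handler and variable names are illustrative, not from the commit):

import gradio as gr

def echo_path(video):
    # gr.Video delivers the uploaded clip to the handler as a file-path string.
    return video

with gr.Blocks() as demo:
    inp = gr.Video(label='Input Source Video', source='upload', format='mp4')
    out = gr.Video(label='Result')
    run = gr.Button('Generate')
    run.click(fn=echo_path, inputs=inp, outputs=out)
    # Example rows can reference .mp4 paths directly, as example.py now does.
    gr.Examples(examples=[['FateZero/data/teaser_car-turn.mp4']], inputs=[inp])

demo.queue().launch()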
example.py CHANGED
@@ -2,7 +2,7 @@ num_steps = 10
 style_example = [
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/teaser_car-turn',
+        'FateZero/data/teaser_car-turn.mp4',
         'a silver jeep driving down a curvy road in the countryside',
         'watercolor painting of a silver jeep driving down a curvy road in the countryside',
         0.8,
@@ -17,7 +17,7 @@ style_example = [
     ],
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/style/sunflower',
+        'FateZero/data/style/sunflower.mp4',
         'a yellow sunflower',
         'van gogh style painting of a yellow sunflower',
         0.5,
@@ -30,7 +30,7 @@ style_example = [
     ],
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/style/surf',
+        'FateZero/data/style/surf.mp4',
         'a man with round helmet surfing on a white wave in blue ocean with a rope',
         'The Ukiyo-e style painting of a man with round helmet surfing on a white wave in blue ocean with a rope',
         0.9,
@@ -43,7 +43,7 @@ style_example = [
     ],
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/style/train',
+        'FateZero/data/style/train.mp4',
         'a train traveling down tracks next to a forest filled with trees and flowers and a man on the side of the track',
         'a train traveling down tracks next to a forest filled with trees and flowers and a man on the side of the track Makoto Shinkai style',
         0.9,
@@ -57,7 +57,7 @@ style_example = [
 
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/attribute/swan_swarov',
+        'FateZero/data/attribute/swan_swarov.mp4',
         'a black swan with a red beak swimming in a river near a wall and bushes',
         'a Swarovski crystal swan with a red beak swimming in a river near a wall and bushes',
         0.8,
inference_fatezero.py CHANGED
@@ -92,8 +92,11 @@ class merge_config_then_run():
         config_now['train_dataset'].update(ImageSequenceDataset_dict)
         if user_input_video and data_path is None:
             raise gr.Error('You need to upload a video or choose a provided video')
-        if user_input_video is not None and user_input_video.name is not None:
-            config_now['train_dataset']['path'] = user_input_video.name
+        if user_input_video is not None:
+            if isinstance(user_input_video, str):
+                config_now['train_dataset']['path'] = user_input_video
+            elif hasattr(user_input_video, 'name') and user_input_video.name is not None:
+                config_now['train_dataset']['path'] = user_input_video.name
         config_now['validation_sample_logger_config']['prompts'] = [target_prompt]
 
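The widened branch keeps the run function compatible with both front-ends: the old gr.File component passed a tempfile wrapper exposing .name, while the new gr.Video passes a plain path string. Factored into a standalone helper, the logic reads as below; resolve_video_path is a hypothetical name, not part of the commit:

def resolve_video_path(user_input_video):
    # Hypothetical helper mirroring the branch in merge_config_then_run.
    if user_input_video is None:
        return None
    if isinstance(user_input_video, str):
        # gr.Video: the uploaded clip arrives as a temporary file path.
        return user_input_video
    if getattr(user_input_video, 'name', None) is not None:
        # gr.File: a tempfile-like object exposing the path via .name.
        return user_input_video.name
    return None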