Commit f9a691e
Parent(s): 7e1bff8
Update app.py

app.py CHANGED
@@ -3,12 +3,17 @@ import torch
 import os
 import base64
 import uuid
+import tempfile
+import numpy as np
+import cv2
+import subprocess
 
 from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 from PIL import Image
 
+
 SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')
 
 # Constants
@@ -29,9 +34,11 @@ dtype = torch.float16
 pipe = AnimateDiffPipeline.from_pretrained(bases[base_loaded], torch_dtype=dtype).to(device)
 pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear")
 
-
-
-
+# ----------------------------- VIDEO ENCODING ---------------------------------
+# Unfortunately, the Hugging Face Diffusers utils hardcode MP4V as a codec,
+# which is not supported by all browsers. This is a critical issue for AiTube,
+# so we are forced to implement our own encoding algorithm.
+# ------------------------------------------------------------------------------
 
 def export_to_video_file(video_frames, output_video_path=None, fps=10):
     if output_video_path is None:
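The body of export_to_video_file is unchanged by this commit, so it does not appear in the diff. For context, a minimal sketch of what a browser-friendly OpenCV encoder along these lines might look like, assuming a build of OpenCV with VP8/libvpx support (the helper name, WebM/VP8 codec choice, and frame handling are illustrative assumptions, not the repository's actual implementation):

import uuid
import cv2
import numpy as np

def export_to_webm_sketch(video_frames, output_video_path=None, fps=10):
    # Encode frames as VP8 in a WebM container, which browsers can play
    # natively (unlike the MP4V fourcc hardcoded in the Diffusers utils).
    if output_video_path is None:
        output_video_path = f"{uuid.uuid4()}.webm"
    frames = [np.asarray(frame) for frame in video_frames]
    height, width = frames[0].shape[:2]
    writer = cv2.VideoWriter(
        output_video_path, cv2.VideoWriter_fourcc(*"VP80"), fps, (width, height)
    )
    for frame in frames:
        # OpenCV expects BGR channel order; PIL frames are RGB
        writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    writer.release()
    return output_video_path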
@@ -55,7 +62,76 @@ def export_to_video_file(video_frames, output_video_path=None, fps=10):
 
     return output_video_path
 
-def generate_image(secret_token, prompt, base, width, height, motion, step):
+# ----------------------------- FRAME INTERPOLATION ---------------------------------
+# we cannot afford to use AI-based algorithms such as FILM or ST-MFNet,
+# those are way too slow for AiTube, which needs things to be as fast as possible
+# -----------------------------------------------------------------------------------
+
+def interpolate_video_frames(file_path, output_fps=10, desired_duration=2):
+    """
+    Interpolates frames in a video file to adjust frame rate and duration using ffmpeg's minterpolate.
+
+    Parameters:
+    file_path (str): Path to the input video file.
+    output_fps (int): Target frames per second for the output video.
+    desired_duration (int): Desired duration of the video in seconds.
+
+    Returns:
+    str: The file path of the modified video.
+    """
+    # Calculate the input fps required to stretch the video to the desired duration
+    input_fps = find_input_fps(file_path, desired_duration)
+
+    # Construct the ffmpeg command for interpolation
+    cmd = [
+        'ffmpeg',
+        '-i', file_path,  # input file
+        '-filter:v', f'minterpolate=fps={output_fps}',  # minterpolate filter options
+        '-r', str(output_fps),  # output frame rate
+        '-y',  # Overwrite output files without asking
+        file_path  # Output file (Overwrites the original)
+    ]
+
+    # Execute the command
+    try:
+        subprocess.run(cmd, check=True)
+        print("Video interpolation successful.")
+    except subprocess.CalledProcessError as e:
+        print("Failed to interpolate video. Error:", e)
+
+    return file_path
+
+def find_input_fps(file_path, desired_duration):
+    """
+    Determine the input fps that, when stretched to the desired duration, matches the original video length.
+
+    Parameters:
+    file_path (str): Path to the video file.
+    desired_duration (int or float): Desired duration in seconds.
+
+    Returns:
+    float: Calculated input fps.
+    """
+    # FFprobe command to find the duration of the video
+    ffprobe_cmd = [
+        'ffprobe',
+        '-v', 'error',
+        '-show_entries', 'format=duration',
+        '-of', 'default=noprint_wrappers=1:nokey=1',
+        file_path
+    ]
+
+    try:
+        result = subprocess.run(ffprobe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        duration = float(result.stdout.strip())
+        input_fps = duration / desired_duration
+    except Exception as e:
+        print("Failed to get video duration. Error:", e)
+        input_fps = 10  # Assume a default value if unable to fetch duration
+
+    return input_fps
+
+def generate_image(secret_token, prompt, base, width, height, motion, step, desired_duration, desired_fps):
     if secret_token != SECRET_TOKEN:
         raise gr.Error(
             f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
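Two details of the added helpers are worth noting: the computed input_fps ratio is never passed to ffmpeg, so desired_duration currently has no effect on the output, and ffmpeg is asked to write its output over its own input, which recent builds reject outright. A sketch of a variant that addresses both, assuming the same find_input_fps helper from this commit (the setpts-based stretching and the temporary-file step are suggestions, not code from this commit):

import os
import subprocess

def interpolate_video_frames_sketch(file_path, output_fps=10, desired_duration=2):
    # Slowdown ratio: original duration divided by the desired duration
    ratio = find_input_fps(file_path, desired_duration)
    # Stretch the timestamps first (setpts), then let minterpolate synthesize
    # the intermediate frames needed to reach the target frame rate
    temp_path = file_path + ".tmp.mp4"
    cmd = [
        'ffmpeg',
        '-i', file_path,
        '-filter:v', f'setpts=PTS/{ratio},minterpolate=fps={output_fps}',
        '-r', str(output_fps),
        '-y',
        temp_path,  # ffmpeg cannot safely overwrite its own input
    ]
    subprocess.run(cmd, check=True)
    os.replace(temp_path, file_path)  # move the result over the original
    return file_path

Motion-interpolation artifacts grow with the stretch factor, so whether minterpolate can fill a large stretch convincingly is a separate question from making the parameters take effect.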
@@ -98,7 +174,11 @@ def generate_image(secret_token, prompt, base, width, height, motion, step):
 
     # I think we are losing time here too, converting to webm is too slow, we should return
     # the frames unencoded to the frontend renderer
-    export_to_video_file(output.frames[0], path, fps=10)
+    path = export_to_video_file(output.frames[0], path, fps=10)
+
+    # Optional frame interpolation
+    if desired_duration != 2 or desired_fps != 10:
+        path = interpolate_video_frames(path, output_fps=desired_fps, desired_duration=desired_duration)
 
     # Read the content of the video file and encode it to base64
     with open(path, "rb") as video_file:
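The comment above leaves the "return unencoded frames" idea as a TODO. One possible shape for it, assuming the frontend renderer can sequence still images itself (the helper name and the per-frame JPEG choice are assumptions, not part of this commit):

import base64
import io

def frames_to_base64_list_sketch(video_frames):
    # Serialize each PIL frame as JPEG and base64-encode it, so the frontend
    # can decode and play the frames without any server-side video encoding
    encoded = []
    for frame in video_frames:
        buf = io.BytesIO()
        frame.save(buf, format="JPEG")
        encoded.append(base64.b64encode(buf.getvalue()).decode("utf-8"))
    return encoded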
@@ -139,22 +219,21 @@ with gr.Blocks() as demo:
                     "ToonYou",
                     "epiCRealism",
                 ],
-                value=base_loaded
-                interactive=True
+                value=base_loaded
             )
             width = gr.Slider(
                 label='Width',
                 minimum=128,
                 maximum=2048,
                 step=32,
-                value=
+                value=512,
             )
             height = gr.Slider(
                 label='Height',
                 minimum=128,
                 maximum=2048,
                 step=32,
-                value=
+                value=256,
             )
             select_motion = gr.Dropdown(
                 label='Motion',
@@ -170,7 +249,6 @@ with gr.Blocks() as demo:
                     ("Roll right", "guoyww/animatediff-motion-lora-rolling-clockwise"),
                 ],
                 value="",
-                interactive=True
             )
             select_step = gr.Dropdown(
                 label='Inference steps',
@@ -180,15 +258,17 @@ with gr.Blocks() as demo:
                     ('4-Step', 4),
                     ('8-Step', 8)],
                 value=4,
-                interactive=True
             )
+            duration_slider = gr.Slider(label="Desired Duration (seconds)", minimum=2, maximum=30, value=2, step=1)
+            fps_slider = gr.Slider(label="Desired Frames Per Second", minimum=10, maximum=60, value=10, step=1)
+
             submit = gr.Button()
 
             output_video_base64 = gr.Text()
 
             submit.click(
                 fn=generate_image,
-                inputs=[secret_token, prompt, select_base, width, height, select_motion, select_step],
+                inputs=[secret_token, prompt, select_base, width, height, select_motion, select_step, duration_slider, fps_slider],
                 outputs=output_video_base64,
             )
 
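On the client side, the endpoint returns the video as a base64 string in output_video_base64. A minimal decoding sketch, assuming the returned string is the raw base64 payload with no data: URI prefix (the function name and output path are illustrative):

import base64

def save_base64_video(b64_string, out_path="animation.mp4"):
    # Reverse of the server-side step that reads the video file
    # and base64-encodes its bytes before returning it
    with open(out_path, "wb") as f:
        f.write(base64.b64decode(b64_string))
    return out_path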